CUDA reduce min value and index

Problem description · votes: 0 · answers: 2

I implemented a min reduction using CUDA 8 by following this great explanation and modifying it:

__inline__ __device__ int warpReduceMin(int val) 
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            val = tmpVal;
        }
    }
    return val;
}
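Aside: on CUDA 9 and newer the unsynchronized __shfl_down used above is deprecated in favour of the _sync variant; assuming all 32 lanes of the warp participate, the equivalent call would be

int tmpVal = __shfl_down_sync(0xffffffff, val, offset); // full-warp mask

with the rest of the loop unchanged.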

__inline__ __device__ int blockReduceMin(int val) 
{

    static __shared__ int shared[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceMin(val);     // Each warp performs partial reduction

    if (lane == 0)
    {
        shared[wid] = val; // Write reduced value to shared memory
    }

    __syncthreads();              // Wait for all partial reductions

    //read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;

    if (wid == 0)
    {
        val = warpReduceMin(val); //Final reduce within first warp
    }

    return val;
}

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < N;
        i += blockDim.x * gridDim.x) 
    {
        minVal = min(minVal, in[i]);
    }
    minVal = blockReduceMin(minVal);
    if (threadIdx.x == 0)
    {
        atomicMin(out, minVal);
    }
}
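For reference, a host-side launch might look roughly like this (a sketch, not part of the original code; the grid/block sizes and test data are arbitrary choices, and *out must be seeded with INT_MAX before the launch so atomicMin combines correctly):

#include <climits>
#include <cstdio>
#include <vector>

int main()
{
    const int N = 1 << 20;
    std::vector<int> h_in(N);
    for (int i = 0; i < N; ++i)
        h_in[i] = static_cast<int>((i * 1103515245u + 12345u) & 0x7fffffffu); // arbitrary test data

    int *d_in, *d_out;
    cudaMalloc(&d_in, N * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemcpy(d_in, h_in.data(), N * sizeof(int), cudaMemcpyHostToDevice);

    const int init = INT_MAX;   // identity element for min; atomicMin needs *out pre-seeded
    cudaMemcpy(d_out, &init, sizeof(int), cudaMemcpyHostToDevice);

    deviceReduceBlockAtomicKernel<<<256, 256>>>(d_in, d_out, N);

    int result;
    cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("min = %d\n", result);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}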

It works well and I get the minimum value. However, I don't care about the minimum value itself, only about its index in the original input array.

I tried modifying my code slightly:

__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
            val = tmpVal;
        }
    }
    return val;
}

...
blockReduceMin stayed the same, only adding idx to the function calls
...

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    int minIdx = 0; // Added this
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < N;
        i += blockDim.x * gridDim.x) 
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i; // Added this
        }
    }
    minVal = blockReduceMin(minVal, &minIdx);
    if (threadIdx.x == 0)
    {
        int old = atomicMin(out, minVal);
        if (old != minVal) // value was updated
        {
            atomicExch(out + 1, minIdx);
        }
    }
}

But this doesn't work. I feel like I'm missing something important and that this isn't the way to solve it, but my search came up empty.

cuda nvidia reduction
2 Answers

4 votes

There are a couple of problems here. You need to modify both the warp and block minimum functions to propagate the minimum and its index each time a new local minimum is found. Perhaps something like this:

__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        int tmpVal = __shfl_down(val, offset); // shuffle the candidate value down the warp
        int tmpIdx = __shfl_down(idx, offset); // and its index alongside it
        if (tmpVal < val) {
            val = tmpVal;
            idx = tmpIdx;
        }
    }
}

__inline__ __device__  void blockReduceMin(int& val, int& idx) 
{

    static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    warpReduceMin(val, idx);     // Each warp performs partial reduction

    if (lane == 0) {
        values[wid] = val; // Write reduced value to shared memory
        indices[wid] = idx; // Write reduced index to shared memory
    }

    __syncthreads();              // Wait for all partial reductions

    //read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = values[lane];
        idx = indices[lane];
    } else {
        val = INT_MAX;
        idx = 0;
    }

    if (wid == 0) {
         warpReduceMin(val, idx); //Final reduce within first warp
    }
}

[Note: written in browser, never compiled or tested, use at your own risk]

That should leave each block holding the correct local minimum and its index. Then you have a second problem. This:

int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
    atomicExch(out + 1, minIdx);
}

is broken. There is no guarantee that the minimum value and its index will be set correctly in this code. That is because there is no guarantee that the two atomic operations have any synchronization between them, and there is a potential race where one block may correctly overwrite another block's minimum value, but then have its index overwritten by the block it replaced. The only solution here would be some sort of mutex, or running a second reduction kernel on the per-block results.
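For reference, the second-kernel approach might look roughly like this (a sketch in the same untested spirit as the code above; reducePerBlock, reduceFinal and the intermediate buffer names are placeholders, and it reuses blockReduceMin from above):

__global__ void reducePerBlock(const int* in, int* blockMins, int* blockIdxs, int N)
{
    int minVal = INT_MAX;
    int minIdx = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        if (in[i] < minVal) {
            minVal = in[i];
            minIdx = i;
        }
    }
    blockReduceMin(minVal, minIdx);
    if (threadIdx.x == 0) {            // one (value, index) pair per block
        blockMins[blockIdx.x] = minVal;
        blockIdxs[blockIdx.x] = minIdx;
    }
}

// Second pass, launched with a single block (reduceFinal<<<1, threads>>>),
// so the pair is combined without cross-block atomics and stays consistent.
__global__ void reduceFinal(const int* blockMins, const int* blockIdxs, int* out, int numBlocks)
{
    int minVal = INT_MAX;
    int minIdx = 0;
    for (int i = threadIdx.x; i < numBlocks; i += blockDim.x) {
        if (blockMins[i] < minVal) {
            minVal = blockMins[i];
            minIdx = blockIdxs[i];
        }
    }
    blockReduceMin(minVal, minIdx);
    if (threadIdx.x == 0) {
        out[0] = minVal;               // final minimum
        out[1] = minIdx;               // its index in the original array
    }
}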


0 votes

On Ampere and newer (#if __CUDA_ARCH__ >= 800), you can do this in just a few lines of code.

For a single warp:

constexpr auto Everyone = -1u;                                 // full-warp mask
const auto minval = __reduce_min_sync(Everyone, value);        // warp-wide minimum
const auto minmask = __ballot_sync(Everyone, value == minval); // lanes holding it
const auto minpos = __ffs(minmask) - 1;                        // first such lane

You can then store the per-warp minima and min positions in shared arrays, and extract the warp that holds the block minimum along with its position:

__shared__ int mins[32];          // __reduce_min_sync operates on integer types
__shared__ unsigned min_pos[32];
const auto WarpId = threadIdx.x / 32;
mins[WarpId] = minval;            // every lane of a warp writes the same value
min_pos[WarpId] = minpos;
__syncthreads();
const auto WarpCount = blockDim.x / 32;
const auto ActiveMask = __ballot_sync(Everyone, threadIdx.x < WarpCount);
//Never use __activemask(), that does not work beyond Pascal.
if (threadIdx.x < WarpCount) { //This will all be in warp 0.
    const auto val2 = mins[threadIdx.x];
    //another reduction
    const auto minval2 = __reduce_min_sync(ActiveMask, val2);
    const auto minmask2 = __ballot_sync(ActiveMask, val2 == minval2);
    const auto minwarp = __ffs(minmask2) - 1;
    if (threadIdx.x == 0) {       // report once, using the winning warp's stored lane
        printf("Minval: %i, MinThread: %i\n", minval2, minwarp * 32 + (int)min_pos[minwarp]);
    }
}
    

No more loops. Note that, cool as the built-in __reduce_xxx_sync methods are, they are very slow if the mask is not identical across the participating threads. The following code runs 32x slower than you might expect:

const auto LaneId = threadIdx.x % 32;
const auto JustMe = 1 << LaneId;
//I know this is a no-op.
const auto example = __reduce_or_sync(JustMe, value); //execution is serialized
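By contrast, a call whose mask is uniform across the warp (such as the full-warp Everyone mask from above, assuming all 32 lanes participate) takes the fast path:

const auto fast = __reduce_or_sync(Everyone, value); // uniform mask: single warp-wide op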
 