我通过遵循
this很好的解释并修改它,使用
CUDA 8
实现了最小减少
__inline__ __device__ int warpReduceMin(int val)
{
for (int offset = warpSize / 2; offset > 0; offset /= 2)
{
int tmpVal = __shfl_down(val, offset);
if (tmpVal < val)
{
val = tmpVal;
}
}
return val;
}
__inline__ __device__ int blockReduceMin(int val)
{
static __shared__ int shared[32]; // Shared mem for 32 partial mins
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceMin(val); // Each warp performs partial reduction
if (lane == 0)
{
shared[wid] = val; // Write reduced value to shared memory
}
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;
if (wid == 0)
{
val = warpReduceMin(val); //Final reduce within first warp
}
return val;
}
__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
int minVal = INT_MAX;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < N;
i += blockDim.x * gridDim.x)
{
minVal = min(minVal, in[i]);
}
minVal = blockReduceMin(minVal);
if (threadIdx.x == 0)
{
atomicMin(out, minVal);
}
}
它效果很好,我得到了最小值。但是,我不关心最小值,只关心它在原始输入数组中的索引。
我尝试稍微修改一下我的代码
__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
for (int offset = warpSize / 2; offset > 0; offset /= 2)
{
int tmpVal = __shfl_down(val, offset);
if (tmpVal < val)
{
*idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
val = tmpVal;
}
}
return val;
}
...
blockReduceMin stayed the same only adding idx to function calls
...
__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
int minVal = INT_MAX;
int minIdx = 0; // Added this
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < N;
i += blockDim.x * gridDim.x)
{
if (in[i] < minVal)
{
minVal = in[i];
minIdx = i; // Added this
}
}
minVal = blockReduceMin(minVal, &minIdx);
if (threadIdx.x == 0)
{
int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
atomicExch(out + 1, minIdx);
}
}
}
但这不起作用。我觉得我错过了一些重要的东西,这不是解决问题的方法,但我的搜索没有结果。
这里有几个问题。您需要修改扭曲和块最小值函数,以在每次找到新的局部最小值时传播最小值及其索引。也许是这样的:
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
int tmpVal = __shfl_down(val, offset);
int tmpIdx = __shfl_down(idx, offset);
if (tmpVal < val) {
val = tmpVal;
idx = tmpIdx;
}
}
}
__inline__ __device__ void blockReduceMin(int& val, int& idx)
{
static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
warpReduceMin(val, idx); // Each warp performs partial reduction
if (lane == 0) {
values[wid] = val; // Write reduced value to shared memory
indices[wid] = idx; // Write reduced value to shared memory
}
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
if (threadIdx.x < blockDim.x / warpSize) {
val = values[lane];
idx = indices[lane];
} else {
val = INT_MAX;
idx = 0;
}
if (wid == 0) {
warpReduceMin(val, idx); //Final reduce within first warp
}
}
[注意:在浏览器中编写,从未编译或测试,使用风险自担]
这应该让每个块都保持正确的局部最小值和索引。那么你还有第二个问题。这个:
int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
atomicExch(out + 1, minIdx);
}
坏了。不保证在此代码中正确设置最小值及其索引。这是因为无法保证两个原子操作具有任何同步,并且存在潜在的竞争,其中一个块可能正确地覆盖另一个块的最小值,但随后其索引被它替换的块覆盖。这里唯一的解决方案是某种互斥体,或者对每个块的结果运行第二个缩减内核。
在 Ampere 和更新的
#if __CUDA_ARCH__ > 800
中,您可以用几行代码完成此操作:
对于单经纱:
constexpr auto Everyone = -1u;
const auto minval = __reduce_min_sync(Everyone, value);
const auto minmask = __ballot_sync(Everyone, value == minval);
const auto minpos = __ffs(minmask) -1;
然后,您可以将最小值和 min_positions 存储在数组中,然后 提取保持最小值的扭曲及其位置。
__shared__ float mins[32];
__shared__ unsigned min_pos[32];
const auto WarpId = threadIdx.x / 32;
mins[WarpId] = minval;
min_pos[WarpId] = minpos;
__syncthreads();
const auto WarpCount = blockDim.x / 32;
const auto ActiveMask = __ballot_sync(Everyone, threadIdx.x < WarpCount);
//Never use __activemask(), that does not work beyond Pascal.
if (threadIdx.x < WarpCount) { //This will all be in warp 0.
const auto val2 = mins[threadIdx.x];
const auto minpos2 = min_pos[threadIdx.x];
//another reduction
const auto minval = __reduce_min_sync(ActiveMask, val2);
const auto minmask = __ballot_sync(ActiveMask, val2 == minval);
const auto minwarp = __ffs(minmask) -1;
printf("Minval: %f, MinThread: %i", minval, minwarp*32 + minpos);
}
不再循环。请注意,尽管内置
__reduce_xxx_sync
方法很酷,但如果参与线程中的掩码不相同,它们会非常慢。以下代码的运行速度比您预期的慢 32 倍。
const auto LaneId = threadIdx.x % 32;
const auto JustMe = 1 << LaneId;
//I know this is a no-op.
const auto example = __reduce_or_sync(JustMe, value); //execution is serialized