假设我想求局部
int X
的块和。最简单的方法是将 atomicAdd
与 __shared__
变量一起使用:
__device__ int blocksum(int x) {
__shared__ int sum;
sum = 0;
__syncthreads();
atomicAdd(&sum, X);
__syncthreads();
return sum;
}
但据我了解,应该避免这样做,因为这是单点争用,并且会导致所有线程顺序执行。因此,我开始寻找一种更快/更简单的方法来做到这一点,但找不到任何答案。我实现了两种替代方案:逐块扫描和、
__reduce_add_sync
和atomicAdd
的混合;然而,两者的表现都比atomicAdd
差。
幕后是否存在某种“添加广播”优化?简单的
atomicAdd
是执行逐块求和的正确方法吗?
你是对的,当所有线程添加相同的地址时,由于添加线程序列化,执行块范围的atomicAdd将导致显着的速度减慢。
有一个例外,如果你使用atomicAdd(address, boolean)。在这种情况下,GPU 将执行以下代码。
const auto count = __popc(__ballot_sync(__activemask(), boolean));
if ((threadIdx.x % 32) == 0) { atomicAdd(address, count); }
代码的另一个问题是 __reduce_add_sync 非常慢。我怀疑这只是以微代码实现的预览功能(它确实编译为单个 SASS 升级代码)。它总是比您自己编写的任何代码都要慢。
让我们做一些比较:
__global__ testAtomic(int i) {
__shared__ int counter;
counter = 0;
__synthreads();
const auto StartTime = clock64(); //do not use clock32! it is slow.
atomicAdd(&counter, i);
__syncthreads(); //wait for atomicAdd to resolve, or timings will be off
const auto EndTime = clock64();
printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}
__global__ testReduceAddSync(int i) {
__shared__ int counter;
__shared__ int counters[32]; //1024 /warpsize = 32 warps max
counter = 0;
counters[threadIdx.x % 32] = 0;
__synthreads();
const auto StartTime = clock64(); //do not use clock32! it is slow.
counters[threadIdx / 32] = __reduce_add_sync(-1u, i);
__syncthreads();
if (threadIdx.x < 32) {
counter = __reduce_add_sync(-1u, counters[threadIdx.x]);
}
__syncthreads(); //wait for atomicAdd to resolve, or timings will be off
const auto EndTime = clock64();
printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}
__global__ testShflDown(int i) {
__shared__ int counter;
__shared__ int counters[32]; //1024 /warpsize = 32 warps max
counter = 0;
counters[threadIdx.x % 32] = 0;
auto sum = i;
__synthreads();
const auto StartTime = clock64(); //do not use clock32! it is slow.
sum += __shfl_down_sync(-1u, 1, sum);
sum += __shfl_down_sync(-1u, 2, sum);
sum += __shfl_down_sync(-1u, 4, sum);
sum += __shfl_down_sync(-1u, 8, sum);
sum += __shfl_down_sync(-1u, 16, sum);
if ((threadIdx.x % 32) == 0) { atomicAdd(&counter, sum); }
__syncthreads(); //wait for atomicAdd to resolve, or timings will be off
const auto EndTime = clock64();
printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}