在 CUDA 中,执行分块求和的正确方法是什么?

问题描述 投票:0回答:1

假设我想求局部

int X
的块和。最简单的方法是将
atomicAdd
__shared__
变量一起使用:

__device__ int blocksum(int x) {
  __shared__ int sum;
  
  sum = 0;
  __syncthreads();

  atomicAdd(&sum, X);
  __syncthreads();

  return sum;
}

但据我了解,应该避免这样做,因为这是单点争用,并且会导致所有线程顺序执行。因此,我开始寻找一种更快/更简单的方法来做到这一点,但找不到任何答案。我实现了两种替代方案:逐块扫描和、

__reduce_add_sync
atomicAdd
的混合;然而,两者的表现都比
atomicAdd
差。

幕后是否存在某种“添加广播”优化?简单的

atomicAdd
是执行逐块求和的正确方法吗?

cuda
1个回答
0
投票

你是对的,当所有线程添加相同的地址时,由于添加线程序列化,执行块范围的atomicAdd将导致显着的速度减慢。

有一个例外,如果你使用atomicAdd(address, boolean)。在这种情况下,GPU 将执行以下代码。

const auto count = __popc(__ballot_sync(__activemask(), boolean));
if ((threadIdx.x % 32) == 0) { atomicAdd(address, count); }

代码的另一个问题是 __reduce_add_sync 非常慢。我怀疑这只是以微代码实现的预览功能(它确实编译为单个 SASS 升级代码)。它总是比您自己编写的任何代码都要慢。

让我们做一些比较:

__global__ testAtomic(int i) { 
    __shared__ int counter;
    counter = 0;
    __synthreads();
    const auto StartTime = clock64(); //do not use clock32! it is slow.
    atomicAdd(&counter, i);
    __syncthreads(); //wait for atomicAdd to resolve, or timings will be off
    const auto EndTime = clock64();
    printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}


__global__ testReduceAddSync(int i) { 
    __shared__ int counter;
    __shared__ int counters[32]; //1024 /warpsize = 32 warps max
    counter = 0;
    counters[threadIdx.x % 32] = 0;

    __synthreads();
    const auto StartTime = clock64(); //do not use clock32! it is slow.
    counters[threadIdx / 32] = __reduce_add_sync(-1u, i);
    __syncthreads();
    if (threadIdx.x < 32) {
        counter = __reduce_add_sync(-1u, counters[threadIdx.x]);
    }
    __syncthreads(); //wait for atomicAdd to resolve, or timings will be off
    const auto EndTime = clock64();
    printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}

__global__ testShflDown(int i) { 
    __shared__ int counter;
    __shared__ int counters[32]; //1024 /warpsize = 32 warps max
    counter = 0;
    counters[threadIdx.x % 32] = 0;
    auto sum = i;
    __synthreads();
    const auto StartTime = clock64(); //do not use clock32! it is slow.
    sum += __shfl_down_sync(-1u, 1, sum);
    sum += __shfl_down_sync(-1u, 2, sum); 
    sum += __shfl_down_sync(-1u, 4, sum);
    sum += __shfl_down_sync(-1u, 8, sum);
    sum += __shfl_down_sync(-1u, 16, sum);
    if ((threadIdx.x % 32) == 0) { atomicAdd(&counter, sum); }
    __syncthreads(); //wait for atomicAdd to resolve, or timings will be off
    const auto EndTime = clock64();
    printf("total = %i, should be: %i, time = %i cycles\n", counter, i * blockDim.x, int(EndTime - StartTime));
}
  
© www.soinside.com 2019 - 2024. All rights reserved.