没有块级ThreadFence的随机缩减结果

Question

我正在为树缩减编写一个简单的内核，它将多个

blockDim.x

元素切片复制到每个块的共享内存数组中。

每个块映射的切片数量是一个编译时变量

slicesPerBlock

。

template<unsigned blockSize, unsigned  slicesPerBlock>
__device__ void CopySlices(volatile const float* pIn1, volatile float* sdata, unsigned bid, unsigned tid) {
    volatile float sum = 0;
#pragma unroll
    for (unsigned i = 0; i < slicesPerBlock; i++) {
        unsigned idx = bid * (slicesPerBlock * blockSize) + i * blockSize+ tid;
        sum += pIn1[idx];
    }
    sdata[tid] = sum;
    __syncthreads();
}

问题是上面的代码会导致随机输出（很少见，但确实会发生，到目前为止我只发现了发布版本中的不匹配情况）！

我尝试了

compute-sanitizer --tool <TOOL> ./MyProgram

对于

racecheck

并且对于调试版本我没有得到任何错误，而对于发布版本我得到了很多错误，它实际上只报告了其中的一些错误：

========= Warning: Race reported between Read access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x580
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x560 [1201312 hazards]
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x5a0 [1201250 hazards]
========= 
========= Warning: Race reported between Read access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x540
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x520 [1162680 hazards]
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x560 [1162594 hazards]
========= 
========= Warning: Race reported between Read access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x500
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x4e0 [1085354 hazards]
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x520 [1085197 hazards]
========= 
========= Warning: Race reported between Read access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x4c0
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x4a0 [930477 hazards]
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x4e0 [930432 hazards]
========= 
========= Warning: Race reported between Read access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x480
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x460 [620352 hazards]
=========     and Write access at void ReductionR1A0<(unsigned int)512, (unsigned int)8>(unsigned long, const float *, float *)+0x4a0 [620352 hazards]
=========

无论如何，我发现的唯一可行的解决方案是使用

__threadfence_block()

，如下所示（即使没有所有这些

volatile

，它也能工作）：

template<unsigned blockSize, unsigned  slicesPerBlock>
__device__ void CopySlices(const float* pIn1, float* sdata, unsigned bid, unsigned tid) {
    float sum = 0;
#pragma unroll
    for (unsigned i = 0; i < slicesPerBlock; i++) {
        unsigned idx = bid * (slicesPerBlock * blockSize) + i * blockSize+ tid;
        sum += pIn1[idx];
        __threadfence_block();
    }
    sdata[tid] = sum;
    __syncthreads();
}

总之，我不明白上述观察背后的逻辑。根据 nVidia 的说法：

void __threadfence_block();相当于 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_block）并确保：

调用线程在调用之前对所有内存进行的所有写入 __threadfence_block() 被调用线程块中的所有线程观察到发生在对所有内存进行的所有写入之前调用 __threadfence_block() 后的调用线程；

调用线程在调用之前对所有内存进行的所有读取到 __threadfence_block() 在所有内存的所有读取之前排序由调用线程在调用 __threadfence_block() 之后创建。

但在我的例子中，从全局内存读取的值以未知的顺序累积在私有寄存器（

sum

）中（因为循环完全展开），当前线程的最终值（

tid

）是存储在共享内存位置

sdata[tid]

。没有其他线程依赖于

sdata[tid]

的值，至少在此之前不依赖于

__syncthreads()

。更重要的是，为什么

volatile

不能解决这里的问题？怎么只有

__threadfence_block()

可以解决？

起初，我认为可能与

Float32

及其非关联性有关，但我也用

unsigned

进行了测试，同样的问题仍然存在。

我错过了什么？

完整示例：

#include <cassert>
#include <charconv>
#include <iostream>
#include <cuda_runtime.h>


#define CHECK(E) if(E!=cudaError_t::cudaSuccess) std::cerr<<"CUDA API FAILED, File: "<<__FILE__<<", Line: "<< __LINE__ << ", Error: "<< cudaGetErrorString(E) << std::endl;

using Type = unsigned;

template<unsigned blockSize, unsigned  slicesPerBlock>
__device__ void CopySlices(volatile const Type* pIn1, volatile Type* sdata, unsigned bid, unsigned tid) {
    volatile Type sum = 0;
#pragma unroll
    for (unsigned i = 0; i < slicesPerBlock; i++) {
        unsigned idx = bid * (slicesPerBlock * blockSize) + i * blockSize+ tid;
        sum += pIn1[idx];
        //__threadfence_block();
    }
    sdata[tid] = sum;
}

__device__ void WarpReduce(volatile Type* sdata, unsigned tid) {
#pragma unroll
    for (unsigned s = 32; s > 0; s >>= 1) {
        sdata[tid] += sdata[tid + s];
    }
}

template<unsigned blockSize, unsigned slicesPerBlock>
__global__ void ReductionR1A0(size_t len, const Type *__restrict__ pIn1, Type *__restrict__ pOut1) {
    extern __shared__ Type smem[];

    const unsigned tid = threadIdx.x;
    const size_t gid = blockIdx.x * blockDim.x + tid;

    // Phase 0. Init the output tensor
    // Comment out this phase and test the kernel with
    // `sudo compute-sanitizer --tool initcheck ./ReductionR1A0 12345`.
    if (gid == 0) {
        pOut1[0] = 0;
    }


    // Phase 1. Copy the assigned chunk to the shared memory.
    CopySlices<blockSize, slicesPerBlock>(pIn1, smem, blockIdx.x, threadIdx.x);
    __syncthreads();

    // Phase 2. Tree Reduction
#pragma unroll
    for (unsigned s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) {
            smem[tid] += smem[tid + s];
        }
        __syncthreads();
    }

    if (tid < 32) { // Handling s=32,16,8,4,2,1
        WarpReduce(smem, tid);
    }



    // Phase 3. Accumulate the partial sum of this block with the rest.
    if (tid == 0) {
        atomicAdd(pOut1, smem[0]);
        // This is bad. For large samples, there will be huge amount of operations forcibly serialized by the atomic operation above across the grid.
        // It affects the final latency of the whole kernel, see [Thread Fence Reduction](https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh).
    }

}

size_t GetPaddedLen(unsigned blockSize, size_t wordCount, unsigned slicesPerBlock) {
    assert(wordCount > 0);
    return ((wordCount - 1) / (slicesPerBlock * blockSize) + 1) * blockSize * slicesPerBlock;
}

int main(int argc, char *argv[]) {
    constexpr unsigned BLOCKSIZE = 512; // power of two only
    constexpr unsigned SLICESPERBANK = 8;

    unsigned LEN = 512*8*1024*4;
    unsigned LENPADDED = GetPaddedLen(BLOCKSIZE, LEN, SLICESPERBANK);

    auto *hIn1 = new Type[LENPADDED];
    auto *uut = new Type [1];
    Type *dIn1, *dOut1;

    CHECK(cudaMalloc((void **) &dIn1, LENPADDED*sizeof(Type)));
    CHECK(cudaMalloc((void **) &dOut1, sizeof(Type)));

    for (unsigned i = 0; i < LENPADDED; i++) {
        hIn1[i] = i < LEN ? 1 : 0;
    }

    CHECK(cudaMemcpy(dIn1, hIn1, LENPADDED * sizeof(Type), cudaMemcpyHostToDevice));

    Type gold = 0;
    for (size_t idx = 0; idx < LENPADDED; idx++) {
        gold += hIn1[idx];
    }

    {
        size_t grid = LENPADDED / (BLOCKSIZE * SLICESPERBANK);
        std::cout << "Grid: " << grid << std::endl;
        ReductionR1A0<BLOCKSIZE, SLICESPERBANK><<<grid, BLOCKSIZE, BLOCKSIZE * sizeof(Type)>>>(LENPADDED, dIn1, dOut1);
    }

    CHECK(cudaMemcpy(uut, dOut1, 1 * sizeof(Type), cudaMemcpyDeviceToHost));

    std::cout << "Gold: " << gold << ", UUT: " << uut[0] << ", DIFF: " << (uut[0] - gold) << std::endl;
}

CMakeLists.txt 脚本：

cmake_minimum_required(VERSION 3.7)

include(CheckLanguage)
check_language(CXX)
check_language(CUDA)

set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.")
set(CUDA_ARCHS "86" CACHE STRING "semi-colon seperated Compute Capability codes without dots, like 62;86")

# Enforce C++17 for nvcc
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

project(Reduction LANGUAGES CUDA CXX)

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G -O0 -Xptxas=\"-v\"")
endif()

if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-O3")
    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-O3 -Xptxas=\"-v\"")
endif()


add_executable(main main.cu)
set_target_properties(main PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCHS}")

Answer 1

您所显示的代码中至少存在 2 个问题。

您在内核代码中初始化
```
pOut1[0]
```
是不安全的。 CUDA 不保证线程执行的顺序或块执行的顺序。无法保证与
```
gid
```
为零关联的线程将在何时执行。因此，即使在其他线程块完成之后，它也可能是最后一个执行的线程。在这种情况下，您会将其他线程块存放在那里的结果清零。相反，在主机代码中初始化
```
pOut1[0]
```
，例如在内核启动之前通过
```
cudaMemset
```
。
您的
```
WarpReduce
```
函数不再被视为有效的 CUDA 编程。它依赖于隐式扭曲同步行为，这“不再被认为是安全的”。 <--That linked blog gives an example of a safe refactoring in listing 8.

compute-sanitizer --tool racecheck ...

测试时没有显示任何警告：

# cat t92.cu
#include <cassert>
#include <charconv>
#include <iostream>
#include <cuda_runtime.h>


#define CHECK(E) if(E!=cudaError_t::cudaSuccess) std::cerr<<"CUDA API FAILED, File: "<<__FILE__<<", Line: "<< __LINE__ << ", Error: "<< cudaGetErrorString(E) << std::endl;

using Type = unsigned;

template<unsigned blockSize, unsigned  slicesPerBlock>
__device__ void CopySlices(volatile const Type* pIn1, volatile Type* sdata, unsigned bid, unsigned tid) {
    volatile Type sum = 0;
#pragma unroll
    for (unsigned i = 0; i < slicesPerBlock; i++) {
        unsigned idx = bid * (slicesPerBlock * blockSize) + i * blockSize+ tid;
        sum += pIn1[idx];
        //__threadfence_block();
    }
    sdata[tid] = sum;
}

__device__ void WarpReduce(volatile Type* sdata, unsigned tid) {
    Type v = sdata[tid];
    v += sdata[tid+32]; __syncwarp();
    sdata[tid] = v;     __syncwarp();
    v += sdata[tid+16]; __syncwarp();
    sdata[tid] = v;     __syncwarp();
    v += sdata[tid+8];  __syncwarp();
    sdata[tid] = v;     __syncwarp();
    v += sdata[tid+4];  __syncwarp();
    sdata[tid] = v;     __syncwarp();
    v += sdata[tid+2];  __syncwarp();
    sdata[tid] = v;     __syncwarp();
    v += sdata[tid+1];  __syncwarp();
    sdata[tid] = v;
//#pragma unroll
//    for (unsigned s = 32; s > 0; s >>= 1) {
//        sdata[tid] += sdata[tid + s];
//    }
}

template<unsigned blockSize, unsigned slicesPerBlock>
__global__ void ReductionR1A0(size_t len, const Type *__restrict__ pIn1, Type *__restrict__ pOut1) {
    extern __shared__ Type smem[];

    const unsigned tid = threadIdx.x;
    //const size_t gid = blockIdx.x * blockDim.x + tid;

    // Phase 0. Init the output tensor
    // Comment out this phase and test the kernel with
    // `sudo compute-sanitizer --tool initcheck ./ReductionR1A0 12345`.
//    if (gid == 0) {
//        pOut1[0] = 0;
//    }


    // Phase 1. Copy the assigned chunk to the shared memory.
    CopySlices<blockSize, slicesPerBlock>(pIn1, smem, blockIdx.x, threadIdx.x);
    __syncthreads();

    // Phase 2. Tree Reduction
#pragma unroll
    for (unsigned s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) {
            smem[tid] += smem[tid + s];
        }
        __syncthreads();
    }

    if (tid < 32) { // Handling s=32,16,8,4,2,1
        WarpReduce(smem, tid);
    }



    // Phase 3. Accumulate the partial sum of this block with the rest.
    if (tid == 0) {
        atomicAdd(pOut1, smem[0]);
        // This is bad. For large samples, there will be huge amount of operations forcibly serialized by the atomic operation above across the grid.
        // It affects the final latency of the whole kernel, see [Thread Fence Reduction](https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/threadFenceReduction/threadFenceReduction_kernel.cuh).
    }

}

size_t GetPaddedLen(unsigned blockSize, size_t wordCount, unsigned slicesPerBlock) {
    assert(wordCount > 0);
    return ((wordCount - 1) / (slicesPerBlock * blockSize) + 1) * blockSize * slicesPerBlock;
}

int main(int argc, char *argv[]) {
    constexpr unsigned BLOCKSIZE = 512; // power of two only
    constexpr unsigned SLICESPERBANK = 8;

    unsigned LEN = 512*8*1024*4;
    unsigned LENPADDED = GetPaddedLen(BLOCKSIZE, LEN, SLICESPERBANK);

    auto *hIn1 = new Type[LENPADDED];
    auto *uut = new Type [1];
    Type *dIn1, *dOut1;

    CHECK(cudaMalloc((void **) &dIn1, LENPADDED*sizeof(Type)));
    CHECK(cudaMalloc((void **) &dOut1, sizeof(Type)));

    for (unsigned i = 0; i < LENPADDED; i++) {
        hIn1[i] = i < LEN ? 1 : 0;
    }

    CHECK(cudaMemcpy(dIn1, hIn1, LENPADDED * sizeof(Type), cudaMemcpyHostToDevice));

    Type gold = 0;
    for (size_t idx = 0; idx < LENPADDED; idx++) {
        gold += hIn1[idx];
    }

    {
        size_t grid = LENPADDED / (BLOCKSIZE * SLICESPERBANK);
        std::cout << "Grid: " << grid << std::endl;
        CHECK(cudaMemset(dOut1, 0, sizeof(dOut1[0])));
        ReductionR1A0<BLOCKSIZE, SLICESPERBANK><<<grid, BLOCKSIZE, BLOCKSIZE * sizeof(Type)>>>(LENPADDED, dIn1, dOut1);
    }

    CHECK(cudaMemcpy(uut, dOut1, 1 * sizeof(Type), cudaMemcpyDeviceToHost));

    std::cout << "Gold: " << gold << ", UUT: " << uut[0] << ", DIFF: " << (uut[0] - gold) << std::endl;
}
# nvcc -o t92 t92.cu
# ./t92
Grid: 4096
Gold: 16777216, UUT: 16777216, DIFF: 0
# compute-sanitizer ./t92
========= COMPUTE-SANITIZER
Grid: 4096
Gold: 16777216, UUT: 16777216, DIFF: 0
========= ERROR SUMMARY: 0 errors
# compute-sanitizer --tool racecheck ./t92
========= COMPUTE-SANITIZER
Grid: 4096
Gold: 16777216, UUT: 16777216, DIFF: 0
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
#

CUDA 12.2，L4 GPU

没有块级ThreadFence的随机缩减结果

问题描述投票：0回答：1

1个回答

最新问题

没有块级ThreadFence的随机缩减结果

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1