多流cuda程序速度下降很多

问题描述 投票:0回答:0

我检查了很多整数表的正确性。我正在使用 GeForce RTX-3060 并行执行此操作。每个 gpu 线程检查一张表。
我使用两个 cuda 流,以便同时进行计算和数据传输。它运行良好,但将四个字节的结果从设备复制到主机会使速度降低一半。

const uint32_t TABLE_SIZE = 1 << 15, TABLES_NUMBER = 1 << 20;
const uint32_t BLOCKS_NUMBER = 64, BLOCK_SIZE = 256; // CUDA
const uint32_t ROUNDS_NUMBER = TABLES_NUMBER / (BLOCKS_NUMBER * BLOCK_SIZE);
const uint32_t BUFFER_SIZE = BLOCKS_NUMBER * BLOCK_SIZE * TABLE_SIZE * sizeof(uint32_t);


__global__
void test_tables(int32_t *buffer, int32_t *result) {
    int32_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    int32_t *elem = buffer + idx * TABLE_SIZE;
    float sum = 0;

    for (int i = 0; i < TABLE_SIZE; ++i) {
        sum += sinf(*elem) * sinf(*elem);
        elem++;
    }

    if (sum < 0.01) {
        *result = idx;
    }
}


int main() {
    cudaStream_t streams[2];
    int32_t *h_buffers[2];
    int32_t h_result[2] = {-1, -1};
    int32_t *d_buffers[2];
    int32_t *d_result;
    cudaEvent_t startEvent, stopEvent;
    float ms;

    check_cuda(cudaMallocHost((void **)&h_buffers[0], BUFFER_SIZE));
    check_cuda(cudaMallocHost((void **)&h_buffers[1], BUFFER_SIZE));
    check_cuda(cudaMalloc((void **)&d_buffers[0], BUFFER_SIZE));
    check_cuda(cudaMalloc((void **)&d_buffers[1], BUFFER_SIZE));
    check_cuda(cudaMalloc((void **)&d_result, 2 * sizeof(int32_t)));

    check_cuda(cudaStreamCreate(&streams[0]));
    check_cuda(cudaStreamCreate(&streams[1]));

    check_cuda(cudaEventCreate(&startEvent));
    check_cuda(cudaEventCreate(&stopEvent));
    check_cuda(cudaEventRecord(startEvent, 0));

#ifdef ONE_STREAM
    for (uint32_t i = 0; i < ROUNDS_NUMBER; ++i) {
        check_cuda(cudaMemcpyAsync(d_buffers[0], h_buffers[0], BUFFER_SIZE, cudaMemcpyHostToDevice, streams[0]));
        test_tables<<<BLOCKS_NUMBER, BLOCK_SIZE, 0, streams[0]>>>(d_buffers[0], d_result);
        check_cuda(cudaMemcpyAsync(h_result, d_result, sizeof(int32_t), cudaMemcpyDeviceToHost, streams[0]));
    }
#else // ONE_STREAM
    for (uint32_t i = 0; i < ROUNDS_NUMBER; ++i) {
        uint32_t ind = i % 2;
        check_cuda(cudaMemcpyAsync(d_buffers[ind], h_buffers[ind], BUFFER_SIZE, cudaMemcpyHostToDevice, streams[ind]));
        check_cuda(cudaStreamSynchronize(streams[1 - ind]));
        test_tables<<<BLOCKS_NUMBER, BLOCK_SIZE, 0, streams[ind]>>>(d_buffers[ind], d_result + ind);
        check_cuda(cudaMemcpyAsync(h_result + ind, d_result + ind, sizeof(int32_t), cudaMemcpyDeviceToHost, streams[ind]));
    }
#endif // ONE_STREAM

    check_cuda(cudaEventRecord(stopEvent, 0) );
    check_cuda(cudaEventSynchronize(stopEvent) );
    check_cuda(cudaEventElapsedTime(&ms, startEvent, stopEvent) );

    printf("Time (ms): %f\n", ms);

为了测试速度,我首先使用单流。数据传输时间为 5596ms。查表时间为4877ms。总执行时间为10452ms。由于传输时间大约等于检查时间,似乎两个流将大大提高性能。
这就是最初发生的事情。如果我只将数据复制到设备并检查表格,则执行时间为 5718 毫秒。但是如果我加上从设备复制4字节结果到主机的操作,速度减半,时间是10437ms。
由于我只需要有关表中是否存在错误的信息,因此我通过添加第三个流来解决问题,该流仅将 8 字节的结果从设备异步复制到主机。这样的话,执行速度又高了
是否可以仅使用两个流来保持高速?为什么传输 4 个字节的结果会使速度下降一半?

performance cuda memcpy cuda-streams
© www.soinside.com 2019 - 2024. All rights reserved.