I tried to run the code below to compute the dot product of two vectors. When the number of GPUs entered is 1, i.e., OpenMP is not really used, the code runs fine. But when the number of GPUs is 2, the GPU result is always 0, and I don't know what is wrong. I am just using the usual parallel reduction in the GPU code and splitting the work across N GPUs. I have checked that the multi-GPU code runs fine when I don't use the parallel reduction in the GPU code, that is, when I let C[i] = A[i]+B[i] and compute the sum on the host.
// using multiple GPUs with OpenMP

// Includes
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>          // header for OpenMP
#include <cuda_runtime.h>

// Variables
float* h_A;   // host vectors
float* h_B;
float* h_C;
float* h_D;

// Functions
void RandomInit(float*, int);

// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    extern __shared__ float cache[];

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = 0.0;  // register for each thread
    while (i < N) {
        temp += A[i]*B[i];
        i += blockDim.x*gridDim.x;
    }

    cache[cacheIndex] = temp;  // set the cache value

    __syncthreads();

    // perform parallel reduction, threadsPerBlock must be 2^m
    int ib = blockDim.x/2;
    while (ib != 0) {
        if(cacheIndex < ib)
            cache[cacheIndex] += cache[cacheIndex + ib];
        __syncthreads();
        ib /=2;
    }

    if(cacheIndex == 0)
        C[blockIdx.x] = cache[0];
}

// Host code
int main(void)
{
    printf("\n");
    printf("Vector Dot Product with multiple GPUs \n");

    int N, NGPU, cpu_thread_id=0;
    int *Dev;
    long mem = 1024*1024*1024;  // 4 Giga for float data type.

    printf("Enter the number of GPUs: ");
    scanf("%d", &NGPU);
    printf("%d\n", NGPU);

    Dev = (int *)malloc(sizeof(int)*NGPU);

    int numDev = 0;
    printf("GPU device number: ");
    for(int i = 0; i < NGPU; i++) {
        scanf("%d", &Dev[i]);
        printf("%d ",Dev[i]);
        numDev++;
        if(getchar() == '\n') break;
    }
    printf("\n");
    if(numDev != NGPU) {
        fprintf(stderr,"Should input %d GPU device numbers\n", NGPU);
        exit(1);
    }

    printf("Enter the size of the vectors: ");
    scanf("%d", &N);
    printf("%d\n", N);
    if (3*N > mem) {
        printf("The size of these 3 vectors cannot be fitted into 4 Gbyte\n");
        exit(1);
    }
    long size = N*sizeof(float);

    // Set the sizes of threads and blocks
    int threadsPerBlock;
    printf("Enter the number of threads per block: ");
    scanf("%d", &threadsPerBlock);
    printf("%d\n", threadsPerBlock);
    if(threadsPerBlock > 1024) {
        printf("The number of threads per block must be less than 1024 ! \n");
        exit(1);
    }
    int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
    printf("The number of blocks is %d\n", blocksPerGrid);
    if(blocksPerGrid > 2147483647) {
        printf("The number of blocks must be less than 2147483647 ! \n");
        exit(1);
    }
    long sb = blocksPerGrid*sizeof(float);
    long sm = threadsPerBlock*sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(sb);
    if (! h_A || ! h_B || ! h_C) {
        printf("!!! Not enough memory.\n");
        exit(1);
    }

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // declare cuda event for timer
    cudaEvent_t start, stop;
    // cudaEventCreate(&start);  // events must be created after devices are set
    // cudaEventCreate(&stop);

    float Intime,gputime,Outime;
    double h_G = 0.0;

    omp_set_num_threads(NGPU);

    #pragma omp parallel private(cpu_thread_id)
    {
        float *d_A, *d_B, *d_C;
        cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(Dev[cpu_thread_id]);
        // cudaSetDevice(cpu_thread_id);

        // start the timer
        if(cpu_thread_id == 0) {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);
        }

        // Allocate vectors in device memory
        cudaMalloc((void**)&d_A, size/NGPU);
        cudaMalloc((void**)&d_B, size/NGPU);
        cudaMalloc((void**)&d_C, sb/NGPU);

        // Copy vectors from host memory to device memory
        cudaMemcpy(d_A, h_A+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);

        #pragma omp barrier

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Intime, start, stop);
            printf("Data input time for GPU: %f (ms) \n",Intime);
        }

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        VecAdd<<<blocksPerGrid, threadsPerBlock, sm>>>(d_A, d_B, d_C, N/NGPU);

        cudaDeviceSynchronize();

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &gputime, start, stop);
            printf("Processing time for GPU: %f (ms) \n",gputime);
            printf("GPU Gflops: %f\n",3*N/(1000000.0*gputime));
        }

        // Copy result from device memory to host memory
        // h_C contains the result in host memory

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        cudaMemcpy(h_C+blocksPerGrid/NGPU*cpu_thread_id, d_C, sb/NGPU, cudaMemcpyDeviceToHost);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // compute the solution
        for (int i = 0; i < blocksPerGrid; i++) {
            h_G += (double) h_C[i];
        }

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Outime, start, stop);
            printf("Data output time for GPU: %f (ms) \n",Outime);
        }
    }

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n",gputime_tot);

    // start the timer
    cudaEventRecord(start,0);

    double h_D = 0.0;  // compute the reference solution
    for (int i = 0; i < N; ++i)
        h_D += (double) h_A[i]*h_B[i];
    //    h_D[i] = 1.0/cos(h_A[i]) + 1.0/cos(h_B[i]);

    // stop the timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime( &cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n",cputime);
    printf("CPU Gflops: %f\n",3*N/(1000000.0*cputime));
    printf("Speed up of GPU = %f\n", cputime/gputime_tot);

    // Destroy timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result
    printf("Check result:\n");
    // for (int i = 0; i < N; ++i) {
    //     diff = abs(h_D[i] - h_C[i]);
    //     sum += diff*diff;
    // }
    double diff = abs( (h_D - h_G)/h_D );
    printf("|(h_G - h_D)/h_D|=%20.15e\n",diff);
    printf("h_G =%20.15e\n",h_G);
    printf("h_D =%20.15e\n",h_D);

    for (int i=0; i < NGPU; i++) {
        cudaSetDevice(i);
        cudaDeviceReset();
    }
    return 0;
}

// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
First of all, it is good practice to use proper CUDA error checking.
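As a minimal sketch of what that can look like (the CUDA_CHECK macro name here is my own choice, not something from your code; any equivalent wrapper works):

// check the return status of every CUDA runtime call
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(1); \
    } \
} while (0)

// usage, e.g.:  CUDA_CHECK(cudaMalloc((void**)&d_C, sb));
// and after a kernel launch:  CUDA_CHECK(cudaGetLastError());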
Evidently the work needs to be divided by the number of GPUs, but it's not immediately clear what your variables are supposed to mean. Let's put a stake in the ground and assume that blocksPerGrid defines the number of blocks in the kernel launch (for each GPU). That is consistent, at least, with the actual kernel invocation you have shown. If we start there, then blocksPerGrid must be "multiplied" (i.e., scaled up) by the number of GPUs in order to cover the entire problem size. Let's go through your code and "harmonize" the calculations. For example, with two GPUs, a vector size of 1048576, and 512 threads per block, we expect blocksPerGrid to be 1024, because 2x1024x512 = 1048576. That is consistent with your own calculation of blocksPerGrid and with the kernel invocation.
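In code terms (this is the blocksPerGrid line from your own program, with my commentary added):

// per-GPU block count: ceil(N / (threadsPerBlock*NGPU))
int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
// so NGPU * blocksPerGrid * threadsPerBlock >= N covers the whole vector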
1. This is wrong:

long sb = blocksPerGrid*sizeof(float);
...
h_C = (float*)malloc(sb);

The host storage for the results needs to match (at least) the overall problem size: one float item per block, times the number of GPUs. But sb is the per-GPU storage size; we need to multiply it by the number of GPUs.
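In the fixed code further down, that allocation becomes:

h_C = (float*)malloc(sb*NGPU);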
2. This is wrong:

cudaMalloc((void**)&d_C, sb/NGPU);

Because of your blocksPerGrid calculation, sb is already the per-GPU storage size. You should not divide it by NGPU again. When you do, each GPU now has threadblocks attempting to write results to an allocation that doesn't exist, and your kernel performs illegal behavior. Given a large enough problem and/or the use of compute-sanitizer, you would certainly witness this with the proper CUDA error checking I mentioned.
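The per-GPU device allocation should simply be:

cudaMalloc((void**)&d_C, sb);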
3. This is wrong:

cudaMemcpy(h_C+blocksPerGrid/NGPU*cpu_thread_id, d_C, sb/NGPU, cudaMemcpyDeviceToHost);

For the reasons already covered: h_C needs to cover the entire problem size, and the per-GPU portion of it is already expressed by blocksPerGrid, so the offset should not be divided by NGPU again; likewise sb is already the per-GPU size, so the copy length should not be divided by NGPU again either.
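So in the fixed code the copy reads:

cudaMemcpy(h_C+blocksPerGrid*cpu_thread_id, d_C, sb, cudaMemcpyDeviceToHost);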
4. This is wrong:

for (int i = 0; i < blocksPerGrid; i++) {
    h_G += (double) h_C[i];
}

We've already covered the fact that your blocksPerGrid calculation is inherently a per-GPU calculation; it does not cover the entire problem size in the multi-GPU case.
5. Your placement of the h_G calculation is incorrect. We require all OMP threads to be finished with their work before the h_G result is computed, so this calculation must come after the close of the OMP parallel region, to guarantee that every thread has updated its portion of h_C.
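Taking points 4 and 5 together, in the fixed code below the summation sits just after the closing brace of the parallel region and runs over every GPU's partial sums:

// after the omp parallel region has ended:
for (int i = 0; i < blocksPerGrid*NGPU; i++) {
    h_G += (double) h_C[i];
}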
The following code has changes to address those issues and appears to run correctly for me. To avoid user input and uncertainty, I have hard-coded some of the input values and changed the random initialization to one that is easy to assess for correctness:
$ cat t3.cu
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>          // header for OpenMP
#include <cuda_runtime.h>

// Variables
float* h_A;   // host vectors
float* h_B;
float* h_C;
float* h_D;

// Functions
void RandomInit(float*, int);

// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    extern __shared__ float cache[];

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = 0.0;  // register for each thread
    while (i < N) {
        temp += A[i]*B[i];
        i += blockDim.x*gridDim.x;
    }

    cache[cacheIndex] = temp;  // set the cache value

    __syncthreads();

    // perform parallel reduction, threadsPerBlock must be 2^m
    int ib = blockDim.x/2;
    while (ib != 0) {
        if(cacheIndex < ib)
            cache[cacheIndex] += cache[cacheIndex + ib];
        __syncthreads();
        ib /=2;
    }

    if(cacheIndex == 0)
        C[blockIdx.x] = cache[0];
}

// Host code
int main(void)
{
    printf("\n");
    printf("Vector Dot Product with multiple GPUs \n");

    int N, NGPU, cpu_thread_id=0;
    int *Dev;
    long mem = 1024*1024*1024;  // 4 Giga for float data type.

    printf("Enter the number of GPUs: ");
    //scanf("%d", &NGPU);
    NGPU = 2;
    printf("%d\n", NGPU);

    Dev = (int *)malloc(sizeof(int)*NGPU);

    int numDev = 0;
    printf("GPU device number: ");
    for(int i = 0; i < NGPU; i++) {
        //scanf("%d", &Dev[i]);
        Dev[i] = i;
        printf("%d ",Dev[i]);
        numDev++;
        // if(getchar() == '\n') break;
    }
    printf("\n");
    if(numDev != NGPU) {
        fprintf(stderr,"Should input %d GPU device numbers\n", NGPU);
        exit(1);
    }

    printf("Enter the size of the vectors: ");
    //scanf("%d", &N);
    N = 1048576;
    printf("%d\n", N);
    if (3*N > mem) {
        printf("The size of these 3 vectors cannot be fitted into 4 Gbyte\n");
        exit(1);
    }
    long size = N*sizeof(float);

    // Set the sizes of threads and blocks
    int threadsPerBlock;
    printf("Enter the number of threads per block: ");
    //scanf("%d", &threadsPerBlock);
    threadsPerBlock = 512;
    printf("%d\n", threadsPerBlock);
    if(threadsPerBlock > 1024) {
        printf("The number of threads per block must be less than 1024 ! \n");
        exit(1);
    }
    int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
    printf("The number of blocks is %d\n", blocksPerGrid);
    if(blocksPerGrid > 2147483647) {
        printf("The number of blocks must be less than 2147483647 ! \n");
        exit(1);
    }
    long sb = blocksPerGrid*sizeof(float);
    long sm = threadsPerBlock*sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(sb*NGPU);
    if (! h_A || ! h_B || ! h_C) {
        printf("!!! Not enough memory.\n");
        exit(1);
    }

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // declare cuda event for timer
    cudaEvent_t start, stop;
    // cudaEventCreate(&start);  // events must be created after devices are set
    // cudaEventCreate(&stop);

    float Intime,gputime,Outime;
    double h_G = 0.0;

    omp_set_num_threads(NGPU);

    #pragma omp parallel private(cpu_thread_id)
    {
        float *d_A, *d_B, *d_C;
        cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(Dev[cpu_thread_id]);
        // cudaSetDevice(cpu_thread_id);

        // start the timer
        if(cpu_thread_id == 0) {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);
        }

        // Allocate vectors in device memory
        cudaMalloc((void**)&d_A, size/NGPU);
        cudaMalloc((void**)&d_B, size/NGPU);
        cudaMalloc((void**)&d_C, sb);

        // Copy vectors from host memory to device memory
        cudaMemcpy(d_A, h_A+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);

        #pragma omp barrier

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Intime, start, stop);
            printf("Data input time for GPU: %f (ms) \n",Intime);
        }

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        VecAdd<<<blocksPerGrid, threadsPerBlock, sm>>>(d_A, d_B, d_C, N/NGPU);

        cudaDeviceSynchronize();

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &gputime, start, stop);
            printf("Processing time for GPU: %f (ms) \n",gputime);
            printf("GPU Gflops: %f\n",3*N/(1000000.0*gputime));
        }

        // Copy result from device memory to host memory
        // h_C contains the result in host memory

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        cudaMemcpy(h_C+blocksPerGrid*cpu_thread_id, d_C, sb, cudaMemcpyDeviceToHost);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Outime, start, stop);
            printf("Data output time for GPU: %f (ms) \n",Outime);
        }
    }

    // compute the solution
    for (int i = 0; i < blocksPerGrid*NGPU; i++) {
        h_G += (double) h_C[i];
    }

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n",gputime_tot);

    // start the timer
    cudaEventRecord(start,0);

    double h_D = 0.0;  // compute the reference solution
    for (int i = 0; i < N; ++i)
        h_D += (double) h_A[i]*h_B[i];
    //    h_D[i] = 1.0/cos(h_A[i]) + 1.0/cos(h_B[i]);

    // stop the timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime( &cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n",cputime);
    printf("CPU Gflops: %f\n",3*N/(1000000.0*cputime));
    printf("Speed up of GPU = %f\n", cputime/gputime_tot);

    // Destroy timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result
    printf("Check result:\n");
    // for (int i = 0; i < N; ++i) {
    //     diff = abs(h_D[i] - h_C[i]);
    //     sum += diff*diff;
    // }
    double diff = abs( (h_D - h_G)/h_D );
    printf("|(h_G - h_D)/h_D|=%20.15e\n",diff);
    printf("h_G =%20.15e\n",h_G);
    printf("h_D =%20.15e\n",h_D);

    for (int i=0; i < NGPU; i++) {
        cudaSetDevice(i);
        cudaDeviceReset();
    }
    return 0;
}

// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = 1.0f; //rand() / (float)RAND_MAX;
}
$ nvcc -o t3 t3.cu -Xcompiler -fopenmp
$ compute-sanitizer ./t3
========= COMPUTE-SANITIZER
Vector Dot Product with multiple GPUs
Enter the number of GPUs: 2
GPU device number: 0 1
Enter the size of the vectors: 1048576
Enter the number of threads per block: 512
The number of blocks is 1024
Data input time for GPU: 2.405280 (ms)
Processing time for GPU: 8.202272 (ms)
GPU Gflops: 0.383519
Data output time for GPU: 0.429728 (ms)
Total time for GPU: 11.037280 (ms)
Processing time for CPU: 2.361696 (ms)
CPU Gflops: 1.331978
Speed up of GPU = 0.213974
Check result:
|(h_G - h_D)/h_D|=0.000000000000000e+00
h_G =1.048576000000000e+06
h_D =1.048576000000000e+06
========= ERROR SUMMARY: 0 errors
$
I'm not suggesting I have found every possible error in your code. The only test case I have tried is the one depicted.