I tried to run the code below to compute the dot product of two vectors. When the number of GPUs entered is 1, i.e., OpenMP is not really used, the code runs fine. But when the number of GPUs is 2, the GPU result is always 0, and I don't know what is wrong. I am just using the usual parallel reduction in the GPU code and splitting the work across N GPUs. I have checked that the multi-GPU code runs fine when I don't use the parallel reduction in the GPU code, that is, when I let C[i] = A[i]+B[i] and compute the sum on the host.
// using multiple GPUs with OpenMP

// Includes
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>          // header for OpenMP
#include <cuda_runtime.h>

// Variables
float* h_A;   // host vectors
float* h_B;
float* h_C;
float* h_D;

// Functions
void RandomInit(float*, int);

// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    extern __shared__ float cache[];

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = 0.0;  // register for each thread
    while (i < N) {
        temp += A[i]*B[i];
        i += blockDim.x*gridDim.x;
    }

    cache[cacheIndex] = temp;  // set the cache value

    __syncthreads();

    // perform parallel reduction, threadsPerBlock must be 2^m
    int ib = blockDim.x/2;
    while (ib != 0) {
        if(cacheIndex < ib)
            cache[cacheIndex] += cache[cacheIndex + ib];
        __syncthreads();
        ib /=2;
    }

    if(cacheIndex == 0)
        C[blockIdx.x] = cache[0];
}

// Host code
int main(void)
{
    printf("\n");
    printf("Vector Dot Product with multiple GPUs \n");

    int N, NGPU, cpu_thread_id=0;
    int *Dev;
    long mem = 1024*1024*1024;  // 4 Giga for float data type.

    printf("Enter the number of GPUs: ");
    scanf("%d", &NGPU);
    printf("%d\n", NGPU);

    Dev = (int *)malloc(sizeof(int)*NGPU);

    int numDev = 0;
    printf("GPU device number: ");
    for(int i = 0; i < NGPU; i++) {
        scanf("%d", &Dev[i]);
        printf("%d ",Dev[i]);
        numDev++;
        if(getchar() == '\n') break;
    }
    printf("\n");
    if(numDev != NGPU) {
        fprintf(stderr,"Should input %d GPU device numbers\n", NGPU);
        exit(1);
    }

    printf("Enter the size of the vectors: ");
    scanf("%d", &N);
    printf("%d\n", N);
    if (3*N > mem) {
        printf("The size of these 3 vectors cannot be fitted into 4 Gbyte\n");
        exit(1);
    }
    long size = N*sizeof(float);

    // Set the sizes of threads and blocks
    int threadsPerBlock;
    printf("Enter the number of threads per block: ");
    scanf("%d", &threadsPerBlock);
    printf("%d\n", threadsPerBlock);
    if(threadsPerBlock > 1024) {
        printf("The number of threads per block must be less than 1024 ! \n");
        exit(1);
    }
    int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
    printf("The number of blocks is %d\n", blocksPerGrid);
    if(blocksPerGrid > 2147483647) {
        printf("The number of blocks must be less than 2147483647 ! \n");
        exit(1);
    }
    long sb = blocksPerGrid*sizeof(float);
    long sm = threadsPerBlock*sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(sb);
    if (! h_A || ! h_B || ! h_C) {
        printf("!!! Not enough memory.\n");
        exit(1);
    }

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // declare cuda event for timer
    cudaEvent_t start, stop;
    // cudaEventCreate(&start);  // events must be created after devices are set
    // cudaEventCreate(&stop);

    float Intime,gputime,Outime;
    double h_G = 0.0;

    omp_set_num_threads(NGPU);

    #pragma omp parallel private(cpu_thread_id)
    {
        float *d_A, *d_B, *d_C;
        cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(Dev[cpu_thread_id]);
        // cudaSetDevice(cpu_thread_id);

        // start the timer
        if(cpu_thread_id == 0) {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);
        }

        // Allocate vectors in device memory
        cudaMalloc((void**)&d_A, size/NGPU);
        cudaMalloc((void**)&d_B, size/NGPU);
        cudaMalloc((void**)&d_C, sb/NGPU);

        // Copy vectors from host memory to device memory
        cudaMemcpy(d_A, h_A+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);

        #pragma omp barrier

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Intime, start, stop);
            printf("Data input time for GPU: %f (ms) \n",Intime);
        }

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        VecAdd<<<blocksPerGrid, threadsPerBlock, sm>>>(d_A, d_B, d_C, N/NGPU);

        cudaDeviceSynchronize();

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &gputime, start, stop);
            printf("Processing time for GPU: %f (ms) \n",gputime);
            printf("GPU Gflops: %f\n",3*N/(1000000.0*gputime));
        }

        // Copy result from device memory to host memory
        // h_C contains the result in host memory

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        cudaMemcpy(h_C+blocksPerGrid/NGPU*cpu_thread_id, d_C, sb/NGPU, cudaMemcpyDeviceToHost);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // compute the solution
        for (int i = 0; i < blocksPerGrid; i++) {
            h_G += (double) h_C[i];
        }

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Outime, start, stop);
            printf("Data output time for GPU: %f (ms) \n",Outime);
        }
    }

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n",gputime_tot);

    // start the timer
    cudaEventRecord(start,0);

    double h_D = 0.0;  // compute the reference solution
    for (int i = 0; i < N; ++i)
        h_D += (double) h_A[i]*h_B[i];
    //    h_D[i] = 1.0/cos(h_A[i]) + 1.0/cos(h_B[i]);

    // stop the timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime( &cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n",cputime);
    printf("CPU Gflops: %f\n",3*N/(1000000.0*cputime));
    printf("Speed up of GPU = %f\n", cputime/gputime_tot);

    // Destroy timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result
    printf("Check result:\n");
    // for (int i = 0; i < N; ++i) {
    //     diff = abs(h_D[i] - h_C[i]);
    //     sum += diff*diff;
    // }
    double diff = abs( (h_D - h_G)/h_D );
    printf("|(h_G - h_D)/h_D|=%20.15e\n",diff);
    printf("h_G =%20.15e\n",h_G);
    printf("h_D =%20.15e\n",h_D);

    for (int i=0; i < NGPU; i++) {
        cudaSetDevice(i);
        cudaDeviceReset();
    }
    return 0;
}

// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
First of all, it is good practice to use proper CUDA error checking.
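As a minimal sketch of what that can look like (the CUDA_CHECK macro name here is my own choice, not something from your code; any equivalent wrapper works):

// check the return status of every CUDA runtime call
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(1); \
    } \
} while (0)

// usage, e.g.:  CUDA_CHECK(cudaMalloc((void**)&d_C, sb));
// and after a kernel launch:  CUDA_CHECK(cudaGetLastError());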
Evidently the work needs to be divided by the number of GPUs, but it's not immediately clear what your variables are supposed to mean. Let's put a stake in the ground and assume that blocksPerGrid defines the number of blocks in the kernel launch (for each GPU). That is consistent, at least, with the actual kernel invocation you have shown. If we start there, then blocksPerGrid must be "multiplied" (i.e., scaled up) by the number of GPUs in order to cover the entire problem size. Let's go through your code and "harmonize" the calculations. For example, with two GPUs, a vector size of 1048576, and 512 threads per block, we expect blocksPerGrid to be 1024, because 2x1024x512 = 1048576. That is consistent with your own calculation of blocksPerGrid and with the kernel invocation.
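In code terms (this is the blocksPerGrid line from your own program, with my commentary added):

// per-GPU block count: ceil(N / (threadsPerBlock*NGPU))
int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
// so NGPU * blocksPerGrid * threadsPerBlock >= N covers the whole vector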
1. This is wrong:

long sb = blocksPerGrid*sizeof(float);
...
h_C = (float*)malloc(sb);

The host storage for the results needs to match (at least) the overall problem size: one float item per block, times the number of GPUs. But sb is the per-GPU storage size; we need to multiply it by the number of GPUs.
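In the fixed code further down, that allocation becomes:

h_C = (float*)malloc(sb*NGPU);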
2. This is wrong:

cudaMalloc((void**)&d_C, sb/NGPU);

Because of your blocksPerGrid calculation, sb is already the per-GPU storage size. You should not divide it by NGPU again. When you do, each GPU now has threadblocks attempting to write results to an allocation that doesn't exist, and your kernel performs illegal behavior. Given a large enough problem and/or the use of compute-sanitizer, you would certainly witness this with the proper CUDA error checking I mentioned.
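The per-GPU device allocation should simply be:

cudaMalloc((void**)&d_C, sb);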
3. This is wrong:

cudaMemcpy(h_C+blocksPerGrid/NGPU*cpu_thread_id, d_C, sb/NGPU, cudaMemcpyDeviceToHost);

For the reasons already covered: h_C needs to cover the entire problem size, and the per-GPU portion of it is already expressed by blocksPerGrid, so the offset should not be divided by NGPU again; likewise sb is already the per-GPU size, so the copy length should not be divided by NGPU again either.
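So in the fixed code the copy reads:

cudaMemcpy(h_C+blocksPerGrid*cpu_thread_id, d_C, sb, cudaMemcpyDeviceToHost);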
4. This is wrong:

for (int i = 0; i < blocksPerGrid; i++) {
    h_G += (double) h_C[i];
}

We've already covered the fact that your blocksPerGrid calculation is inherently a per-GPU calculation; it does not cover the entire problem size in the multi-GPU case.
5. Your placement of the h_G calculation is incorrect. We require all OMP threads to be finished with their work before the h_G result is computed, so this calculation must come after the close of the OMP parallel region, to guarantee that every thread has updated its portion of h_C.
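Taking points 4 and 5 together, in the fixed code below the summation sits just after the closing brace of the parallel region and runs over every GPU's partial sums:

// after the omp parallel region has ended:
for (int i = 0; i < blocksPerGrid*NGPU; i++) {
    h_G += (double) h_C[i];
}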
The following code has changes to address those issues and appears to run correctly for me. To avoid user input and uncertainty, I have hard-coded some of the input values and changed the random initialization to one that is easy to assess for correctness:
$ cat t3.cu
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>          // header for OpenMP
#include <cuda_runtime.h>

// Variables
float* h_A;   // host vectors
float* h_B;
float* h_C;
float* h_D;

// Functions
void RandomInit(float*, int);

// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    extern __shared__ float cache[];

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    float temp = 0.0;  // register for each thread
    while (i < N) {
        temp += A[i]*B[i];
        i += blockDim.x*gridDim.x;
    }

    cache[cacheIndex] = temp;  // set the cache value

    __syncthreads();

    // perform parallel reduction, threadsPerBlock must be 2^m
    int ib = blockDim.x/2;
    while (ib != 0) {
        if(cacheIndex < ib)
            cache[cacheIndex] += cache[cacheIndex + ib];
        __syncthreads();
        ib /=2;
    }

    if(cacheIndex == 0)
        C[blockIdx.x] = cache[0];
}

// Host code
int main(void)
{
    printf("\n");
    printf("Vector Dot Product with multiple GPUs \n");

    int N, NGPU, cpu_thread_id=0;
    int *Dev;
    long mem = 1024*1024*1024;  // 4 Giga for float data type.

    printf("Enter the number of GPUs: ");
    //scanf("%d", &NGPU);
    NGPU = 2;
    printf("%d\n", NGPU);

    Dev = (int *)malloc(sizeof(int)*NGPU);

    int numDev = 0;
    printf("GPU device number: ");
    for(int i = 0; i < NGPU; i++) {
        //scanf("%d", &Dev[i]);
        Dev[i] = i;
        printf("%d ",Dev[i]);
        numDev++;
        // if(getchar() == '\n') break;
    }
    printf("\n");
    if(numDev != NGPU) {
        fprintf(stderr,"Should input %d GPU device numbers\n", NGPU);
        exit(1);
    }

    printf("Enter the size of the vectors: ");
    //scanf("%d", &N);
    N = 1048576;
    printf("%d\n", N);
    if (3*N > mem) {
        printf("The size of these 3 vectors cannot be fitted into 4 Gbyte\n");
        exit(1);
    }
    long size = N*sizeof(float);

    // Set the sizes of threads and blocks
    int threadsPerBlock;
    printf("Enter the number of threads per block: ");
    //scanf("%d", &threadsPerBlock);
    threadsPerBlock = 512;
    printf("%d\n", threadsPerBlock);
    if(threadsPerBlock > 1024) {
        printf("The number of threads per block must be less than 1024 ! \n");
        exit(1);
    }
    int blocksPerGrid = (N + threadsPerBlock*NGPU - 1) / (threadsPerBlock*NGPU);
    printf("The number of blocks is %d\n", blocksPerGrid);
    if(blocksPerGrid > 2147483647) {
        printf("The number of blocks must be less than 2147483647 ! \n");
        exit(1);
    }
    long sb = blocksPerGrid*sizeof(float);
    long sm = threadsPerBlock*sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(sb*NGPU);
    if (! h_A || ! h_B || ! h_C) {
        printf("!!! Not enough memory.\n");
        exit(1);
    }

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // declare cuda event for timer
    cudaEvent_t start, stop;
    // cudaEventCreate(&start);  // events must be created after devices are set
    // cudaEventCreate(&stop);

    float Intime,gputime,Outime;
    double h_G = 0.0;

    omp_set_num_threads(NGPU);

    #pragma omp parallel private(cpu_thread_id)
    {
        float *d_A, *d_B, *d_C;
        cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(Dev[cpu_thread_id]);
        // cudaSetDevice(cpu_thread_id);

        // start the timer
        if(cpu_thread_id == 0) {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start,0);
        }

        // Allocate vectors in device memory
        cudaMalloc((void**)&d_A, size/NGPU);
        cudaMalloc((void**)&d_B, size/NGPU);
        cudaMalloc((void**)&d_C, sb);

        // Copy vectors from host memory to device memory
        cudaMemcpy(d_A, h_A+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B+N/NGPU*cpu_thread_id, size/NGPU, cudaMemcpyHostToDevice);

        #pragma omp barrier

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Intime, start, stop);
            printf("Data input time for GPU: %f (ms) \n",Intime);
        }

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        VecAdd<<<blocksPerGrid, threadsPerBlock, sm>>>(d_A, d_B, d_C, N/NGPU);

        cudaDeviceSynchronize();

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &gputime, start, stop);
            printf("Processing time for GPU: %f (ms) \n",gputime);
            printf("GPU Gflops: %f\n",3*N/(1000000.0*gputime));
        }

        // Copy result from device memory to host memory
        // h_C contains the result in host memory

        // start the timer
        if(cpu_thread_id == 0) cudaEventRecord(start,0);

        cudaMemcpy(h_C+blocksPerGrid*cpu_thread_id, d_C, sb, cudaMemcpyDeviceToHost);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // stop the timer
        if(cpu_thread_id == 0) {
            cudaEventRecord(stop,0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime( &Outime, start, stop);
            printf("Data output time for GPU: %f (ms) \n",Outime);
        }
    }

    // compute the solution
    for (int i = 0; i < blocksPerGrid*NGPU; i++) {
        h_G += (double) h_C[i];
    }

    float gputime_tot;
    gputime_tot = Intime + gputime + Outime;
    printf("Total time for GPU: %f (ms) \n",gputime_tot);

    // start the timer
    cudaEventRecord(start,0);

    double h_D = 0.0;  // compute the reference solution
    for (int i = 0; i < N; ++i)
        h_D += (double) h_A[i]*h_B[i];
    //    h_D[i] = 1.0/cos(h_A[i]) + 1.0/cos(h_B[i]);

    // stop the timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);

    float cputime;
    cudaEventElapsedTime( &cputime, start, stop);
    printf("Processing time for CPU: %f (ms) \n",cputime);
    printf("CPU Gflops: %f\n",3*N/(1000000.0*cputime));
    printf("Speed up of GPU = %f\n", cputime/gputime_tot);

    // Destroy timer
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // check result
    printf("Check result:\n");
    // for (int i = 0; i < N; ++i) {
    //     diff = abs(h_D[i] - h_C[i]);
    //     sum += diff*diff;
    // }
    double diff = abs( (h_D - h_G)/h_D );
    printf("|(h_G - h_D)/h_D|=%20.15e\n",diff);
    printf("h_G =%20.15e\n",h_G);
    printf("h_D =%20.15e\n",h_D);

    for (int i=0; i < NGPU; i++) {
        cudaSetDevice(i);
        cudaDeviceReset();
    }
    return 0;
}

// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = 1.0f; //rand() / (float)RAND_MAX;
}
$ nvcc -o t3 t3.cu -Xcompiler -fopenmp
$ compute-sanitizer ./t3
========= COMPUTE-SANITIZER
Vector Dot Product with multiple GPUs
Enter the number of GPUs: 2
GPU device number: 0 1
Enter the size of the vectors: 1048576
Enter the number of threads per block: 512
The number of blocks is 1024
Data input time for GPU: 2.405280 (ms)
Processing time for GPU: 8.202272 (ms)
GPU Gflops: 0.383519
Data output time for GPU: 0.429728 (ms)
Total time for GPU: 11.037280 (ms)
Processing time for CPU: 2.361696 (ms)
CPU Gflops: 1.331978
Speed up of GPU = 0.213974
Check result:
|(h_G - h_D)/h_D|=0.000000000000000e+00
h_G =1.048576000000000e+06
h_D =1.048576000000000e+06
========= ERROR SUMMARY: 0 errors
$
I'm not suggesting I have found every possible error in your code. The only test case I have tried is the one depicted.