我正在尝试在 CUDA 中实现任意大小的一维数组的前缀和。我试图弄清楚的第一步是将数组分成多个块。
在我的第一个实验中,我有一个大小为 8 的数组。我使用 https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39 中示例 39-2 中提供的并行算法为每个块指定 2 个块和 4 个线程.html
我的想法是
现在我陷入了步骤2。我无法分别从两个块中得到正确的结果。有人可以给我一些更正吗?谢谢!
下面是我的代码:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N 8
#define thread_num 4
#define block_num 2
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
float a[N], c[N], g[N];
timeval start, end;
float *dev_a, *dev_g;
int size = N * sizeof(float);
double d_gpuTime, d_cpuTime;
// initialize matrices a
for (int i = 0; i < N; i++)
{
// a[i] = (float)(rand() % 1000000) / 1000.0;
a[i] = i+1;
printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();
cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);
gettimeofday(&start, NULL);
scanCPU(c, a, N);
gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);
cudaFree(dev_a); cudaFree(dev_g);
for (int i = 0; i < N; i++)
{
printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}
printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}
__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;
int fake_n = block_num * thread_num;
int offset = 1;
if(bid * thread_num + thid<n){ temp[bid * thread_num + thid] = g_idata[bid * thread_num + thid];
}else{ temp[bid * thread_num + thid] = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.
for (int d = thread_num>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = bid * thread_num + offset*(2*thid+1)-1;
int bi = bid * thread_num + offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0)
{
temp[thread_num - 1] = 0;
}
// clear the last element
for (int d = 1; d < thread_num; d *= 2)
// traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = bid * thread_num + offset*(2*thid+1)-1;
int bi = bid * thread_num + offset*(2*thid+2)-1;
float t = temp[bid * thread_num + ai];
temp[ai] = temp[ bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[bid * thread_num + thid] = temp[bid * thread_num + thid];
}
void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
f_out[i] = f_out[i-1] + f_in[i-1];
}
您犯的基本错误是认为您将创建一个共享内存数组,该数组是要扫描的数组的完整大小。
相反,当我们将工作分解为线程块时,我们只创建一个对于每个线程块的大小来说足够大的共享内存数组。您很可能对共享内存有一些误解。从逻辑上讲,它是一个per-threadblock资源。每个线程块都有自己的、独立的、逻辑共享内存数组。此外,共享内存受限于每个SM可以支持的大小,或者逻辑线程块限制,通常为48KB。因此,尝试分配与数据数组大小相同的共享内存数组是行不通的,因为您的实际问题大小变得“大”:
prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
^
相反,通常的方法是为每个线程块分配足够的空间,在这种情况下,这由每个块的线程数决定:
prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
与此更改相对应,我们必须更改内核代码中的索引,以便全局内存访问使用全数组索引,但共享内存访问使用“本地”索引,该索引由共享内存数组的大小决定。
这其实很简单;我们需要从内核代码中删除使用或创建共享内存 (
bid*thread_num
) 索引的每个位置的所有 temp
构造。这样,一旦我们将数据存储在共享内存中,每个线程块的行为就其自己的共享内存副本而言是相同的。
这是一个有效的示例:
$ cat t80.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N 8
#define thread_num 4
#define block_num 2
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
float a[N], c[N], g[N];
timeval start, end;
float *dev_a, *dev_g;
int size = N * sizeof(float);
double d_gpuTime, d_cpuTime;
// initialize matrices a
for (int i = 0; i < N; i++)
{
// a[i] = (float)(rand() % 1000000) / 1000.0;
a[i] = i+1;
printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();
cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);
gettimeofday(&start, NULL);
scanCPU(c, a, N);
gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);
cudaFree(dev_a); cudaFree(dev_g);
for (int i = 0; i < N; i++)
{
printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}
printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}
__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;
int offset = 1;
if((bid * thread_num + thid)<n){ temp[thid] = g_idata[bid * thread_num + thid];
}else{ temp[thid] = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.
for (int d = thread_num>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0)
{
temp[thread_num - 1] = 0;
}
// clear the last element
for (int d = 1; d < thread_num; d *= 2)
// traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
float t = temp[ai];
temp[ai] = temp[ bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[bid * thread_num + thid] = temp[thid];
}
void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
f_out[i] = f_out[i-1] + f_in[i-1];
}
$ nvcc -arch=sm_60 -o t80 t80.cu
$ cuda-memcheck ./t80
========= CUDA-MEMCHECK
a[0] = 1.000000
a[1] = 2.000000
a[2] = 3.000000
a[3] = 4.000000
a[4] = 5.000000
a[5] = 6.000000
a[6] = 7.000000
a[7] = 8.000000
c[0] = 0.000, g[0] = 0.000
c[1] = 1.000, g[1] = 1.000
c[2] = 3.000, g[2] = 3.000
c[3] = 6.000, g[3] = 6.000
c[4] = 10.000, g[4] = 0.000
c[5] = 15.000, g[5] = 5.000
c[6] = 21.000, g[6] = 11.000
c[7] = 28.000, g[7] = 18.000
GPU Time for scan size 8: 0.004464
CPU Time for scan size 8: 0.000000
========= ERROR SUMMARY: 0 errors
$
请注意,无论我们在哪里访问全局内存数组,我们都会使用精心设计的索引从全局内存数组中选择块大小的块。但无论我们在共享内存中建立索引,我们都会使用为单个块大小的块设计的索引。