使用 CUDA 实现任意大小的前缀和

Question

我正在尝试在 CUDA 中实现任意大小的一维数组的前缀和。我试图弄清楚的第一步是将数组分成多个块。

在我的第一个实验中，我有一个大小为 8 的数组。我使用 https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39 中示例 39-2 中提供的并行算法为每个块指定 2 个块和 4 个线程.html

我的想法是

我将数组分成两个块，每四个线程将处理每个块（“块”）中的四个元素。
然后我期望分别从两个块中获得两个前缀和结果。
然后我可以将第一个“块”中前缀和输出的最后一个元素添加到第二个“块”中的所有元素。并得到最终答案。

现在我陷入了步骤2。我无法分别从两个块中得到正确的结果。有人可以给我一些更正吗？谢谢！

下面是我的代码：

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 8
#define thread_num 4
#define block_num 2


__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++)
{
//        a[i] = (float)(rand() % 1000000) / 1000.0;
    a[i] = i+1;
    printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);

gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++)
{
    printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}

printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}


__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern  __shared__  float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;
int fake_n = block_num * thread_num;


int offset = 1;
if(bid * thread_num + thid<n){ temp[bid * thread_num + thid]  = g_idata[bid * thread_num + thid];
}else{ temp[bid * thread_num + thid]  = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.

for (int d = thread_num>>1; d > 0; d >>= 1)
    // build sum in place up the tree
{
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        temp[bi] += temp[ai];
    }
    offset *= 2;
}

if (thid == 0)
{
    temp[thread_num - 1] = 0;
}

// clear the last element
for (int d = 1; d < thread_num; d *= 2)
    // traverse down tree & build scan
{
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
        int ai = bid * thread_num + offset*(2*thid+1)-1;
        int bi = bid * thread_num + offset*(2*thid+2)-1;
        float t = temp[bid * thread_num + ai];
        temp[ai]  = temp[ bi];
        temp[bi] += t;
    }
}
__syncthreads();

g_odata[bid * thread_num + thid] = temp[bid * thread_num + thid];
}

void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
    f_out[i] = f_out[i-1] + f_in[i-1];

}

Answer 1

您犯的基本错误是认为您将创建一个共享内存数组，该数组是要扫描的数组的完整大小。

相反，当我们将工作分解为线程块时，我们只创建一个对于每个线程块的大小来说足够大的共享内存数组。您很可能对共享内存有一些误解。从逻辑上讲，它是一个per-threadblock资源。每个线程块都有自己的、独立的、逻辑共享内存数组。此外，共享内存受限于每个SM可以支持的大小，或者逻辑线程块限制，通常为48KB。因此，尝试分配与数据数组大小相同的共享内存数组是行不通的，因为您的实际问题大小变得“大”：

prescan<<<block_num,thread_num,2*N*sizeof(float)>>>(dev_g, dev_a, N);
                                 ^

相反，通常的方法是为每个线程块分配足够的空间，在这种情况下，这由每个块的线程数决定：

prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);

与此更改相对应，我们必须更改内核代码中的索引，以便全局内存访问使用全数组索引，但共享内存访问使用“本地”索引，该索引由共享内存数组的大小决定。

这其实很简单；我们需要从内核代码中删除使用或创建共享内存 (

bid*thread_num

) 索引的每个位置的所有

temp

构造。这样，一旦我们将数据存储在共享内存中，每个线程块的行为就其自己的共享内存副本而言是相同的。

这是一个有效的示例：

$ cat t80.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 8
#define thread_num 4
#define block_num 2


__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++)
{
//        a[i] = (float)(rand() % 1000000) / 1000.0;
    a[i] = i+1;
    printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);

gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

prescan<<<block_num,thread_num,2*thread_num*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++)
{
    printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
}

printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}


__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern  __shared__  float temp[];
// allocated on invocation
int thid = threadIdx.x;
int bid = blockIdx.x;


int offset = 1;
if((bid * thread_num + thid)<n){ temp[thid]  = g_idata[bid * thread_num + thid];
}else{ temp[thid]  = 0;
} // Make the "empty" spots zeros, so it won't affect the final result.

for (int d = thread_num>>1; d > 0; d >>= 1)
    // build sum in place up the tree
{
    __syncthreads();
    if (thid < d)
    {
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;
        temp[bi] += temp[ai];
    }
    offset *= 2;
}

if (thid == 0)
{
    temp[thread_num - 1] = 0;
}

// clear the last element
for (int d = 1; d < thread_num; d *= 2)
    // traverse down tree & build scan
{
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
        int ai = offset*(2*thid+1)-1;
        int bi = offset*(2*thid+2)-1;
        float t = temp[ai];
        temp[ai]  = temp[ bi];
        temp[bi] += t;
    }
}
__syncthreads();

g_odata[bid * thread_num + thid] = temp[thid];
}

void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
    f_out[i] = f_out[i-1] + f_in[i-1];

}
$ nvcc -arch=sm_60 -o t80 t80.cu
$ cuda-memcheck ./t80
========= CUDA-MEMCHECK
a[0] = 1.000000
a[1] = 2.000000
a[2] = 3.000000
a[3] = 4.000000
a[4] = 5.000000
a[5] = 6.000000
a[6] = 7.000000
a[7] = 8.000000
c[0] = 0.000, g[0] = 0.000
c[1] = 1.000, g[1] = 1.000
c[2] = 3.000, g[2] = 3.000
c[3] = 6.000, g[3] = 6.000
c[4] = 10.000, g[4] = 0.000
c[5] = 15.000, g[5] = 5.000
c[6] = 21.000, g[6] = 11.000
c[7] = 28.000, g[7] = 18.000
GPU Time for scan size 8: 0.004464
CPU Time for scan size 8: 0.000000
========= ERROR SUMMARY: 0 errors
$

请注意，无论我们在哪里访问全局内存数组，我们都会使用精心设计的索引从全局内存数组中选择块大小的块。但无论我们在共享内存中建立索引，我们都会使用为单个块大小的块设计的索引。

使用 CUDA 实现任意大小的前缀和

问题描述投票：0回答：1

1个回答

最新问题

使用 CUDA 实现任意大小的前缀和

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1