大型数组的前缀扫描

问题描述 投票:0回答:1

我想使用GPUgem中的指令编写大型数组的前缀扫描,这是我的并行类的作业。我确实遵循了书中的所有步骤,但我的代码仍然无法工作。我让它适用于数组大小 4096,但它不适用于更大的数组。这是我的代码:

#include <stdio.h>
#include <sys/time.h>
#define THREADS 1024
typedef int mytype;

__global__ void phaseI(mytype *g_odata, mytype *g_idata, int n, mytype *aux)
{
  __shared__ mytype temp[THREADS];
  const int tid1 = threadIdx.x;
  int offset = 1;
  temp[2*tid1] = g_idata[2*tid1]; // load input into shared memory
  temp[2*tid1+1] = g_idata[2*tid1+1];
  for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up the tree
  {
    __syncthreads();
    if (tid1 < d)
    {
      int ai = offset*(2*tid1+1)-1;
      int bi = offset*(2*tid1+2)-1;
      temp[bi] += temp[ai];
    }
    offset *= 2;
  }
  __syncthreads();
  if (tid1 == 0) {
    aux[blockIdx.x] = temp[THREADS - 1]; 
    temp[THREADS - 1] = 0;
  }
 for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
    {
      offset >>= 1;
      __syncthreads();
      if (tid1 < d)
      {
         int ai = offset*(2*tid1+1)-1;
         int bi = offset*(2*tid1+2)-1;
         mytype t = temp[ai];
         temp[ai] = temp[bi];
         temp[bi] += t;
      }
    }
  __syncthreads();
  g_odata[2*thid] = temp[2*thid]; // write results to device memory
  g_odata[2*thid+1] = temp[2*thid+1];
  }

__global__ void phaseII(mytype *g_odata, mytype *aux, int n)
{
  const int tid1 = threadIdx.x;
  const int B = (n / THREADS);
  int offset = 1;
 for (int d = B>>1; d > 0; d >>= 1) // build sum in place up the tree
  {
    __syncthreads();
    if (tid1 < d)
    {
      int ai = offset*(2*tid1+1)-1;
      int bi = offset*(2*tid1+2)-1;
      temp[bi] += temp[ai];
    }
    offset *= 2;
  }
  __syncthreads();
  if (tid1 == 0 && blockIdx.x == 0) {
    aux[B - 1] = 0;
  }
for (int d = 1; d < B; d *= 2) // traverse down tree & build scan
    {
      offset >>= 1;
      __syncthreads();
      if (tid1 < d)
      {
         int ai = offset*(2*tid1+1)-1;
         int bi = offset*(2*tid1+2)-1;
         mytype t = temp[ai];
         temp[ai] = temp[bi];
         temp[bi] += t;
      }
    }
  __syncthreads();  
  g_odata[2*thid] += aux[blockIdx.x];
  g_odata[2*thid+1] += aux[blockIdx.x];
}

int main(int argc, char *argv[])
{
  if (argc != 2) {
    printf("usage: %s n\n", argv[0]);
    return -1;
  }
  const int n = atoi(argv[1]);
  mytype *h_i, *d_i, *h_o, *d_o, *d_temp;
  const int size = n * sizeof(mytype);
  h_i = (mytype *)malloc(size);
  h_o = (mytype *)malloc(size);
  if ((h_i == NULL) || (h_o == NULL)) {
    printf("malloc failed\n");
    return -1;
  }
  for (int i = 0; i < n; i++) {
    h_i[i] = i;
    h_o[i] = 0;
  }
  cudaMalloc(&d_i, size);
  cudaMalloc(&d_temp, (n / THREADS) );
  cudaMalloc(&d_o, size);
  cudaMemset(d_o, 0, size);
  cudaMemset(d_temp, 0, (n / THREADS));
  cudaMemcpy(d_i, h_i, size, cudaMemcpyHostToDevice);
  int blocks = n / THREADS;
  phaseI<<<blocks, THREADS / 2 >>>(d_o, d_i, n, d_temp);
  phaseII<<<blocks, THREADS / 2>>>(d_o, d_temp, n);
  cudaThreadSynchronize();
  cudaMemcpy(h_o, d_o, size, cudaMemcpyDeviceToHost);
  printf("\n");
  for (int i = 0; i < n ; i++) {
    printf(" %d", h_o[i]); 
  }
  printf("\n\n");

  return 0;
}

有人知道我做错了什么吗?

cuda gpgpu prefix-sum
1个回答
2
投票

我在您的代码中看到的一个可能的错误在这里:

  aux[thid] = temp[THREADS]; 

如果你的

temp
数组是
temp[1024]
,如你所说,并且每个块有 1024 个线程,如你所说,那么如果 THREADS 是 1024,temp[THREADS] 将越界访问你的共享内存数组(一个超过末尾。) 1024 个元素的数组仅具有从 0 到 1023 的有效索引。

除此之外,您似乎在问如何从共享内存数组(

temp
)中取出最后一个元素并将其放置在(大概是全局的)
aux
数组中的某个位置,该数组有一个元素每个区块。

这是一个完整的示例:

$ cat t831.cu
#include <stdio.h>

#define THREADS 1024
#define BLOCKS    20

__global__ void kernel(int *aux){

  __shared__ int temp[THREADS];
  temp[threadIdx.x] = threadIdx.x + blockIdx.x;
  __syncthreads();
  if (threadIdx.x == 0)
    aux[blockIdx.x] = temp[THREADS-1];
}

int main(){

  int *h_data, *d_data;
  const int dsize = BLOCKS*sizeof(int);
  h_data=(int *)malloc(dsize);
  cudaMalloc(&d_data, dsize);
  memset(h_data, 0, dsize);
  cudaMemset(d_data, 0, dsize);
  kernel<<<BLOCKS, THREADS>>>(d_data);
  cudaMemcpy(h_data, d_data, dsize, cudaMemcpyDeviceToHost);
  for (int i = 0; i < BLOCKS; i++) printf("%d, ", h_data[i]);
  printf("\n");
  return 0;
}

$ nvcc -o t831 t831.cu
$ cuda-memcheck ./t831
========= CUDA-MEMCHECK
1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042,
========= ERROR SUMMARY: 0 errors
$
最新问题
© www.soinside.com 2019 - 2024. All rights reserved.