CUDA中素数的不正确加法[重复]

Question

我参考How to find the sum of array in CUDA by reduction在代码中使用归约逻辑。

但是它给出了一些错误。我没有弄错，可以请你帮我吗？

所需规格：1.Cuda工具包v6.52.显卡：GTX 210（计算能力1.2）3. Visual Studio 2013

#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>

#define SIZE 10
#define N 100

__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
    __shared__ int sdata[256];

    int i = threadIdx.x + (blockIdx.x*blockDim.x);

    sdata[threadIdx.x] = d_a[i];
    __syncthreads();

    if (i<SIZE)

    for (i = 2; i<SIZE; i++)
    {
        int counter = 0;
        for (int j = 2; j<d_a[i]; j++)
        {
            if (d_a[i] % j == 0)
            {
            counter = 1; break;
            }
        }
if (counter == 0)
        {
            d_b[i] = d_a[i];
        }

    }
    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * threadIdx.x;;

        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(d_c, sdata[0]);
}

}
int main()
{
    clock_t tic = clock();
    int *a, *b, *summation=0, sum = 0,count=-1;       //declare summation as double/long if needed
    int *d_a, *d_b, *d_c;

    //int blocks, block_size = 512;

    int size = N * sizeof(int); 

    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    summation = (int *)malloc(SIZE*sizeof(int));


    cudaMalloc((void**)&d_a, SIZE * sizeof(int));
    cudaMalloc((void**)&d_b, SIZE * sizeof(int));
    cudaMalloc((void**)&d_c, SIZE * sizeof(int));


    for (int i = 1; i<SIZE; i++)
    {
        a[i] = i;
        b[i] = 0;

    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    /*blocks = SIZE / block_size;
    if (SIZE% block_size != 0)
        blocks++;   */

    dim3 blocksize(256); // create 1D threadblock
    dim3 gridsize(N / blocksize.x);  //create 1D grid

    vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);


    //cudaThreadSynchronize();

    cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    for (int m = 0; m < SIZE; m++)
    {
        if (b[m] != 0)
        {
            printf("\n prime no is:%d", b[m]);
            count = count + 1;
        }
    }
    printf("\n\n Total prime no. are: %d", count);
/*      for (int j = 1; j<SIZE; j++)
    {
        sum = sum + b[j];
    }*/

    printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);

    clock_t toc = clock();
    printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);

    free(a);    free(b);    free(summation);
    cudaFree(d_a);      cudaFree(d_b);      cudaFree(d_c);

    getchar();  return 0;
}

Answer 1

您的代码中有很多错误：

cudaMalloc（（（void **）＆d_a，SIZE * sizeof（int））;

应该是：

cudaMalloc（（void **）＆d_a，N * sizeof（int））; //或
cudaMalloc（（void **）＆d_a，size）;

您已经计算过但没有通过。如果是malloc（），则相同//主机代码

CUDA中素数的不正确加法[重复]

问题描述投票：-1回答：1

1个回答

最新问题

CUDA中素数的不正确加法[重复]

问题描述 投票：-1回答：1

1个回答

最新问题

问题描述投票：-1回答：1