为什么每次我尝试输入大数据时,我的 CUDA 程序都会退出并显示代码 -1073741571?

问题描述 投票:0回答:1

这是我的程序代码。而且这个错误甚至不限于此代码,无论我编写什么程序,它都不会输入大数据。并不是说我现在必须对这么大的数据做任何事情。但我只是想知道,为什么它会出现这样的错误。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <stdio.h>
#include <math.h>

cudaError_t multiplyWithCuda(int* d_P, int* d_M, int* d_N, int size);

__global__ void multiplyKernel(int* d_P, int* d_M, int* d_N, int size)
{
    int row = blockIdx.x*blockDim.x+ threadIdx.x ;
    int column = blockIdx.y * blockDim.y + threadIdx.y;
    if ((row  < size) && (column <size) )
    {
        float Pval = 0;
        for (int k = 0; k < size; k++)
        {
            Pval += d_M[row * size + k] * d_N[k*size+column];

        }
        d_P[row * size + column] = Pval;
    }
}

int main()
{
    
    const int max_n = 291;
    int n ;
    printf("Enter order of matrix: ");
    scanf("%d", &n);
    
    int a[max_n * max_n] = {0};
    int b[max_n * max_n] = {0};
    for (int ele1 = 0; ele1 < n*n; ele1++)
    {
        printf("Enter element %d for matrix1: ",(ele1+1));
        scanf("%d", &a[ele1]);
    }

    for (int ele2 = 0; ele2 < n * n; ele2++)
    {
        printf("Enter element %d for matrix 2: ", (ele2 + 1));
        scanf("%d", &b[ele2]);
    }
    int c[max_n* max_n] ;

    // Add vectors in parallel.
    clock_t t;
    t = clock();
    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, n);

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    for(int m=0; m < n; m++)
    {
        for (int l = 0; l < n; l++)
        {
            if (l == 0 && m > 0) { printf("\n%d ", c[l + n * m]); }
            else { printf("%d ", c[l + n * m]); }
        }
        
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    t = clock() - t;
    double time_taken = ((double)t) / CLOCKS_PER_SEC;
    printf("\n %f", time_taken);

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t multiplyWithCuda(int* c, int* a,  int* b, int size)
{
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    dim3 gridSize(ceil(size/32.0),ceil(size/32.0));
    dim3 blockSize(32,32);
    multiplyKernel <<<gridSize,blockSize >>> (dev_c, dev_a, dev_b,size);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

一旦将我的 max_n >292,它就开始显示错误。但它对于 291 和低阶矩阵非常有效。

c cuda nvidia
1个回答
0
投票

我曾经遇到过这些问题,但对于一个非常不同且更大的值,我认为它大约是 1000ich 或其他东西。

我使用Heap的时候就解决了。尝试在这里使用 calloc() 看到您将数组初始化为 0。如果这没有帮助,请告诉我。

其余代码看起来非常好。

© www.soinside.com 2019 - 2024. All rights reserved.