Memory leak in CuPy (CUDA in Python)


I am using a raw CUDA kernel in a Python script. In the MWE below, I have a super simple raw kernel that does nothing at all. The code just creates a large array (around 2 GB) and passes it to the CUDA kernel.

MWE (Python - CuPy, not working):

import numpy as np
import cupy as cp


# custom raw kernel
custom_kernel = cp.RawKernel(r'''
extern "C" __global__
void custom_kernel(double* large_array)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int frame = blockIdx.z * blockDim.z + threadIdx.z;
}
''', 'custom_kernel')


# launch kernel
large_array_gpu = cp.zeros((101*101*9*9*301), dtype=cp.float64) # around 2 GB
block_dim_2 = (32, 32, 1)
bx2 = (101 * 101 + block_dim_2[0] - 1) // block_dim_2[0]  # ceiling division
by2 = (9 * 9 + block_dim_2[1] - 1) // block_dim_2[1]
bz2 = (301 + block_dim_2[2] - 1) // block_dim_2[2]
grid_dim_2 = (bx2, by2, bz2)

# Launch kernel
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu)) # gets stuck at this statement, and RAM usage keeps increasing

large_array_cpu = cp.asnumpy(large_array_gpu)

print('done')

Problem: As soon as the kernel is invoked at the line

custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu))

my RAM usage starts climbing (almost exponentially) toward the machine's full 32 GB, and the kernel never completes. GPU memory usage stays at about 2 GB, as expected, but CPU RAM usage keeps increasing. As a test I wrote a C++ version of the program, which works correctly and is reasonably fast (shown further below).

  • Why is there such a memory leak on the CPU side?
  • Why does the CUDA kernel never finish?

C++ version (works correctly):

#include <stdio.h>

// gpu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

__global__ void test_kernel(double* large_array)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int frame = blockIdx.z * blockDim.z + threadIdx.z;

    if (y < (9 * 9) && x < (101 * 101) && frame < 301)
    {
        int resultIdx = (frame * (101 * 101) * (9 * 9)) + (y * (101 * 101) + x);
        large_array[resultIdx] = 1.1;
    }
}

int main()
{
    printf("start...");

    cudaError_t cudaStatus;

    // device
    double* dev_largeArray = 0;

    // Memory allocations   
    cudaStatus = cudaMalloc((void**)&dev_largeArray, 101 * 101 * 9 * 9 * 301 * sizeof(double));
    cudaMemset(dev_largeArray, 0, 101 * 101 * 9 * 9 * 301 * sizeof(double)); // initialize the result with zeros

    dim3 blockSize(32, 32, 1);
    int bx2 = ((101 * 101) + blockSize.x - 1) / blockSize.x;
    int by2 = ((9 * 9) + blockSize.y - 1) / blockSize.y;
    int bz2 = (301 + blockSize.z - 1) / blockSize.z;
    dim3 gridSize = dim3(bx2, by2, bz2);
    test_kernel<<<gridSize, blockSize>>>(dev_largeArray);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
    }

    // Copy the results back to the host
    double* h_largeArray = new double[101 * 101 * 9 * 9 * 301];
    cudaStatus = cudaMemcpy(h_largeArray, dev_largeArray, 101 * 101 * 9 * 9 * 301 * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    delete[] h_largeArray;

    cudaFree(dev_largeArray);
    return 0;
}
Tags: python, cuda, cupy
1 Answer

There was a small (but not so obvious) problem in the kernel launch syntax: the kernel arguments must be passed as a tuple. After specifying the single argument I had to add a trailing comma, so the argument list should read (large_array_gpu, ) (note the trailing comma). Without the comma, (large_array_gpu) is just large_array_gpu itself, because parentheses alone do not create a tuple; CuPy then apparently tries to treat the array as the sequence of kernel arguments, iterating over its roughly 250 million elements on the host, which would explain both the runaway CPU RAM usage and the launch that never returns.
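That the parentheses do nothing here is plain Python semantics, not CuPy behavior. A quick illustration:

x = 5
print(type((x)))    # <class 'int'>   - parentheses alone are just grouping
print(type((x,)))   # <class 'tuple'> - the trailing comma creates the tuple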

Incorrect syntax:

# Launch kernel
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu))

Correct syntax:

# Launch kernel
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu, ))
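For completeness, a minimal sketch of the corrected ending of the MWE, reusing the names defined above. The commented-out two-argument launch is hypothetical, added only to illustrate that every argument, including scalars, belongs in the same tuple; CuPy generally expects scalar kernel arguments as explicitly typed NumPy values such as np.int32 rather than plain Python ints:

# Corrected launch: args is now a genuine one-element tuple
custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu,))

# The copy back to the host now returns normally
large_array_cpu = cp.asnumpy(large_array_gpu)

# Hypothetical kernel taking an extra scalar parameter, for illustration only:
# custom_kernel(grid_dim_2, block_dim_2, (large_array_gpu, np.int32(301)))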