我有一个带有 3 个
Color
变量的类 double
和另一个带有 Image
数组的类 Color
。问题是我无法在 GPU 代码上分配大小为 Color
的 1960*1080
数组:
#include <iostream>
// limited version of checkCudaErrors from helper_cuda.h in CUDA examples
#define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
void check_cuda(cudaError_t result, char const* const func, const char* const file,
int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":"
<< line << " '" << func << "' \n";
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(-1);
}
}
class Color {
public:
double r, g, b;
__host__ __device__ Color() : r(0.0), g(0.0), b(0.0) {
}
};
class Image {
public:
int width = -1;
int height = -1;
Color* frame_buffer = nullptr;
__device__ Image(int _width, int _height) : width(_width), height(_height) {
frame_buffer = new Color[width * height];
}
__device__ ~Image() {
delete frame_buffer;
}
};
__global__ void init_gpu_image(Image* image, int width,
int height) {
printf("block id: (%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);
printf("thread id: (%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);
*image = Image(width, height);
}
int main() {
int width = 1960;
int height = 1080;
printf("image dimension: %d\n", width * height);
printf("image size: %d\n", sizeof(Color) * width * height);
/*
// works fine when allocating with cudaMallocManaged()
Color* frame_buffer;
checkCudaErrors(cudaMallocManaged((void **)&frame_buffer, sizeof(Color) * width * height));
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
*/
Image* gpu_image;
checkCudaErrors(cudaMallocManaged((void **)&gpu_image, sizeof(Image)));
init_gpu_image<<<1, 1>>>(gpu_image, width, height);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
return 0;
}
完整输出:
image dimension: 2116800
image size: 50803200
block id: (0, 0, 0)
thread id: (0, 0, 0)
CUDA error = 700 at /home/wentao/Desktop/cuda-test/main.cu:68 'cudaDeviceSynchronize()'
CMakeLists.txt:
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
if (NOT CMAKE_CUDA_COMPILER)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
# required by CLion
endif ()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
project(cuda_test LANGUAGES CUDA CXX)
add_executable(cuda_test main.cu)
target_compile_options(cuda_test PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--expt-relaxed-constexpr
>)
new
和
delete
运算符依赖于设备端
malloc
(有关详细信息,请参阅 CUDA C++ 编程指南的本节),由于未设计,因此很可能会失败处理大型分配,例如您尝试分配的 50MB。 为了解决该问题,您可以使用以下方法手动扩展堆:
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (sizeof(Color) + sizeof(double)) * width * height);
我建议不要在设备堆上分配如此大的分配,而是建议研究 cudaMallocAsync
来优化分配(请参阅CUDA 运行时 API 文档的这一部分)。 为了将来参考,您可以使用
NVIDIA 的compute-sanitizer 来调试这些类型的问题。计算消毒程序清楚地突出显示了分配失败(malloc(...)
返回
nullptr
):
$ compute-sanitizer --show-backtrace device ./test
========= COMPUTE-SANITIZER
image dimension: 2116800
image size: 50803200
block id: (0, 0, 0)
thread id: (0, 0, 0)
========= Invalid __global__ write of size 8 bytes
========= at Image::Image(int, int)+0x3b0 in /tmp/tmp.QmgDY96jb8/test.cu:33
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x0 is out of bounds
========= and is 8,675,917,824 bytes before the nearest allocation at 0x205200000 of size 8,388,864 bytes
========= Device Frame:init_gpu_image(Image *, int, int)+0x290 in /tmp/tmp.QmgDY96jb8/test.cu:45
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========
CUDA error = 719 at test.cu:68 'cudaDeviceSynchronize()'
========= Target application returned an error
========= ERROR SUMMARY: 2 errors