OpenCL：for 循环中的 CL_OUT_OF_RESOURCES

Question

我正在尝试运行一段 OpenCL 代码，但它返回了 CL_OUT_OF_RESOURCES 错误。情况如下：

我用 100 个工作项进行测试，因此将 global_sizes 和 local_sizes 都设置为 100。我创建了一个大小为 100 * 128 的只写缓冲区，每个工作项负责处理其中的 128 个值。我执行了内核，但在读取结果缓冲区时收到了该错误。

内核代码如下：

// Test kernel: each work-item fills its own 128-int slice of `debug` with 23,
// repeating the same fill 128000 times.
//
// debug: global int buffer. Work-item g = get_global_id(0) writes elements
//        [128 * g, 128 * g + 127], so the buffer must hold at least
//        128 ints per work-item.
__kernel void k2(__global int* debug) {
    // Index of the first element of this work-item's slice.
    uint idx = 128 * get_global_id(0);
    uint i, k;
    // The outer loop only repeats the inner fill; every pass stores the same values.
    for (i = 0; i < 128000; ++i) {
        for (k = 0; k < 128; ++k) {
            debug[idx+k] = 23;
        }
    }
}

我在变量 idx 中计算每个工作项的起始索引。然后，我将内层循环重复执行 128000 次（我知道这样做很傻，但这只是为了测试！），并把值 23 赋给缓冲区中对应的每个元素。

启动代码如下：

    // Status code of the most recent OpenCL call; checked through chk_err().
    cl_int status;

// Platform/device enumeration results: one device list (and its length)
// per discovered platform.
cl_uint num_platforms;
cl_platform_id *platforms;
cl_uint *num_devices;
cl_device_id **devices;

// The platform/device pair actually used for the run.
cl_platform_id platform;
cl_device_id device;

// OpenCL objects created on the selected device.
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;

// Device limits, queried for informational printing.
cl_ulong max_mem_size;
cl_ulong max_work_group_size;
size_t max_work_item_size[3];

// Discover the platforms: the first call only asks how many there are.
// NOTE(review): chk_err() is defined elsewhere; from its use here it appears
// to return true on failure and to abort when its third argument is true.
status = clGetPlatformIDs(0, NULL, &num_platforms);
chk_err(status, "Getting platform IDs", true);
if (num_platforms == 0) {
    // If no platforms are available, we shouldn't continue
    fprintf(stderr, "No OpenCL platforms found!\n");
    exit(-1);
}

// Get all the platforms
platforms = new cl_platform_id[num_platforms];

status = clGetPlatformIDs(num_platforms, platforms, NULL);
chk_err(status, "Getting platform IDs", true);

// Allocate space for the device lists and lengths
num_devices = new cl_uint[num_platforms];
devices = new cl_device_id*[num_platforms];

// Traverse the platforms array, querying platform information and
// populating the per-platform GPU device lists
for (cl_uint i = 0; i < num_platforms; ++i) {
    // Start from an empty list. Platforms without GPU devices keep a NULL
    // entry, so the cleanup code's delete[] devices[i] is a safe no-op
    // instead of freeing an uninitialized pointer.
    num_devices[i] = 0;
    devices[i] = NULL;

    // Query some platform info (the printing is currently disabled)
    char* name = get_platform_info(platforms[i], CL_PLATFORM_NAME,
            "Getting platform name");
    char* vendor = get_platform_info(platforms[i], CL_PLATFORM_VENDOR,
            "Getting platform vendor");
    //printf("Platform: %s\nVendor: %s\n", name, vendor);
    delete[] name;
    delete[] vendor;

    // Ask how many GPU devices this platform exposes
    status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices[i]);
    if (chk_err(status, "Getting device IDs")) {
        printf("This is a known NVIDIA bug (if platform == AMD then die)\n");
        printf("Setting number of devices to 0 and continuing\n");
        num_devices[i] = 0;
    }

    //printf("Devices: %u\n", num_devices[i]);

    // Populate OpenCL devices if any exist
    if (num_devices[i] != 0) {
        // Allocate an array of num_devices[i] device handles
        devices[i] = new cl_device_id[num_devices[i]];

        // Fill the array with the device handles
        status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices[i],
            devices[i], NULL);
        chk_err(status, "Getting device IDs", true);
    }
}

// Hard-coded selection: first device of the first platform.
cl_uint chosen_platform = 0;
cl_uint chosen_device = 0;

// Do a sanity check of platform/device selection
if (chosen_platform >= num_platforms ||
    chosen_device >= num_devices[chosen_platform]) {
    fprintf(stderr, "Invalid platform/device combination\n");
    exit(-1);
}

// Set the selected platform and device
platform = platforms[chosen_platform];
device = devices[chosen_platform][chosen_device];

// Get some device info
char* name = get_device_name(device);
char* vendor = get_device_vendor(device);
max_mem_size = get_device_max_mem_size(device);
max_work_group_size = get_device_max_work_group_size(device);
get_device_max_work_item_size(device, max_work_item_size);

// %llu requires an unsigned long long argument. cl_ulong and size_t are not
// guaranteed to be that exact type, so every value is cast explicitly.
// NOTE(review): the label says "Mb" but the value is only divided by 1024;
// confirm the unit returned by get_device_max_mem_size().
printf("Device: %s\n", name);
printf("Vendor: %s\n", vendor);
printf("Max mem size: %llu Mb\n", (unsigned long long)(max_mem_size / 1024));
printf("Max work group size: %llu\n", (unsigned long long)max_work_group_size);
printf("Max work item size: %llu, %llu, %llu\n",
        (unsigned long long)max_work_item_size[0],
        (unsigned long long)max_work_item_size[1],
        (unsigned long long)max_work_item_size[2]);

// The name/vendor strings are released with delete[] here.
delete[] name;
delete[] vendor;

// Create the context on the selected platform, containing only the chosen device
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,
    (cl_context_properties)(platform), 0};
context = clCreateContext(cps, 1, &device, NULL, NULL, &status);
chk_err(status, "Creating context", true);

// Create the command queue (default properties: in-order, no profiling)
queue = clCreateCommandQueue(context, device, 0, &status);
chk_err(status, "creating command queue", true);

// Load kernel source
// NOTE(review): load_kernel_source() is defined elsewhere; this code assumes it
// returns a non-NULL, NUL-terminated buffer allocated with new[].
char* source = load_kernel_source("vpm2.cl");
size_t source_size[] = { strlen(source) };

// Create the program object
program = clCreateProgramWithSource(context, 1, (const char**)&source,
        source_size, &status);
chk_err(status, "Creating program", true);
delete[] source;

// Try to compile the program
const char options[] = "-D ENABLE_DOUBLE -Werror -cl-nv-verbose";
status = clBuildProgram(program, 1, &device, options, NULL, NULL);

if (chk_err(status, "Building program")) {
    // Defaults keep the values below well-defined even if a query fails.
    cl_build_status build_status = CL_BUILD_NONE;

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
        sizeof(cl_build_status), &build_status, NULL);

    // First query the log length, then fetch the log itself.
    size_t size = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0,
        NULL, &size);

    // Value-initialized so the buffer is a valid (empty) string on failure.
    char* build_log = new char[size+1]();
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
        size+1, build_log, NULL);
    build_log[size] = '\0';

    printf("Build log:\n%s\nEnd log\n", build_log);
    // Release the log before the final check so it is not leaked.
    delete[] build_log;
    chk_err(build_status, "Getting build info", true);
}

// Look up the "k2" entry point in the built program.
kernel = clCreateKernel(program, "k2", &status);
chk_err(status, "Creating kernel", true);

// Output buffer: 128 ints for each of the 100 work-items.
uint num_workitems = 100;
uint buf_size = num_workitems * 128;
const size_t buf_bytes = buf_size * sizeof(int);

cl_mem mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buf_bytes, NULL, &status);
chk_err(status, "Error creating const mem buffer", true);

// The buffer is the kernel's only argument (index 0).
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);
chk_err(status, "Setting kernel arg", true);

// 1-D launch with global size == local size, i.e. a single work-group.
size_t global_sizes[1] = {num_workitems};
size_t local_sizes[1] = {num_workitems};
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
        global_sizes, local_sizes,
        0, NULL, NULL);
chk_err(status, "Executing kernel", true);

// Blocking read (CL_TRUE) of the whole buffer into host memory.
int* res = new int[buf_size];
status = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0,
        buf_bytes, res,
        0, NULL, NULL);
chk_err(status, "Reading buffer", true);

// Release the OpenCL objects; failures are reported but do not abort
status = clReleaseProgram(program);
chk_err(status, "Releasing program");
status = clReleaseKernel(kernel);
chk_err(status, "Releasing kernel");
status = clReleaseMemObject(mem);
chk_err(status, "Releasing mem object");
clReleaseCommandQueue(queue);
clReleaseContext(context);

// Free the per-platform device lists. An entry is only allocated when its
// platform reported at least one device, so platforms with no devices are
// skipped rather than passing an unallocated pointer to delete[].
for (cl_uint i = 0; i < num_platforms; ++i) {
    if (num_devices[i] != 0) {
        delete[] devices[i];
    }
}
delete[] devices;
delete[] num_devices;
delete[] platforms;

// res was allocated with new int[...], so it must be freed with delete[].
delete[] res;

起初我以为我超出了 idx+k 索引的范围，但事实并非如此。

这个错误真的很奇怪，因为例如，如果我将 idx+k 改为 idx+127，它就能正常工作。如果我把数字 128000 改为较小的数字，例如 56000，它也能正常工作（！），因此这就排除了内核创建/执行过程出错的可能。很神奇，不是吗？我开始怀疑是本地内存管理或类似的地方出了问题。有什么想法吗？

顺便说一下……我是在 NVIDIA Quadro 2000 上运行这段代码的。

非常感谢！

Answer 1

最可能的情况是您的内核发生了段错误（SEG_FAULT），于是返回了 CL_OUT_OF_RESOURCES——这是 NVIDIA 平台上内核发生段错误时给出的通用错误。但是，由于 clEnqueueNDRangeKernel 在将内核加入队列时无法检测到该错误，因此错误要到读取发生段错误的缓冲区时才会返回。

原因可能是：

您运行的工作项比您想象的要多（能让我们看看您是如何启动内核的吗？）
您分配的内存少于 debug 变量所需的内存。
内存标志不正确，例如被设成了只读，或者存在其他问题。

附注：既然您只运行 100 个工作项，那么我最初的假设就是错误的。

您的错误的另一种可能是：您仅用 1 个工作组向 120kB 的区域写入了 6GB 的数据，这会造成巨大的瓶颈，使内核需要很长时间才能运行完，从而被驱动程序杀死，并返回 CL_OUT_OF_RESOURCES。

减少循环次数可以解决这个问题；将 k 设为固定值会让编译器在优化阶段消除该循环（从而也能解决问题）。您可以试试使用更多工作组是否也能解决该问题。

您是否遇到过屏幕冻结约 2 秒的情况？如果是，那么这肯定就是问题所在。

Answer 2

您所看到的是 OpenCL 内核与操作系统之间争用 GPU 的结果。您的操作系统想要使用 GPU 来渲染窗口，而 OpenCL 希望用它来进行您的计算，因此屏幕会冻结。最终，操作系统终止了 OpenCL 程序对 GPU 的访问，从而导致上述错误消息。在我的电脑上（Intel GPU），dmesg 中记录了以下几行：

kernel: Asynchronous wait on fence 0000:00:02.0:kwin_x11[495]:1be343e timed out (hint:intel_atomic_commit_ready [i915])
kernel: i915 0000:00:02.0: [drm] GPU HANG: ecode 8:1:85ddfffb, in csim [541902]
kernel: i915 0000:00:02.0: [drm] Resetting rcs0 for stopped heartbeat on rcs0
kernel: i915 0000:00:02.0: [drm] csim[541902] context reset due to GPU hang

OpenCL：for 循环中的 CL_OUT_OF_RESOURCES

问题描述投票：0回答：2

2个回答

最新问题

OpenCL：for 循环中的 CL_OUT_OF_RESOURCES

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2