为什么OpenCL中不同的本地大小会产生不同的结果?

问题描述 投票:1回答:1

我正在尝试使用OpenCL执行基本的矩阵乘法算法。假定两个矩阵都具有相等的维度(SIZE x SIZE),所以我将问题定义为一个二维,其全局大小为SIZE x SIZE,并且我正在测试在不同的局部大小下会发生什么。

内核编写如下:

__kernel void matmul(
    __global unsigned int *a,
    __global unsigned int *b,
    __global unsigned int *c
) {
    /* Naive dense matrix multiply: each work-item produces exactly one
     * element of c = a * b.  The matrices are square, and the edge length
     * is taken from the NDRange itself, so the global range must be
     * size x size. */
    const int r = get_global_id(0);    /* row of the output element     */
    const int q = get_global_id(1);    /* column of the output element  */
    const int n = get_global_size(0);  /* matrix edge length            */

    /* Dot product of row r of a with column q of b. */
    unsigned int acc = 0u;
    for (int k = 0; k < n; ++k) {
        acc += a[r * n + k] * b[k * n + q];
    }

    c[r * n + q] = acc;
}

如果将全局和局部大小分别设置为1024 x 1024和1 x 1,则效果很好。但是,事实证明,如果局部大小是2 x 2或4 x 4,则乘法的结果将是错误的。现在,对于使用8的倍数的本地大小,例如8 x 8、16 x 16,...乘法没有错误。为什么会这样?

我不知道问题出在内核的编程中,还是我对工作组或工作项的处理方式了解不多。

完整的主机代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <CL/opencl.h>

#define SIZE (1024)
#define WORKITEMS (4096)
#define LOG_SIZE (2048)

int main(int argc, char *argv[]) {
    int i, j, k, size, errors;

    // Host memory
    cl_uint *a_host = NULL;
    cl_uint *b_host = NULL;
    cl_uint *c_host = NULL;
    cl_uint ref_dot;

    // Device memory
    cl_mem a_device;
    cl_mem b_device;
    cl_mem c_device;

    // Performance measurements
    struct timeval t0, tf;
    float ts, tp, tb;

    // OpenCL variables
    FILE *f;
    size_t f_size;
    size_t global[3] = {0}, local[3] = {0};
    char *buffer = NULL;
    cl_int ret;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    // [1] Initialize application

    // Read command line arguments to configure run
    size = (argc > 1) ? atoi(argv[1]) : SIZE;
    printf("Matrix multiplication with OpenCL (Size = %d)\n", size);

    // Allocate memory for host variables
    a_host = malloc(size * size * sizeof *a_host);
    b_host = malloc(size * size * sizeof *b_host);
    c_host = malloc(size * size * sizeof *c_host);

    // Initialize input arrays
    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            a_host[i * size + j] = rand();
            b_host[i * size + j] = rand();
        }
    }

    // [2] Initialize OpenCL environment

    // Get platform
    ret = clGetPlatformIDs(1, &platform, NULL);
    // Get device
    ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

    // Create context
    context = clCreateContext(0, 1, &device, NULL, NULL, &ret);

    // Create command queue
    queue = clCreateCommandQueueWithProperties(context, device, 0, &ret);

    // [3] Compile OpenCL kernel
    f = fopen("kernel.cl", "rb");
    fseek(f, 0, SEEK_END);
    f_size = ftell(f);
    rewind(f);

    // Read file into memory
    buffer = malloc(f_size + 1);
    buffer[f_size] = '\0';
    fread(buffer, sizeof(char), f_size, f);
    fclose(f);

    // Create program
    printf("<OpenCL> Kernel source:\n%s", buffer);
    program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &f_size, &ret);

    // Build program
    printf("<OpenCL> Building kernel...\n");
    gettimeofday(&t0, NULL);
    ret = clBuildProgram(program, 0, NULL, "-cl-std=CL2.0", NULL, NULL);
    gettimeofday(&tf, NULL);
    tb = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("Build time: %.3f ms\n", tb);

    // Print build log (optional)
    char log[LOG_SIZE];
    ret = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, LOG_SIZE, log, NULL);
    printf("<OpenCL> Kernel build log:\n%s\n", log);

    // [4] Configure OpenCL kernel

    // Create kernel
    kernel = clCreateKernel(program, "matmul", &ret);

    // Create device buffers
    a_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *a_host, NULL, &ret);
    b_device = clCreateBuffer(context, CL_MEM_READ_ONLY, size * size * sizeof *b_host, NULL, &ret);
    c_device = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size * size * sizeof *c_host, NULL, &ret);

    // Set kernel parameters
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_device);
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_device);
    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_device);

    // [5] Execute kernel
    printf("<OpenCL> Executing kernel...\n");
    gettimeofday(&t0, NULL);

    // Write data from host to device
    ret = clEnqueueWriteBuffer(queue, a_device, CL_TRUE, 0, size * size * sizeof *a_host, a_host, 0, NULL, NULL);
    ret |= clEnqueueWriteBuffer(queue, b_device, CL_TRUE, 0, size * size * sizeof *b_host, b_host, 0, NULL, NULL);

    // Enqueue kernel for execution
    global[0] = size;
    global[1] = size;
    local[0] = 2;
    local[1] = 2;
    ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);

    // Wait for kernel to finish
    ret = clFinish(queue);

    // Read data from device to host
    ret = clEnqueueReadBuffer(queue, c_device, CL_TRUE, 0, size * size * sizeof *c_host, c_host, 0, NULL, NULL);

    gettimeofday(&tf, NULL);
    tp = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("[PAR] Execution time: %.3f ms\n", tp);

    // [6] Print results, perform checks

    // Compute golden reference and check errors

    gettimeofday(&t0, NULL);
    errors = 0;

    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++) {
            ref_dot = 0;
            for (k = 0; k < size; k++) {
                ref_dot += a_host[i * size + k] * b_host[k * size + j];
            }

            if (ref_dot != c_host[i * size + j]) {
                errors++;
            }
        }
    }

    gettimeofday(&tf, NULL);
    ts = ((tf.tv_sec - t0.tv_sec) * 1000.0) + ((tf.tv_usec - t0.tv_usec) / 1000.0);
    printf("[SEQ] Execution time : %.3f ms\n", ts);
    printf("Found %d error%s\n", errors, (errors == 1) ? "" : "s");

    // [7] Cleanup system

    // Cleanup host variables
    free(a_host);
    free(b_host);
    free(c_host);
    free(buffer);

    // Cleanup OpenCL
    clReleaseMemObject(a_device);
    clReleaseMemObject(b_device);
    clReleaseMemObject(c_device);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}
c parallel-processing opencl matrix-multiplication
1个回答
0
投票

全局范围必须是本地范围的精确倍数。在 OpenCL 中,若不满足该条件,clEnqueueNDRangeKernel 通常会直接返回 CL_INVALID_WORK_GROUP_SIZE;如果设备仍然执行了这样的配置,GPU 可能会读取或写入未定义的内存区域——除非在内核开头、任何加载/存储之前用 if (row >= SIZE || col >= SIZE) return; 明确加以保护。出于性能原因,应尽量避免这一分支。

GPU 线程以 32 个为一组(称为 warp)执行,因此局部范围应至少为 32 或其倍数。例如,如果局部范围仅为 16,则每个 warp 有一半线程处于空闲状态,可用处理能力减半。适合您的最小局部尺寸为 8x8=64 ≥ 32。

一维示例:

  • 全局范围64,局部范围32:线程块1执行项目0-31,线程块2执行项目32-63。一切正常。

  • 全局范围64,局部范围40:线程块1执行项目0-39,线程块2执行项目40-79。线程块2会在项64-79对应的未定义内存区域中操作。编译器不会警告您,但最终您会看到不正确的结果。(注:严格遵循规范的 OpenCL 实现会在入队时直接拒绝这种不整除的配置,而不是越界执行。)

类似于第二个示例,当您的局部范围小于32时,例如局部范围为4x4=16 < 32,我怀疑warp的其余线程在未定义的内存区域中运行。

© www.soinside.com 2019 - 2024. All rights reserved.