在 OpenCL 上计算逆矩阵

问题描述 投票:0回答:0

我对为什么为 OpenCL 编写的算法对小维矩阵正确工作,但对大维矩阵不能正确工作的问题很感兴趣。 此外,对于相同的输入数据,逆矩阵在 OpenCL 上每次都是不同的。 CPU 的常用算法使用高斯方法正确计算所有内容。 大型矩阵的 GPU 算法每次都会给出不同的错误答案。

不知是不是同步线程有问题?但是,我使用屏障在主机级别和内核本身添加了同步。 但一切都无济于事。我附上CPU和GPU(OpenCL)的源代码,请帮忙。

CPU代码(矩阵已经扩充为右侧的扩展单位矩阵):

void doInverse() {
    for (int i = 0; i < size; i++) getUpperTriangularMatrix(i);

    if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
    else {
        for (int i = 0; i < size; i++) {
            mainDiagonal(i);
            processingTopRows(i);
        }
    }
}

void getUpperTriangularMatrix(int i) {
    double elem = matrix[i][i];
    int iConst = i;
    i++;
    for (i; i < size; i++) {
        double elem2 = matrix[i][iConst];
        for (int j = 0; j < size * 2; j++) matrix[i][j] -= matrix[iConst][j] * elem2 / elem;
    }
}

void mainDiagonal(int i) {
    double count = matrix[i][i];
    for (int j = 0; j < size * 2; j++) matrix[i][j] /= count;
}

void processingTopRows(int i) {
    int iConst = i;
    i--;
    for (i; i >= 0; i--) {
        double elem = matrix[i][iConst];
        for (int j = 0; j < size * 2; j++) matrix[i][j] -= elem * matrix[iConst][j];
    }
}

bool isNullDeterminantMatrix() {
    for (int i = 0; i < size; i++)
        if (!matrix[i][i]) return true;
    return false;
}

GPU的代码(矩阵已经扩充为右侧的扩展单位矩阵):

cl_event event;
size_t globalSize[2] = { size, size * 2 };
cl_event eventRead = NULL;

cl_kernel kernel = clCreateKernel(program, "getUpperTriangularMatrix", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: creating kernel\n", ERROR_CREATING_KERNEL);

clStatus = clSetKernelArg(kernel, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);

clStatus = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
        
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);

clStatus = clReleaseKernel(kernel);

clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);

if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {

    cl_kernel kernel2 = clCreateKernel(program, "mainDiagonal_processingTopRows", &clStatus);
    //if (clStatus != CL_SUCCESS) error("Error: create kernel\n", ERROR_CREATING_KERNEL);

    clStatus = clSetKernelArg(kernel2, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
    //if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
    clStatus = clSetKernelArg(kernel2, 1, sizeof(int), (int*)&size);
    //if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);

    clStatus = clEnqueueNDRangeKernel(commandQueue, kernel2, 2, NULL, globalSize, NULL, 0, NULL, &event);
    //if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
    clWaitForEvents(1, &event);

    clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
    //if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
    clWaitForEvents(1, &eventRead);

    clStatus = clReleaseKernel(kernel2);
    }

    clStatus = clFlush(commandQueue);
    clStatus = clFinish(commandQueue);

内核代码(GPU):

#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void getUpperTriangularMatrix(__global double *matrix, int size) {
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    for (int ii = 0; ii < size; ii++) {
        if (i + 1 < size && i >= ii) {
            double elem = matrix[ii * size * 2 + ii];
            double elem2 = matrix[(i + 1) * size * 2 + ii];
            matrix[(i + 1) * size * 2 + j] -= matrix[ii * size * 2 + j] * elem2 / elem;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

__kernel void mainDiagonal_processingTopRows(__global double *matrix, int size) {
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    double count = matrix[i * size * 2 + i];
    matrix[i * size * 2 + j] /= count;

    for (int ii = 0; ii < size; ii++) {
        if (i - 1 >= 0 && i <= ii) {
            double elem = matrix[(i - 1) * size * 2 + ii];
            matrix[(i - 1) * size * 2 + j] -= elem * matrix[ii * size * 2 + j];
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
matrix gpu opencl cpu inverse
© www.soinside.com 2019 - 2024. All rights reserved.