我对为什么为 OpenCL 编写的算法对小维矩阵正确工作,但对大维矩阵不能正确工作的问题很感兴趣。 此外,对于相同的输入数据,逆矩阵在 OpenCL 上每次都是不同的。 CPU 的常用算法使用高斯方法正确计算所有内容。 大型矩阵的 GPU 算法每次都会给出不同的错误答案。
不知是不是同步线程有问题?但是,我使用屏障在主机级别和内核本身添加了同步。 但一切都无济于事。我附上CPU和GPU(OpenCL)的源代码,请帮忙。
CPU代码(矩阵已经扩充为右侧的扩展单位矩阵):
void doInverse() {
for (int i = 0; i < size; i++) getUpperTriangularMatrix(i);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
for (int i = 0; i < size; i++) {
mainDiagonal(i);
processingTopRows(i);
}
}
}
void getUpperTriangularMatrix(int i) {
double elem = matrix[i][i];
int iConst = i;
i++;
for (i; i < size; i++) {
double elem2 = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= matrix[iConst][j] * elem2 / elem;
}
}
void mainDiagonal(int i) {
double count = matrix[i][i];
for (int j = 0; j < size * 2; j++) matrix[i][j] /= count;
}
void processingTopRows(int i) {
int iConst = i;
i--;
for (i; i >= 0; i--) {
double elem = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= elem * matrix[iConst][j];
}
}
bool isNullDeterminantMatrix() {
for (int i = 0; i < size; i++)
if (!matrix[i][i]) return true;
return false;
}
GPU的代码(矩阵已经扩充为右侧的扩展单位矩阵):
cl_event event;
size_t globalSize[2] = { size, size * 2 };
cl_event eventRead = NULL;
cl_kernel kernel = clCreateKernel(program, "getUpperTriangularMatrix", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: creating kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel);
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
cl_kernel kernel2 = clCreateKernel(program, "mainDiagonal_processingTopRows", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: create kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel2, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel2, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel2, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel2);
}
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
内核代码(GPU):
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void getUpperTriangularMatrix(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
for (int ii = 0; ii < size; ii++) {
if (i + 1 < size && i >= ii) {
double elem = matrix[ii * size * 2 + ii];
double elem2 = matrix[(i + 1) * size * 2 + ii];
matrix[(i + 1) * size * 2 + j] -= matrix[ii * size * 2 + j] * elem2 / elem;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
__kernel void mainDiagonal_processingTopRows(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
double count = matrix[i * size * 2 + i];
matrix[i * size * 2 + j] /= count;
for (int ii = 0; ii < size; ii++) {
if (i - 1 >= 0 && i <= ii) {
double elem = matrix[(i - 1) * size * 2 + ii];
matrix[(i - 1) * size * 2 + j] -= elem * matrix[ii * size * 2 + j];
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}