我正在尝试通过编写一个简单的程序来学习 OpenCL:对每个点,把两组数据在各个维度上之差的绝对值相加(即 SAD)。代码写完后,输出似乎是错误的,于是我在主机代码和内核中都加入了一些 printf,以验证所有变量是否被正确地传递给了内核。这样做之后我发现,输入变量并没有被正确地传递给内核,因为在内核中打印出来的数据不正确(准确地说,全都是 0)。我试过把数据类型从 uint8 改为 int,但似乎没有任何效果。我怎样才能在 OpenCL 中正确地把 uint8 变量发送到内存缓冲区?我实在找不出自己在写入和发送内存缓冲区时哪里做错了,导致它们显示不正确。希望得到任何意见、建议或帮助。
先谢谢你。
EDIT:问题现在已经解决了。我已经根据评论和回答部分提供的友好反馈更新了下面的代码。非常感谢
代码如下:
#include <iostream>
#include <chrono>
#include <CL/cl.hpp>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
// Value used in main() to initialise the platform handle before
// clGetPlatformIDs overwrites it. Must stay a literal 0 so it converts to a
// null handle.
#define USE_PLATFORM_NR 0
// 100 * 2^20. Parenthesised so the product behaves as a single value when the
// macro appears inside a larger expression (e.g. `x / SIZE`).
#define SIZE (100*1024*1024UL)
//SAD DEFINES
// Number of values per point and number of points for the SAD run.
#define NUM_DIM_SAD 5
#define NUM_POINTS_SAD 10
//#define NUM_LOOPS_SAD 20
// Seed passed to srand() so the SAD input data is reproducible.
#define SAD_SEED 2014
//NUM_LOOPS * NUM_POINTS should be 75M
//SSD DEFINES
#define NUM_DIM_SSD 128
#define NUM_POINTS_SSD 150000
//#define NUM_LOOPS_SSD 1000
#define SSD_SEED 2048
//NUM_LOOPS * NUM_POINTS should be 150M
// Threadblock sizes (e.g. for kernels )
#define TS 5
// =================================================================================================
// OpenCL C source of the "SAD" kernel, stored as one concatenated string
// literal.
//
// Kernel arguments:
//   num_points_sad - number of points (not read by the kernel body)
//   num_dim_sad    - number of values per point
//   m1_set, m2_set - input arrays, indexed as [Point * num_dim_sad + k]
//   sad_gpu        - output array, written at index Point
//
// Each work-item takes its index from get_global_id(0), prints its inputs for
// debugging, adds up abs(m1_set - m2_set) over its num_dim_sad values in an
// unsigned char accumulator and stores the result in sad_gpu[Point].
const char* kernelstring =
"__kernel void SAD(const int num_points_sad, const int num_dim_sad,"
" const global unsigned char* m1_set,"
" const global unsigned char* m2_set,"
" global unsigned char* sad_gpu) {"
" const int Point = get_global_id(0);"
" unsigned char acc = 0;"
// Debug output: the work-item index followed by every input pair it reads.
" printf(\" POINT: %d \\n \", Point); "
" for (int s=0; s<num_dim_sad ; s++) {"
" printf(\"GPU: i = %d | m1_set = %d| m2_set = %d \\n \",Point*num_dim_sad + s,m1_set[Point*num_dim_sad+s],m2_set[Point*num_dim_sad+s]);}"
// Accumulate the absolute differences for this point.
" for (int k=0; k<num_dim_sad; k++) {"
" acc += abs( m1_set[Point*num_dim_sad + k] - m2_set[Point*num_dim_sad + k] );"
" }"
" printf(\"ACC: %d \\n \",acc);"
" sad_gpu[Point] = acc;"
"}";
// =================================================================================================
// Matrix-multiplication using a custom OpenCL SGEMM kernel.
int main() {
cout << "Computing naive SAD & SSD for result checking" << endl;
//naive implementation on CPU for result checking
uint8_t* m1_set;// [NUM_POINTS][NUM_DIM];
uint8_t* m2_set;// [NUM_POINTS][NUM_DIM];
m1_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
m2_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
uint8_t* sad; // [NUM_POINTS];
uint8_t* sad_gpu;// [NUM_POINTS];
sad = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
sad_gpu = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
srand(SAD_SEED);
for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
{
sad[i/NUM_DIM_SAD] = 0;
m1_set[i] = rand() / (uint8_t)RAND_MAX;
m2_set[i] = rand() / (uint8_t)RAND_MAX;
cout << "CPU: i = " << i << "| m1_set = " << (unsigned int)m1_set[i] << "| m2_set = " << (unsigned int)m2_set[i] << endl;
}
for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
sad[i/NUM_DIM_SAD] += abs(m1_set[i] - m2_set[i]);
cl_int err;
// Configure the OpenCL environment
printf(">>> Initializing OpenCL...\n");
cl_platform_id platform = USE_PLATFORM_NR;
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetPlatformId"; return -1;}
cl_device_id device = 0;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetDeviceIDs"; return -1; }
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateContext"; return -1; }
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateCommandQueue"; return -1; }
char deviceName[1024];
err = clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetDeviceInfo"; return -1; }
cl_event event = NULL;
// Compile the kernel
cl_program program = clCreateProgramWithSource(context, 1, &kernelstring_sad, NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateProgramWithSource"; return -1; }
err = clBuildProgram(program, 0, NULL, "", NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clBuildProgram"; return -1; }
// Check for compilation errors
size_t logSize;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo"; return -1; }
char* messages = (char*)malloc((1 + logSize) * sizeof(char));
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo2"; return -1; }
messages[logSize] = '\0';
if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); }
free(messages);
// Prepare OpenCL memory objects
cl_mem buf_m1 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m1"; return -1; }
cl_mem buf_m2 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m2"; return -1; }
cl_mem buf_sad = clCreateBuffer(context, CL_MEM_READ_WRITE, NUM_POINTS_SAD * sizeof(uint8_t), NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_sad"; return -1; }
// Copy matrices to the GPU
err = clEnqueueWriteBuffer(queue, buf_m1, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m1_set, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m1"; return -1; }
err = clEnqueueWriteBuffer(queue, buf_m2, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m2_set, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m2"; return -1; }
err = clEnqueueWriteBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_sad"; return -1; }
// Configure the kernel and set its arguments
int num_points_sad = NUM_POINTS_SAD;
int num_dim_sad = NUM_DIM_SAD;
cl_kernel kernel = clCreateKernel(program, "SAD", &err);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel"; return -1; }
err = clSetKernelArg(kernel, 0, sizeof(int), (void*)&num_points_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg0"; return -1; }
err = clSetKernelArg(kernel, 1, sizeof(int), (void*)&num_dim_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg1"; return -1; }
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buf_m1);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg2"; return -1; }
err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buf_m2);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg3"; return -1; }
err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&buf_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel4"; return -1; }
// Start the timed loop
printf(">>> Starting SAD GPU run...\n");
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
// const size_t local[1] = { TS };
const size_t global[1] = { NUM_POINTS_SAD };
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, &event); //local
if (err != CL_SUCCESS) { cout << err << "clEnqueueNDRangeKernel"; return -1; }
// Wait for calculations to be finished
clWaitForEvents(1, &event);
// End the timed loop
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
// Copy the output matrix C back to the CPU memory
clEnqueueReadBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
std::cout << "Time difference = " << us << " us " << std::endl;
// Free the OpenCL memory objects
clReleaseMemObject(buf_m1);
clReleaseMemObject(buf_m2);
clReleaseMemObject(buf_sad);
// Clean-up OpenCL
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel);
for (int i = 0; i < NUM_POINTS_SAD; i++)
{
cout << "i: " << i;
cout << " | CPU: " << (unsigned int)sad[i];
cout << " | GPU: " << (unsigned int)sad_gpu[i];
cout << endl;
}
// Free the host memory objects
free(m1_set);
free(m2_set);
free(sad);
free(sad_gpu);
// Exit
return 0;
}
在创建上下文的函数中出现了一个错误--其中一个参数被传递到了错误的位置。
原来的写法是:
cl_context context = clCreateContext(NULL, 1, &device, NULL, &err, NULL);
应该是:
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
^^^^^^^^^^
另外,原来输出错误信息的方式也帮不上什么忙。应该写成这样:
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
cout << err << "clCreateContext";
return -1;
}
这样当错误发生时,我们就会停止代码的执行,并知道是哪个函数发生的。
内核中使用了错误的类型。OpenCL 中的 uint8 是一个向量类型,表示由 8 个 uint 类型的值组成的数组。
要解决这个问题,请在 OpenCL 内核中使用 uchar 或 unsigned char 类型,它相当于 C++ 中的 uint8_t 或 unsigned char。
见 OpenCL数据类型 和 标量数据类型.