我在OpenCL中有这样一段代码:
// OpenCL C source of the `dot_product` kernel, assembled from adjacent string
// literals (they are concatenated with no newline or space between the pieces).
// Each work-item takes its id from get_global_id(0), multiplies `sz` consecutive
// elements of `weights` and `values` starting at index id * sz, sums the products,
// and stores the sum in result[id].
// The kernel performs no bounds check, so the host must only launch ids for which
// all three buffers actually hold elements.
std::string src = "__kernel void dot_product(__global float* weights,"
"__global float* values,"
"__global float* result,"
"__const unsigned int sz){"
"float dot = 0.f;"
"unsigned int i;"
"int current_idx = get_global_id(0);"
"unsigned int offset = current_idx * sz;"
"for( i = 0; i < sz; ++i )"
"{"
"dot += weights[ offset + i ] * values[ offset + i ];"
"}"
"result[current_idx] = dot;"
"}";
程序卡在 result[current_idx] = dot; 这一行;如果我把这一行注释掉,一切都正常。我不明白为什么它会卡住。
相关的C++代码在这里:
// Host-side driver excerpt: packs the per-neuron weights and values into two flat
// arrays, uploads them to device buffers, runs m_kernel and reads one float per
// neuron back. m_context, m_kernel, m_program and m_internal are declared outside
// this excerpt.
using namespace cl;
std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_weights;
std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_values;
// Number of floats in each input array, and the devices attached to the context
// (the first one is used for the command queue below).
const std::size_t size = in_weights.size();
std::vector< Device > devices =
m_context.getInfo< CL_CONTEXT_DEVICES >();
// Device buffers: two read-only inputs of `size` floats each and one write-only
// output holding CONST_NEURONS_NUMBER floats.
Buffer weights(m_context, CL_MEM_READ_ONLY, size * sizeof(float));
Buffer values(m_context, CL_MEM_READ_ONLY, size * sizeof(float));
Buffer product(m_context, CL_MEM_WRITE_ONLY, CONST_NEURONS_NUMBER * sizeof(float));
// Debug trace: prints the file name immediately followed by the line number.
std::cout << __FILE__ << __LINE__ << std::endl;
// Bind the four kernel arguments by position (presumably weights, values, result
// and sz of the dot_product kernel — confirm m_kernel was created from it).
m_kernel.setArg(0, weights);
m_kernel.setArg(1, values);
m_kernel.setArg(2, product);
// NOTE(review): the kernel parameter is an unsigned int; confirm that
// CONST_INPUTS_NUMBER has a 4-byte type, since the argument size is taken from it.
m_kernel.setArg(3, CONST_INPUTS_NUMBER);
// Command queue on the first device of the context.
CommandQueue queue(m_context, devices[0]);
try {
std::vector< float > dotProducts(CONST_NEURONS_NUMBER);
for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {
// Flatten m_internal row by row: entry j of neuron i is stored at
// index i * CONST_INPUTS_NUMBER + j in both host arrays.
for(std::size_t j = 0; j < CONST_INPUTS_NUMBER; ++j) {
const std::size_t index = i * CONST_INPUTS_NUMBER + j;
in_weights[index] = m_internal[i][j].weight;
in_values[index] = m_internal[i][j].value;
}
}
// Blocking (CL_TRUE) uploads of the two host arrays, starting at byte offset 0.
queue.enqueueWriteBuffer(weights,
CL_TRUE,
0,
in_weights.size() * sizeof(float),
in_weights.data());
queue.enqueueWriteBuffer(values,
CL_TRUE,
0,
in_values.size() * sizeof(float),
in_values.data());
// NOTE(review): the second argument of enqueueNDRangeKernel is the global offset
// and the third is the global size. Each iteration therefore launches
// CONST_INPUTS_NUMBER work-items whose ids start at `offset`, and the kernel is
// enqueued CONST_NEURONS_NUMBER times against the same output buffer. Ids that
// reach CONST_NEURONS_NUMBER or more index past the buffers created above, which
// is the likely cause of the reported hang. A single launch with cl::NullRange as
// offset and cl::NDRange(CONST_NEURONS_NUMBER) as global size gives one work-item
// per neuron.
for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {
queue.enqueueNDRangeKernel(m_kernel,
cl::NDRange(offset),
cl::NDRange(CONST_INPUTS_NUMBER));
}
std::cout << __FILE__ << __LINE__ << std::endl;
// Blocking (CL_TRUE) read of CONST_NEURONS_NUMBER floats into dotProducts.
queue.enqueueReadBuffer(product,
CL_TRUE,
0,
CONST_NEURONS_NUMBER * sizeof(float),
dotProducts.data());
std::cout << __FILE__ << __LINE__ << std::endl;
// Every neuron is handed the complete [begin, end) range of results.
// NOTE(review): confirm calculateOutput is meant to see all dot products rather
// than only dotProducts[i].
for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {
std::cout << __FILE__ << __LINE__ << std::endl;
m_internal[i].calculateOutput(dotProducts.begin(),
dotProducts.end());
}
} catch(const cl::Error& e) {
// NOTE(review): this handler prints the program build log for any cl::Error raised
// in the try block, although that block only writes buffers, enqueues the kernel
// and reads results back; `e` itself is never inspected, so the failing call and
// its error code are not reported.
cl_int err;
cl::STRING_CLASS buildlog =
m_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >(devices[0], &err);
std::cout << "Building error! Log: " << buildlog << std::endl;
}
程序卡在 result[current_idx] = dot; 这一行;如果我把这一行注释掉,一切正常。我不明白为什么它会卡住。
当您注释掉将计算结果写入输出缓冲区的行时,优化程序很可能会删除所有计算,而使您的内核为空。
我认为这是问题所在:
// Loop quoted from the question: one kernel launch per neuron, with the loop
// counter passed as the global offset and CONST_INPUTS_NUMBER as the global size.
for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {
queue.enqueueNDRangeKernel(m_kernel, cl::NDRange(offset), cl::NDRange(CONST_INPUTS_NUMBER));
}
具体来说,您在循环中把许多内核加入队列,而它们都在同一个输出缓冲区上工作,这导致每个内核都在争夺对同一缓冲区的访问,而且结果无论如何都会被覆盖。您只需要把内核入队一次,不带偏移,并使用 CONST_NEURONS_NUMBER 个全局工作项:
// Single launch: no global offset (cl::NullRange) and CONST_NEURONS_NUMBER
// work-items in total.
queue.enqueueNDRangeKernel(m_kernel, cl::NullRange, cl::NDRange(CONST_NEURONS_NUMBER));
CONST_INPUTS_NUMBER 已经作为内核参数传递了。