OpenCL代码卡在写输出缓冲区上

问题描述 投票:0回答:1

我在OpenCL中有这样一段代码:

            // OpenCL C source for the `dot_product` kernel, assembled from adjacent string literals.
            // Each work-item takes its index from get_global_id(0), accumulates
            // weights[k] * values[k] over the `sz` consecutive elements that start at index * sz,
            // and stores the sum in result[index].
            std::string src = "__kernel void dot_product(__global float* weights,"
                                                        "__global float* values,"
                                                        "__global float* result,"
                                                        "__const unsigned int sz){"
                                "float dot = 0.f;"
                                "unsigned int i;"
                                "int current_idx = get_global_id(0);"
                                "unsigned int offset = current_idx * sz;"
                                "for( i = 0; i < sz; ++i )"
                                "{"
                                    "dot += weights[ offset + i ] * values[ offset + i ];"
                                "}"
                                "result[current_idx] = dot;"
                              "}";

它卡在 result[current_idx] = dot; 这一行上。如果我注释掉这一行,一切都正常。我不明白为什么它会卡住。

相关的 C++ 代码如下:

               // Host-side driver for m_kernel: copies the weight/value fields of m_internal into
               // two flat staging arrays, uploads them, launches the kernel, reads
               // CONST_NEURONS_NUMBER floats back and passes them to calculateOutput on every
               // m_internal[i].
               using namespace cl;      
                std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_weights;      
                std::array< float, CONST_INPUTS_NUMBER * CONST_NEURONS_NUMBER > in_values;      
                // Devices attached to the context; the command queue below uses devices[0]      
                const std::size_t size = in_weights.size();      
                std::vector< Device > devices =      
                 m_context.getInfo< CL_CONTEXT_DEVICES >();      
                // Device buffers: two read-only inputs of `size` floats each and one write-only
                // output of CONST_NEURONS_NUMBER floats.
                Buffer weights(m_context, CL_MEM_READ_ONLY, size * sizeof(float));      
                Buffer values(m_context, CL_MEM_READ_ONLY, size * sizeof(float));      
                Buffer product(m_context, CL_MEM_WRITE_ONLY, CONST_NEURONS_NUMBER * sizeof(float));      

                // Debug trace: prints the current file name and line number
                std::cout << __FILE__ << __LINE__ << std::endl;      

                // Set arguments to kernel      
                // NOTE(review): setArg forwards sizeof(T) bytes of its argument. If m_kernel is the
                // dot_product kernel, argument 3 is declared `unsigned int` there; confirm that
                // CONST_INPUTS_NUMBER has a matching 4-byte type.
                m_kernel.setArg(0, weights);      
                m_kernel.setArg(1, values);      
                m_kernel.setArg(2, product);      
                m_kernel.setArg(3, CONST_INPUTS_NUMBER);      
                CommandQueue queue(m_context, devices[0]);      

                try {        
                    std::vector< float > dotProducts(CONST_NEURONS_NUMBER);      
                    for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {      
                        // Flatten m_internal[i][j] into the staging arrays, row-major:
                        // element (i, j) is stored at i * CONST_INPUTS_NUMBER + j.
                        for(std::size_t j = 0; j < CONST_INPUTS_NUMBER; ++j) {      
                            const std::size_t index = i * CONST_INPUTS_NUMBER + j;      
                            in_weights[index] = m_internal[i][j].weight;      
                            in_values[index] = m_internal[i][j].value;      
                        }      
                    }        

                    // Upload both staging arrays; CL_TRUE requests blocking writes.
                    queue.enqueueWriteBuffer(weights,      
                                             CL_TRUE,      
                                             0,      
                                             in_weights.size() * sizeof(float),      
                                             in_weights.data());      
                    queue.enqueueWriteBuffer(values,      
                                             CL_TRUE,      
                                             0,      
                                             in_values.size() * sizeof(float),      
                                             in_values.data());      
                    // NOTE(review): in the OpenCL C++ bindings the 2nd argument of
                    // enqueueNDRangeKernel is the global work offset and the 3rd is the global
                    // work size. This loop therefore enqueues the kernel CONST_NEURONS_NUMBER
                    // times, each with CONST_INPUTS_NUMBER work-items whose ids start at `offset`.
                    // If the kernel writes result[get_global_id(0)], any id at or above
                    // CONST_NEURONS_NUMBER indexes past the end of `product` - verify the
                    // intended launch geometry.
                    for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {      
                        queue.enqueueNDRangeKernel(m_kernel,      
                                                   cl::NDRange(offset),      
                                                   cl::NDRange(CONST_INPUTS_NUMBER));      
                    }        

                    std::cout << __FILE__ << __LINE__ << std::endl;      
                    // Blocking read (CL_TRUE) of CONST_NEURONS_NUMBER floats into dotProducts.
                    queue.enqueueReadBuffer(product,      
                                            CL_TRUE,      
                                            0,      
                                            CONST_NEURONS_NUMBER * sizeof(float),      
                                            dotProducts.data());      

                    std::cout << __FILE__ << __LINE__ << std::endl;      
                    // Every m_internal[i] receives the full [begin, end) range of results.
                    for(std::size_t i = 0; i < CONST_NEURONS_NUMBER; ++i) {      
                        std::cout << __FILE__ << __LINE__ << std::endl;      
                        m_internal[i].calculateOutput(dotProducts.begin(),      
                                                      dotProducts.end());      
                    }        
                } catch(const cl::Error& e) {      
                    // NOTE(review): every cl::Error thrown in the try block lands here, including
                    // enqueue/read failures, yet the handler only prints the program build log
                    // under a "Building error!" label and never inspects `e`.
                    cl_int err;      
                    cl::STRING_CLASS buildlog =      
                     m_program.getBuildInfo< CL_PROGRAM_BUILD_LOG >(devices[0], &err);      
                    std::cout << "Building error! Log: " << buildlog << std::endl;    
                }            
c++ debugging opencl
1个回答
0
投票

它卡在 result[current_idx] = dot; 这一行上。如果我注释掉这一行,一切都正常。我不明白为什么它会卡住。

当您注释掉将计算结果写入输出缓冲区的行时,优化程序很可能会删除所有计算,而使您的内核为空。

我认为这是问题所在:

// Launch loop quoted from the question: one enqueue per iteration, with `offset` passed as the
// global work offset (2nd argument) and CONST_INPUTS_NUMBER as the global work size (3rd argument).
for(std::size_t offset = 0; offset < CONST_NEURONS_NUMBER; ++offset) {      
    queue.enqueueNDRangeKernel(m_kernel, cl::NDRange(offset), cl::NDRange(CONST_INPUTS_NUMBER));      
} 

具体来说,您在循环中把许多内核入队,而它们都操作同一个输出缓冲区,这会导致各个内核争抢对该缓冲区的访问,而且结果无论如何都会被覆盖。您需要只入队一次内核,不使用偏移量,并将全局工作项数量设为 CONST_NEURONS_NUMBER:

// Single launch: no global work offset (cl::NullRange) and a global work size of
// CONST_NEURONS_NUMBER, so global ids run from 0 to CONST_NEURONS_NUMBER - 1.
queue.enqueueNDRangeKernel(m_kernel, cl::NullRange, cl::NDRange(CONST_NEURONS_NUMBER));      

CONST_INPUTS_NUMBER 已作为内核参数传递。

© www.soinside.com 2019 - 2024. All rights reserved.