如何修复OpenCL冻结？

Question

我正在尝试检测闪烁的像素。我先用 C++ 编写了代码，但后来意识到 CPU 并不适合这项任务，于是我找到了 OpenCL 库。我以前从未使用过它，而且也没有找到关于 OpenCL 的好文档。

OpenCLHelper.cpp

#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <CL/cl.hpp>


/// Reads an OpenCL C source file and builds it for the first GPU device of
/// the first available platform.
///
/// The context is created inside this function; it stays reachable through
/// the returned program (CL_PROGRAM_CONTEXT).
///
/// @param fileName Path of the kernel source file to load.
/// @return The program object. The build status code is printed to stdout;
///         when the build fails, the compiler's build log is printed to
///         stderr and the (unbuilt) program is still returned.
/// @throws std::runtime_error if no platform or no GPU device is found, or
///         if the source file cannot be opened.
cl::Program CreateProgram(const std::string& fileName) {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    // front() on an empty vector is undefined behaviour, so fail loudly instead.
    if (platforms.empty()) {
        throw std::runtime_error("CreateProgram: no OpenCL platform available");
    }

    auto platform = platforms.front();
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if (devices.empty()) {
        throw std::runtime_error("CreateProgram: no GPU device on the first OpenCL platform");
    }

    auto device = devices.front();

    std::ifstream file(fileName);
    // An unreadable file would otherwise silently produce an empty program.
    if (!file) {
        throw std::runtime_error("CreateProgram: cannot open kernel source '" + fileName + "'");
    }
    const std::string src((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

    // The length handed to OpenCL counts the source characters only; the
    // terminating NUL is not part of the program text.
    cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length()));

    cl::Context context(device);

    cl::Program program(context, sources);

    const cl_int buildStatus = program.build("-cl-std=CL1.2");
    std::cout << buildStatus << std::endl;
    if (buildStatus != CL_SUCCESS) {
        // The status code alone does not say what is wrong with the kernel.
        std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
    }
    return program;
}

main.cpp（不是文件的整个代码）

// Context the program was created in, recovered from the program object.
cl::Context context(program.getInfo<CL_PROGRAM_CONTEXT>());
// All devices attached to that context.
vector<cl::Device> devices(context.getInfo<CL_CONTEXT_DEVICES>());
// The first device of the context is the one used for the command queue.
cl::Device device(devices.front());

unsigned char* shootFrame(unsigned char *data) {

    unsigned char* frequencyImage = new unsigned char[pixelsPerFrame];
    strcopy(data, frequencyImage);


        cl_int err = 0;
        cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(unsigned char) * pixelsPerFrame * equalxFramesAtTheSameTime, lastFrames, &err);
        cout << err << endl;

        cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(unsigned char) * pixelsPerFrame, nullptr, &err);

        cl::Buffer var1(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int), &isLightOn, &err);cout << err << endl;

//I creates variables, because I can't use #define in there
        int equalxFramesAtTheSameTime2 = equalxFramesAtTheSameTime;
        cl::Buffer var2(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int), &equalxFramesAtTheSameTime2, &err);cout << err << endl;
        int thresholdPixel2 = thresholdPixel;
        cl::Buffer var3(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int), &thresholdPixel2, &err);cout << err << endl;
        int ppf = pixelsPerFrame;
        cl::Buffer var4(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int), &ppf, &err);cout << err << endl;

        cl::Buffer var5(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int), &currentFrameID, &err);cout << err << endl;

        cl::Kernel kernel(program, "ProcessImage"); cout << err << endl;

        err = kernel.setArg(0, var1); cout << err << endl;
        err = kernel.setArg(1, var2); cout << err << endl;
        err = kernel.setArg(2, var3); cout << err << endl;
        err = kernel.setArg(3, var4); cout << err << endl;
        err = kernel.setArg(4, var5); cout << err << endl;
        err = kernel.setArg(5, inBuf); cout << err << endl;
        err = kernel.setArg(6, outBuf); cout << err << endl;


        cl::CommandQueue queue(context, device);

        err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(pixelsPerFrame)); cout << err << endl;

        err = queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(unsigned char) * pixelsPerFrame, done); cout << err << endl;

        cout << "done: " << queue.finish() << endl;

        return getXYfromRawImage(done, frequencyImage, updown, leftright);
}

ProcessImage 内核源文件（OpenCL 内核代码）

// Per-pixel flicker check over a ring of buffered single-channel frames.
//
// Each work-item handles the pixel at get_global_id(0) and walks frameSize
// frames starting at currentFrameID, wrapping back to frame 0 once the index
// reaches frameSize. Consecutive samples are expected to alternate between
// "darker by more than thresholdPixel" and "brighter by more than
// thresholdPixel"; the walk stops at the first sample that does not.
//
// isLightOn       non-zero: the first expected change is a drop in brightness
// frameSize       number of frames stored in lastFrames
// thresholdPixel  minimum brightness difference that counts as a change
// pixelsPerFrame  number of pixels (bytes) per frame
// currentFrameID  index of the frame the walk starts at
// lastFrames      frames stored back to back, one byte per pixel
// outData         one byte per pixel; receives 1 if isWrongPixel is non-zero, else 0
__kernel void ProcessImage(const int isLightOn, const int frameSize, const int thresholdPixel, const int pixelsPerFrame, const int currentFrameID, __global unsigned char* lastFrames, __global unsigned char* outData) {
    const size_t gid = get_global_id(0);

    int isBegin = 1;

    bool mustBrightNow = !isLightOn;

    int lastPixel = 0;

    int isWrongPixel = 0;

    for (int i = currentFrameID; i < frameSize + currentFrameID; i++) {

        // Wrap the frame index around the ring of stored frames.
        int i2 = i;

        if (i >= frameSize) {
            i2 = i2 - frameSize;
        }

        int id = (i2 * pixelsPerFrame) + gid;

        if (isBegin == 1) {
            // The first sample only seeds the comparison.
            lastPixel = (int) lastFrames[ id ];
            isBegin = 0;

        } else {
            int currentPixel = (int) lastFrames[ id ];

            if (mustBrightNow == false) {
                // Expecting the pixel to have become darker.
                if (currentPixel + thresholdPixel < lastPixel) {
                    mustBrightNow = true;
                } else {
                    // Author's workaround: the intended value is 1, but the
                    // program was reported to freeze with '1' instead of '0'.
                    isWrongPixel = 0;

                    break;
                }
            } else {
                // Expecting the pixel to have become brighter.
                if (currentPixel - thresholdPixel > lastPixel) {
                    mustBrightNow = false;
                } else {
                    // Same workaround as above; the freeze was reported for both
                    // integer and boolean flags.
                    isWrongPixel = 0;

                    break;
                }
            }

            lastPixel = currentPixel;
        }

    }
    if (isWrongPixel == 0) {
        outData[gid] = (uchar) (0);
    } else {
        outData[gid] = (uchar) (1);
    }

}

所有 cout 输出的都是 0，所以“没有”明显的错误。

我知道，代码的某些部分未经过优化，但应该可以正常工作。

如果你想知道的话：unsigned char* lastFrames 中每个像素占一个 char（单色）。它的大小约为 2000 万字节（宽 × 高 × 保留的帧数）。也就是说它包含多个帧，这样我就可以在 OpenCL 中比较不同的帧。

那可能是什么？

启动内核的不是主线程，而是一个 pthread_t 线程，这会有问题吗？
OpenCL 多数情况下在第 146 帧冻结，有时则随机冻结。也许是内存方面的错误？
只有当我把 isWrongPixel 写成 1 或 true 时，它才会冻结。
当我写 0 或 false 时它可以正常工作，但我需要这个布尔值。
而且我还没有尝试过超过 500 帧的情况。

那么我做错了什么？

我知道我的语法不完全正确..

提前致谢

Answer 1

根据评论中的讨论，补充如下。

以下是如何使用图像的示例：

#include <CL/cl.hpp>
#include <vector>

std::vector<cl::Platform> clPlatforms;
cl::Platform::get(&clPlatforms);

// TODO Set correctly
cl::Device chosenDevice;
bool first = true;
for (auto &&platform : clPlatforms) {
    std::vector<cl::Device> clDevices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &clDevices);

    if (first) { // REMOVE
        chosenDevice = clDevices[0];
        first = false;
    }

    std::cout << platform.getInfo<CL_PLATFORM_NAME>()<<'\n';
    for (auto &&device : clDevices) {
        std::cout << device.getInfo<CL_DEVICE_NAME>()<<'\n';
    }
}

cl::Context context{chosenDevice};
// Possible values
// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/cl_image_format.html
cl::ImageFormat format{ CL_R, CL_UNSIGNED_INT8};

std::size_t imageWidth = 640;
std::size_t imageHeight = 480;
std::size_t numFrames = 128;
// Fill as sequences of rows for each 2D
std::uint8_t
    *input = new std::uint8_t[imageWidth * imageHeight * numFrames];

std::size_t i = 0;
for (std::size_t frameI = 0; frameI < numFrames; ++frameI)
    for (std::size_t y = 0; y < imageHeight; ++y)
        for (std::size_t x = 0; x < imageWidth; ++x)
            input[i++] = 0; // INIT

// Zeroes specify data format, see
// https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clCreateImage3D.html
// Note that images cannot be both read and write
cl::Image3D
    inImage{context,
            CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR,
            format,
            imageWidth,
            imageHeight,
            numFrames,
            0,
            0,
            input,
            nullptr};
cl::Image2D outImage{context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY,
                     format, imageWidth, imageHeight};

std::string source = "PASTE SOURCE HERE";
cl::Program program(context, source);
program.build("-cl-std=CL1.2");

cl::Kernel kernel(program, "ProcessImage");
kernel.setArg(0, (int)0);
kernel.setArg(1, (int)numFrames);
int thresholdPixel = 10; // SET
kernel.setArg(2, (int)thresholdPixel);
kernel.setArg(3, (int)(imageWidth * imageHeight));
int currentFrameID = 12; // SET
kernel.setArg(4, (int)currentFrameID);
kernel.setArg(5, inImage);
kernel.setArg(6, outImage);
cl::CommandQueue queue(context, chosenDevice);
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                           cl::NDRange(imageWidth, imageHeight));

std::uint8_t *output = new std::uint8_t[imageWidth * imageHeight];
// See
// https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueReadImage.html
cl::size_t<3> region;
region[0] = imageWidth;
region[1] = imageHeight;
region[2] = 1;
//Might as well block if the next call would be clFinish anyway.
queue.enqueueReadImage(outImage, true, cl::size_t<3>(), region, 0, 0,
                       output);

内核源码：

// Image-based flicker check: one work-item per (x, y) pixel.
//
// Walks frameSize slices of the 3D image starting at slice currentFrameID,
// wrapping to slice 0 once the index reaches frameSize. Each sample must
// differ from the previous one by more than thresholdPixel, alternating
// between darker and brighter; the first sample that does not marks the pixel
// with 1. The mark (0 or 1) is written to the x channel of outData.
__kernel void ProcessImage(const int isLightOn, const int frameSize, const int thresholdPixel, const int pixelsPerFrame, const int currentFrameID, read_only image3d_t lastFrames, write_only image2d_t outData) {
    const int2 pos = (int2)(get_global_id(0), get_global_id(1));
    const int endFrame = frameSize + currentFrameID;

    // true  -> the next sample has to be brighter than the previous one
    // false -> the next sample has to be darker than the previous one
    bool expectBrighter = !isLightOn;
    int previous = 0;
    int flagged = 0;

    for (int frame = currentFrameID; frame < endFrame; ++frame) {
        // Map the running index onto the stored slices.
        const int slice = (frame >= frameSize) ? (frame - frameSize) : frame;
        const int value = (int)read_imageui(lastFrames, (int4)(pos, slice, 0)).x;

        // The very first sample has nothing to be compared with.
        if (frame != currentFrameID) {
            const bool changedAsExpected = expectBrighter
                ? (value - thresholdPixel > previous)
                : (value + thresholdPixel < previous);

            if (!changedAsExpected) {
                flagged = 1;
                break;
            }
            // The next sample has to change in the opposite direction.
            expectBrighter = !expectBrighter;
        }

        previous = value;
    }

    write_imageui(outData, pos, (uint4)(flagged, 0, 0, 0));
}

我能够在我的 1050 Ti、Intel 7700HQ 和 Intel HD 630 上运行此代码而不会冻结，希望你也可以 :)

我用图像（image）替换了缓冲区，并把这个任务改成了“二维”的，这一点体现在 enqueueNDRangeKernel 的调用中。内核的逻辑完全相同，但用图像来索引像素更加自然。我不确定您是否了解系统中可能存在多个平台，枚举它们也许能让您用上 GPU。只要安装了最新的驱动程序，平台和设备就应该会被列出来，不需要别的东西。没有必要为 const 变量创建缓冲区，只需在 kernel.setArg 模板函数中传入类型正确的值即可。

试试吧:)

如何修复OpenCL冻结？

问题描述投票：1回答：1

1个回答

最新问题

如何修复OpenCL冻结？

问题描述 投票：1回答：1

1个回答

最新问题

问题描述投票：1回答：1