JOCL CL_OUT_OF_RESOURCES 仅当已分配值时

问题描述 投票:0回答:1

我目前正在 OpenCL 中开发 sha256 内核。如果我通过内核向内核写入值,我将无法读取内核的输出。如果我不向变量写入任何值,则代码可以正常工作。我尝试向变量写入不同的值,但仍然没有成功。这浪费了我近 3 天的空闲时间,我希望你能在我失去理智之前帮我解决这个问题。

这是我的依赖。

<!-- https://mvnrepository.com/artifact/org.jocl/jocl -->
    <dependency>
        <groupId>org.jocl</groupId>
        <artifactId>jocl</artifactId>
        <version>2.0.5</version>
    </dependency>

我还从此页面的 Utils 部分添加了 JOCL Struct Jar: http://www.jocl.org/utilities/utilities.html

这是我的主课

package org.joclforce;

import org.jocl.*;
import org.jocl.struct.Buffers;
import org.jocl.struct.CLTypes;
import org.jocl.struct.SizeofStruct;
import org.joclforce.Structs.WorkerInput;
import org.joclforce.Structs.WorkerOutput;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.util.stream.Collectors;

import static org.jocl.CL.*;
import static org.jocl.CL.clCreateKernel;

public class Main {
    private static String programSource = "";


    // CL state
    private static cl_context context;
    private static cl_command_queue commandQueue;
    private static cl_kernel kernel;
    private static cl_program program;


    public Main() {
        try {
            programSource = Main.getResourceFileAsString("sha256.cl");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        startKernel();
    }

    private void startKernel() {
        defaultInitialization();

        // Initialization of an array containing some inputs
        int n = 3;


        WorkerInput[] workerInputs = new WorkerInput[n];
        WorkerOutput[] workerOutputs = new WorkerOutput[n];
        for (int i=0; i<n; i++)
        {
            workerInputs[i] = new WorkerInput();
            workerOutputs[i] = new WorkerOutput();

            workerInputs[i].message = new CLTypes.cl_uint[8];
            workerOutputs[i].digest = new CLTypes.cl_uint[8];

            for( int c = 0; c < 8; c++ ) {
                workerInputs[i].message[c] = new CLTypes.cl_uint();
                workerOutputs[i].digest[c] = new CLTypes.cl_uint();

                workerInputs[i].message[c].set(0);
                workerOutputs[i].digest[c].set(0);
            }
        }
        int structSizeInput = SizeofStruct.sizeof(WorkerInput.class);
        int structSizeOutput = SizeofStruct.sizeof(WorkerOutput.class);

        // Allocate a buffer that can store the data
        ByteBuffer inputBuffer = Buffers.allocateBuffer(workerInputs);
        ByteBuffer outputBuffer = Buffers.allocateBuffer(workerOutputs);

        // Write the data into the buffer
        Buffers.writeToBuffer(inputBuffer, workerInputs);
        Buffers.writeToBuffer(outputBuffer, workerOutputs);

        // Allocate the memory object for the data that
        // contains the data from the struct buffer
        cl_mem inputsMem = clCreateBuffer(context,
                CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                (long) structSizeInput * n, Pointer.to(inputBuffer), null);

        cl_mem outputsMem = clCreateBuffer(context,
                CL_MEM_READ_WRITE,
                (long) structSizeOutput * n, Pointer.to(outputBuffer), null);

        // Set the arguments for the kernel
        clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(inputsMem));
        clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputBuffer));

        // Set the work-item dimensions
        long global_work_size[] = new long[]{n};

        // Execute the kernel
        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                global_work_size, null, 0, null, null);


        // Read back the data from to memory object to the object buffer
        clEnqueueReadBuffer(commandQueue, outputsMem, true, 0,
                (long) structSizeOutput * n, Pointer.to(outputBuffer), 0 , null, null);

        // Read the data from the object buffer back into the object
        outputBuffer.rewind();
        Buffers.readFromBuffer(outputBuffer, workerOutputs);


        // Print the result
        for (int i=0; i<n; i++)
        {
            System.out.println(i+": "+workerOutputs[i]);
        }

        // Clean up
        clReleaseMemObject(inputsMem);
        clReleaseMemObject(outputsMem);
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(commandQueue);
        clReleaseContext(context);
    }

    private void defaultInitialization()
    {
        // Obtain the platform IDs and initialize the context properties
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a GPU device
        context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
        if (context == null)
        {
            // If no context for a GPU device could be created,
            // try to create one for a CPU device.
            context = clCreateContextFromType(
                    contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);

            if (context == null)
            {
                System.out.println("Unable to create a context");
                return;
            }
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Get the list of GPU devices associated with the context
        long numBytes[] = new long[1];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_id for the first device
        int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                Pointer.to(devices), null);

        // Create a command-queue
        commandQueue =
                clCreateCommandQueue(context, devices[0], 0, null);

        // Create the program from the source code
        program = clCreateProgramWithSource(context,
                1, new String[]{ programSource }, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        kernel = clCreateKernel(program, "sha256", null);
    }


    public static void main(String[] args) {
        Main main = new Main();
    }

    /**
     * Reads given resource file as a string.
     *
     * @param fileName path to the resource file
     * @return the file's contents
     * @throws IOException if read fails for any reason
     */
    static String getResourceFileAsString(String fileName) throws IOException {
        ClassLoader classLoader = ClassLoader.getSystemClassLoader();
        try (InputStream is = classLoader.getResourceAsStream(fileName)) {
            if (is == null) return null;
            try (InputStreamReader isr = new InputStreamReader(is);
                 BufferedReader reader = new BufferedReader(isr)) {
                return reader.lines().collect(Collectors.joining(System.lineSeparator()));
            }
        }
    }
}

这是我的两个参数结构:

package org.joclforce.Structs;

import org.jocl.struct.ArrayLength;
import org.jocl.struct.CLTypes.*;
import org.jocl.struct.Struct;

import java.util.Arrays;

public class WorkerInput extends Struct {
    @ArrayLength(8)
    public cl_uint[] message;

    public String toString()
    {
        return "WorkerInput[" +
                "message="+ Arrays.toString(message)+"]";
    }

}

package org.joclforce.Structs;

import org.jocl.struct.ArrayLength;
import org.jocl.struct.CLTypes;
import org.jocl.struct.Struct;

import java.util.Arrays;

public class WorkerOutput extends Struct {
    @ArrayLength(8)
    public CLTypes.cl_uint[] digest;

    public String toString()
    {
        return "WorkerOutput[" +
                "digest="+ Arrays.toString(digest)+"]";
    }
}

最后这是我使用的内核

#ifndef uint8
#define uint8  unsigned char
#endif

#ifndef uint32
#define uint32 unsigned int
#endif

typedef struct {
    uint32 state[8];
} sha256_context;

typedef struct {
    uint32 message[8];
} sha256_input;

typedef struct {
    uint32 digest[8];
} sha256_output;

#define GET_UINT32(n,b,i)                       \
{                                               \
    (n) = ( (uint32) (b)[(i)    ] << 24 )       \
        | ( (uint32) (b)[(i) + 1] << 16 )       \
        | ( (uint32) (b)[(i) + 2] <<  8 )       \
        | ( (uint32) (b)[(i) + 3]       );      \
}

#define PUT_UINT32(n,b,i)                       \
{                                               \
    (b)[(i)    ] = (uint8) ( (n) >> 24 );       \
    (b)[(i) + 1] = (uint8) ( (n) >> 16 );       \
    (b)[(i) + 2] = (uint8) ( (n) >>  8 );       \
    (b)[(i) + 3] = (uint8) ( (n)       );       \
}

inline void sha256_starts(sha256_context * ctx)
{
    ctx->state[0] = 0x6A09E667;
    ctx->state[1] = 0xBB67AE85;
    ctx->state[2] = 0x3C6EF372;
    ctx->state[3] = 0xA54FF53A;
    ctx->state[4] = 0x510E527F;
    ctx->state[5] = 0x9B05688C;
    ctx->state[6] = 0x1F83D9AB;
    ctx->state[7] = 0x5BE0CD19;
}

inline void sha256_process(sha256_context * ctx, uint8 data[64])
{
    uint32 temp1, temp2, W[64];
    uint32 A, B, C, D, E, F, G, H;

    GET_UINT32(W[0], data, 0);
    GET_UINT32(W[1], data, 4);
    GET_UINT32(W[2], data, 8);
    GET_UINT32(W[3], data, 12);
    GET_UINT32(W[4], data, 16);
    GET_UINT32(W[5], data, 20);
    GET_UINT32(W[6], data, 24);
    GET_UINT32(W[7], data, 28);
    GET_UINT32(W[8], data, 32);
    GET_UINT32(W[9], data, 36);
    GET_UINT32(W[10], data, 40);
    GET_UINT32(W[11], data, 44);
    GET_UINT32(W[12], data, 48);
    GET_UINT32(W[13], data, 52);
    GET_UINT32(W[14], data, 56);
    GET_UINT32(W[15], data, 60);

#define  SHR(x,n) ((x & 0xFFFFFFFF) >> n)
#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))

#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))
#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))

#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))

#define F0(x,y,z) ((x & y) | (z & (x | y)))
#define F1(x,y,z) (z ^ (x & (y ^ z)))

#define R(t)                                    \
(                                               \
    W[t] = S1(W[t -  2]) + W[t -  7] +          \
           S0(W[t - 15]) + W[t - 16]            \
)

#define P(a,b,c,d,e,f,g,h,x,K)                  \
{                                               \
    temp1 = h + S3(e) + F1(e,f,g) + K + x;      \
    temp2 = S2(a) + F0(a,b,c);                  \
    d += temp1; h = temp1 + temp2;              \
}

    A = ctx->state[0];
    B = ctx->state[1];
    C = ctx->state[2];
    D = ctx->state[3];
    E = ctx->state[4];
    F = ctx->state[5];
    G = ctx->state[6];
    H = ctx->state[7];

    P(A, B, C, D, E, F, G, H, W[0], 0x428A2F98);
    P(H, A, B, C, D, E, F, G, W[1], 0x71374491);
    P(G, H, A, B, C, D, E, F, W[2], 0xB5C0FBCF);
    P(F, G, H, A, B, C, D, E, W[3], 0xE9B5DBA5);
    P(E, F, G, H, A, B, C, D, W[4], 0x3956C25B);
    P(D, E, F, G, H, A, B, C, W[5], 0x59F111F1);
    P(C, D, E, F, G, H, A, B, W[6], 0x923F82A4);
    P(B, C, D, E, F, G, H, A, W[7], 0xAB1C5ED5);
    P(A, B, C, D, E, F, G, H, W[8], 0xD807AA98);
    P(H, A, B, C, D, E, F, G, W[9], 0x12835B01);
    P(G, H, A, B, C, D, E, F, W[10], 0x243185BE);
    P(F, G, H, A, B, C, D, E, W[11], 0x550C7DC3);
    P(E, F, G, H, A, B, C, D, W[12], 0x72BE5D74);
    P(D, E, F, G, H, A, B, C, W[13], 0x80DEB1FE);
    P(C, D, E, F, G, H, A, B, W[14], 0x9BDC06A7);
    P(B, C, D, E, F, G, H, A, W[15], 0xC19BF174);
    P(A, B, C, D, E, F, G, H, R(16), 0xE49B69C1);
    P(H, A, B, C, D, E, F, G, R(17), 0xEFBE4786);
    P(G, H, A, B, C, D, E, F, R(18), 0x0FC19DC6);
    P(F, G, H, A, B, C, D, E, R(19), 0x240CA1CC);
    P(E, F, G, H, A, B, C, D, R(20), 0x2DE92C6F);
    P(D, E, F, G, H, A, B, C, R(21), 0x4A7484AA);
    P(C, D, E, F, G, H, A, B, R(22), 0x5CB0A9DC);
    P(B, C, D, E, F, G, H, A, R(23), 0x76F988DA);
    P(A, B, C, D, E, F, G, H, R(24), 0x983E5152);
    P(H, A, B, C, D, E, F, G, R(25), 0xA831C66D);
    P(G, H, A, B, C, D, E, F, R(26), 0xB00327C8);
    P(F, G, H, A, B, C, D, E, R(27), 0xBF597FC7);
    P(E, F, G, H, A, B, C, D, R(28), 0xC6E00BF3);
    P(D, E, F, G, H, A, B, C, R(29), 0xD5A79147);
    P(C, D, E, F, G, H, A, B, R(30), 0x06CA6351);
    P(B, C, D, E, F, G, H, A, R(31), 0x14292967);
    P(A, B, C, D, E, F, G, H, R(32), 0x27B70A85);
    P(H, A, B, C, D, E, F, G, R(33), 0x2E1B2138);
    P(G, H, A, B, C, D, E, F, R(34), 0x4D2C6DFC);
    P(F, G, H, A, B, C, D, E, R(35), 0x53380D13);
    P(E, F, G, H, A, B, C, D, R(36), 0x650A7354);
    P(D, E, F, G, H, A, B, C, R(37), 0x766A0ABB);
    P(C, D, E, F, G, H, A, B, R(38), 0x81C2C92E);
    P(B, C, D, E, F, G, H, A, R(39), 0x92722C85);
    P(A, B, C, D, E, F, G, H, R(40), 0xA2BFE8A1);
    P(H, A, B, C, D, E, F, G, R(41), 0xA81A664B);
    P(G, H, A, B, C, D, E, F, R(42), 0xC24B8B70);
    P(F, G, H, A, B, C, D, E, R(43), 0xC76C51A3);
    P(E, F, G, H, A, B, C, D, R(44), 0xD192E819);
    P(D, E, F, G, H, A, B, C, R(45), 0xD6990624);
    P(C, D, E, F, G, H, A, B, R(46), 0xF40E3585);
    P(B, C, D, E, F, G, H, A, R(47), 0x106AA070);
    P(A, B, C, D, E, F, G, H, R(48), 0x19A4C116);
    P(H, A, B, C, D, E, F, G, R(49), 0x1E376C08);
    P(G, H, A, B, C, D, E, F, R(50), 0x2748774C);
    P(F, G, H, A, B, C, D, E, R(51), 0x34B0BCB5);
    P(E, F, G, H, A, B, C, D, R(52), 0x391C0CB3);
    P(D, E, F, G, H, A, B, C, R(53), 0x4ED8AA4A);
    P(C, D, E, F, G, H, A, B, R(54), 0x5B9CCA4F);
    P(B, C, D, E, F, G, H, A, R(55), 0x682E6FF3);
    P(A, B, C, D, E, F, G, H, R(56), 0x748F82EE);
    P(H, A, B, C, D, E, F, G, R(57), 0x78A5636F);
    P(G, H, A, B, C, D, E, F, R(58), 0x84C87814);
    P(F, G, H, A, B, C, D, E, R(59), 0x8CC70208);
    P(E, F, G, H, A, B, C, D, R(60), 0x90BEFFFA);
    P(D, E, F, G, H, A, B, C, R(61), 0xA4506CEB);
    P(C, D, E, F, G, H, A, B, R(62), 0xBEF9A3F7);
    P(B, C, D, E, F, G, H, A, R(63), 0xC67178F2);

    ctx->state[0] += A;
    ctx->state[1] += B;
    ctx->state[2] += C;
    ctx->state[3] += D;
    ctx->state[4] += E;
    ctx->state[5] += F;
    ctx->state[6] += G;
    ctx->state[7] += H;
}

/* Write a 32-bit big-endian long value to a buffer. */
inline void WriteLong(unsigned char *buf, int value)
{
    buf[0] = (unsigned char)(value >> 24);
    buf[1] = (unsigned char)(value >> 16);
    buf[2] = (unsigned char)(value >> 8);
    buf[3] = (unsigned char)value;
}

inline void sha256_update(sha256_context * ctx, uint8 * input)
{
    int i;
    unsigned char buffer[64] = { 0 };
    for(i = 0; input[i]; i++)
        buffer[i] = input[i];
    char *p = (char *) buffer;
    for (i = 0; i != 64 && p[i]; i++);
    p[i] = 0x80;
    unsigned long tbl = i << 3;
    WriteLong(buffer + 56, (unsigned int)(tbl >> 32));
    WriteLong(buffer + 60, (unsigned int)tbl);
    sha256_process(ctx, buffer);
}

__kernel void sha256(__global sha256_input * input, __global sha256_output * output)
{
    int gid = get_global_id(0);
    uint key[16] = { 0 };
    int i;

    for (i = 0; input[gid].message[i]; i++)
        key[i] = input[gid].message[i];

    sha256_context ctx;
    sha256_starts(&ctx);

    sha256_update(&ctx, (uint8 *) key);

    printf("%u\n", ctx.state[0]);
    output[gid].digest[0] = ctx.state[0]; //Line 219 Producing the Error

/**
    output[gid].digest[0] = ctx.state[0];
    output[gid].digest[1] = ctx.state[1];
    output[gid].digest[2] = ctx.state[2];
    output[gid].digest[3] = ctx.state[3];
    output[gid].digest[4] = ctx.state[4];
    output[gid].digest[5] = ctx.state[5];
    output[gid].digest[6] = ctx.state[6];
    output[gid].digest[7] = ctx.state[7];
**/
}

如果您注释掉内核中的第 219 行,该错误就会消失。我尝试使用不同的方式在第 219 行写入值,但没有成功。

无论如何,谢谢谁有理智帮助我解决这个问题

java parallel-processing gpu opencl jocl
1个回答
0
投票

我自己解决了这个问题,我只是犯了一个基本错误,使用了错误的指针线

clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputBuffer));

其实应该是

clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(outputsMem));

© www.soinside.com 2019 - 2024. All rights reserved.