计算着色器比预期慢

问题描述 投票:0回答:0

我有一个需要在 GPU 上运行的程序。我先在 CPU 上测量了函数 initEdgesX 的性能:处理一个 400³ 的数据数组大约需要 150 毫秒。现在我想在 GPU 上将它并行化,并且由于 GPU 的并行特性,我期望获得很高的加速比。

然而,当我在 GPU 上运行这段代码时,它比 CPU 版本慢了大约 2 倍。我使用的是 OpenGL 计算着色器。

这是我的代码:

CPU 端(主机代码):

ComputeShader computeShader("./AVISE_GPU/Shader/initEdgesX.cs");
computeShader.use();

Buffer scalarFieldBuffer(GL_SHADER_STORAGE_BUFFER, scalarFieldSizeTotal * 4, scalarField, GL_DYNAMIC_COPY);
scalarFieldBuffer.bindBufferBase(0);

Buffer heightmapBufferNeg(GL_SHADER_STORAGE_BUFFER, sizeEdgesX * sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapBufferNeg.bindBufferBase(1);

Buffer heightmapBufferPos(GL_SHADER_STORAGE_BUFFER, sizeEdgesX * sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapBufferPos.bindBufferBase(2);

Buffer heightmapIndexOffsetBufferNeg(GL_SHADER_STORAGE_BUFFER, sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapIndexOffsetBufferNeg.bindBufferBase(3);

Buffer heightmapIndexOffsetBufferPos(GL_SHADER_STORAGE_BUFFER, sizeY * sizeZ * 4, nullptr, GL_DYNAMIC_COPY);
heightmapIndexOffsetBufferPos.bindBufferBase(4);

unsigned int testCounter = 0;

Buffer atomicCounter(GL_ATOMIC_COUNTER_BUFFER, 4, &testCounter, GL_DYNAMIC_COPY);
atomicCounter.bindBufferBase(5);

computeShader.setUInt("sizeX", sizeX);
computeShader.setUInt("sizeY", sizeY);
computeShader.setUInt("sizeZ", sizeZ);
computeShader.setUInt("sizeEdgesX", sizeEdgesX);

glfwSetTime(0.0);

/*for (int x = 0; x < sizeX - 1; ++x) {
    computeShader.setUInt("currentX", x);
    glDispatchCompute(1, ceil((float)sizeY / 8), ceil((float)sizeZ / 8));
}*/
glDispatchCompute(1, ceil((float)sizeY / 8), ceil((float)sizeZ / 8));
glFinish();
std::cout << glfwGetTime() << std::endl;

着色器:

// Compute shader interface for initEdgesX: work-group size, uniforms and buffers.
# version 450 core

// Work-group dimensions: 1 x 8 x 8 invocations per group.
const int localSizeX = 1;
const int localSizeY = 8;
const int localSizeZ = 8;
layout(local_size_x = localSizeX, local_size_y = localSizeY, local_size_z = localSizeZ) in;

// Extents of the scalar field along each axis (set by the host).
uniform uint sizeX;
uniform uint sizeY;
uniform uint sizeZ;
// Not referenced by main() in this shader; only the host's disabled per-slice loop sets it.
uniform uint currentX;
// Number of heightmap slots reserved per (y, z) row.
uniform uint sizeEdgesX;

// Atomic counter at binding 5; main() increments it once per in-range invocation.
layout(binding = 5) uniform atomic_uint testCounter;

// Binding 0: read-only input densities, indexed through getScalarIndex().
layout(std430, binding = 0) readonly buffer scalarField
{
        float density [];
}
inputScalarField;

// Binding 1: x positions where the density goes from negative to non-negative.
layout(std430, binding = 1) buffer heightmapBuffer1
{
        uint height [] ;
} heightmapZYNeg;

// Binding 2: x positions where the density goes from non-negative to negative.
layout(std430, binding = 2) buffer heightmapBuffer2
{
        uint height [] ;
} heightmapZYPos;

// Binding 3: per-(y, z) count of entries written to heightmapZYNeg.
layout(std430, binding = 3) buffer heightmapIndexOffsetBuffer1
{
        uint indexOffset [] ;
} heightmapIndexOffsetZYNeg;

// Binding 4: per-(y, z) count of entries written to heightmapZYPos.
layout(std430, binding = 4) buffer heightmapIndexOffsetBuffer2
{
        uint indexOffset [] ;
} heightmapIndexOffsetZYPos;

// Flattens (x, y, z) into an index into the scalar field, with x varying
// fastest, then y, then z. Uses the sizeX and sizeY uniforms as row and
// slice strides.
uint getScalarIndex(uint x, uint y, uint z)
{
    uint rowIndex = z * sizeY + y;
    return rowIndex * sizeX + x;
}

// Flattens (heightIndex, widthIndex, depthIndex) into an index into a
// heightmap buffer laid out as [height][width][depth], with depthIndex
// varying fastest.
uint getHeightmapIndex(uint widthIndex, uint heightIndex, uint depthIndex, uint width, uint depth)
{
    uint cellIndex = heightIndex * width + widthIndex;
    return cellIndex * depth + depthIndex;
}

// One invocation scans a single (y, z) row of the scalar field along x and
// records every sign change of the density:
//   negative -> non-negative : x is appended to heightmapZYNeg
//   non-negative -> negative : x is appended to heightmapZYPos
// The number of entries appended for the row is stored in the matching
// heightmapIndexOffset buffer.
void main()
{
    uint currentYIndex = gl_LocalInvocationID.y + (gl_WorkGroupID.y * localSizeY);
    uint currentZIndex = gl_LocalInvocationID.z + (gl_WorkGroupID.z * localSizeZ);

    // Valid indices are [0, size); the dispatch is rounded up to a multiple of
    // the work-group size, so the surplus invocations must exit here.
    if (currentYIndex >= sizeY || currentZIndex >= sizeZ)
    {
        return;
    }

    uint heightmapIndexOffsetIndex = currentYIndex * sizeZ + currentZIndex;

    atomicCounterIncrement(testCounter);

    // Keep the per-row counters in locals and store them once at the end,
    // instead of a buffer read-modify-write on every sign change.
    uint negCount = 0u;
    uint posCount = 0u;

    // A row needs at least two samples to contain an edge; this also avoids
    // the unsigned underflow of sizeX - 1 when sizeX is 0.
    if (sizeX > 1u)
    {
        // Carry the previous sample across iterations so each density value
        // is loaded only once.
        float scalar1 = inputScalarField.density[getScalarIndex(0u, currentYIndex, currentZIndex)];

        for (uint x = 0u; x + 1u < sizeX; ++x)
        {
            float scalar2 = inputScalarField.density[getScalarIndex(x + 1u, currentYIndex, currentZIndex)];

            if (scalar1 < 0.0 && scalar2 >= 0.0)
            {
                uint arrayIndex = getHeightmapIndex(currentZIndex, currentYIndex, negCount, sizeZ, sizeEdgesX);
                heightmapZYNeg.height[arrayIndex] = x;
                ++negCount;
            }
            else if (scalar1 >= 0.0 && scalar2 < 0.0)
            {
                uint arrayIndex = getHeightmapIndex(currentZIndex, currentYIndex, posCount, sizeZ, sizeEdgesX);
                heightmapZYPos.height[arrayIndex] = x;
                ++posCount;
            }

            scalar1 = scalar2;
        }
    }

    heightmapIndexOffsetZYNeg.indexOffset[heightmapIndexOffsetIndex] = negCount;
    heightmapIndexOffsetZYPos.indexOffset[heightmapIndexOffsetIndex] = posCount;
}
c++ performance opengl glsl compute-shader
© www.soinside.com 2019 - 2024. All rights reserved.