注意:这可以100%运行,唯一的问题就是性能。我猜想与同步线程和内存访问有关。
根据结构中的第一个元素(Elem.c)进行排序
重音排序(GLSL):
layout(local_size_x = 512) in;
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer IndexList {
Elem es[];
};
uniform uint u_Level;
uniform uint u_LevelMask;
shared Elem shared_data[512];
void main() {
// Load shared data
shared_data[gl_LocalInvocationIndex] = es[gl_GlobalInvocationID.x];
barrier();
// Sort the shared data
for (uint j = u_Level >> 1; j > 0; j >>= 1) {
Elem result = ((shared_data[gl_LocalInvocationIndex & ~j].c <= shared_data[gl_LocalInvocationIndex | j].c)
== bool(u_LevelMask & gl_GlobalInvocationID.x)) ?
shared_data[gl_LocalInvocationIndex ^ j] : shared_data[gl_LocalInvocationIndex];
barrier();
shared_data[gl_LocalInvocationIndex] = result;
barrier();
}
// Store shared data
es[gl_GlobalInvocationID.x] = shared_data[gl_LocalInvocationIndex];
}
和移调
#define XSIZE 16
#define YSIZE 16
layout(local_size_x = XSIZE ,local_size_y = YSIZE, local_size_z = 1) in;
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer InputBUF {
Elem inputElem[];
};
layout(std430, binding = 5) buffer OutputBUF {
Elem outputElem[];
};
uniform uint u_Width;
uniform uint u_Height;
shared Elem shared_data[XSIZE * YSIZE];
void main() {
shared_data[gl_LocalInvocationIndex] = inputElem[gl_GlobalInvocationID.y * u_Width + gl_GlobalInvocationID.x];
barrier();
uvec2 XY = gl_GlobalInvocationID.yx - gl_LocalInvocationID.yx + gl_LocalInvocationID.xy;
outputElem[XY.y * u_Height + XY.x] = shared_data[gl_LocalInvocationID.x * XSIZE + gl_LocalInvocationID.y];
}
#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16
// The number of elements to sort is limited to an even power of 2
// At minimum 8,192 elements - BITONIC_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE
// At maximum 262,144 elements - BITONIC_BLOCK_SIZE * BITONIC_BLOCK_SIZE
const uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
const uint MATRIX_HEIGHT = N / BITONIC_BLOCK_SIZE;
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level <<= 1) {
Bitonic->SetUniform1ui("u_Level", level);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
}
for (uint level = BITONIC_BLOCK_SIZE << 1; level <= N; level <<= 1) {
// Transpose the data from buffer 1 into buffer 2
Transposer->BindSSBO(*IndexList, "InputBUF", 4);
Transposer->BindSSBO(*SecondaryIndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_WIDTH);
Transposer->SetUniform1ui("u_Height", MATRIX_HEIGHT);
Transposer->DispatchCompute(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, 1);
// Sort the transposed column data
Bitonic->BindSSBO(*SecondaryIndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", level / BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", (level & ~N)/BITONIC_BLOCK_SIZE);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
// Transpose the data from buffer 2 back into buffer 1
Transposer->BindSSBO(*SecondaryIndexList, "InputBUF", 4);
Transposer->BindSSBO(*IndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_HEIGHT);
Transposer->SetUniform1ui("u_Height", MATRIX_WIDTH);
Transposer->DispatchCompute(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, 1);
// Sort the row data
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
}
我有一个包含两个无符号整数的结构数组。我想使用Bitonic Sorting根据第一个uint对它们进行排序。我在DirectX中实现了此代码(我将其转换为...