我在 GPU 上实例化数以万计的网格 - 每个网格都需要有一个独特的变换。在 CPU 上计算数以万计的矩阵并通过计算缓冲区将它们传递到 GPU 更快,还是在 GPU 本身上计算每个唯一的 TRS 矩阵更快(例如,使用计算着色器)?
这两种方法我都尝试实现过,但目前还无法在 HLSL 中正确计算 TRS 矩阵。在继续尝试之前,我只是想确认:既然实例数量如此之多,在 GPU 上进行计算是否确实是一个不错的选择。
是的,如果您有大量实例,那么将计算卸载(offload)到 GPU 绝对是值得的。请注意,如果您还想执行剔除,那么剔除也需要在 GPU 上完成(在实例化的情况下,这并不是一个复杂的操作)。
在速度方面,存在一个阈值:超过该阈值后,在 GPU 上执行才会比在 CPU 上更快(因为您仍需要上传数据并执行一次计算通道),而这个阈值会因硬件架构而异。
这是我用来将 SRT 位姿(缩放、旋转、平移)转换为矩阵的计算着色器代码(在这里我还对代码做了适度优化,以避免多次矩阵乘法,尽管矩阵乘法在 GPU 上运行得非常快):
// Pi, obtained as acos(-1). This is a macro, so the acos(-1.0f) expression is
// substituted textually at every place PI is used.
#define PI acos(-1.0f)
// Per-element pose stored as separate translation, scale and rotation values.
// srt_to_matrix() turns one of these into a float4x4.
struct PoseSRT
{
// Translation; srt_to_matrix() copies it into the fourth row of the matrix.
float3 position;
// Per-axis scale; srt_to_matrix() multiplies rows 1..3 of the rotation part by x, y, z.
float3 scale;
// Rotation angles: x is passed as pitch, y as yaw and z as roll.
// quat_yawpitchroll() multiplies each value by 0.5 * PI * 2 to get the half-angle,
// so the values appear to be expressed in turns (1.0 = one full revolution).
float3 rotation;
};
// Builds a rotation quaternion (x, y, z, w) from yaw, pitch and roll.
//
// Each input is multiplied by 0.5 * PI * 2 to obtain the half-angle in
// radians, i.e. an input of 1.0 corresponds to one full revolution.
//
// yaw   - rotation amount used for the yaw terms
// pitch - rotation amount used for the pitch terms
// roll  - rotation amount used for the roll terms
// returns the quaternion packed as float4(x, y, z, w)
float4 quat_yawpitchroll(float yaw, float pitch, float roll)
{
    // Pack the three angles so sine/cosine are evaluated component-wise:
    // .x = pitch, .y = yaw, .z = roll.
    const float3 halfAngles = float3(pitch, yaw, roll) * 0.5f * PI * 2.0f;
    const float3 s = sin(halfAngles);
    const float3 c = cos(halfAngles);

    return float4(
        (c.y * s.x * c.z) + (s.y * c.x * s.z),
        (s.y * c.x * c.z) - (c.y * s.x * s.z),
        (c.y * c.x * s.z) - (s.y * s.x * c.z),
        (c.y * c.x * c.z) + (s.y * s.x * s.z));
}
// Converts a scale/rotation/translation pose into a single float4x4.
//
// The rotation is first turned into a quaternion (rotation.y as yaw,
// rotation.x as pitch, rotation.z as roll). The 3x3 rotation part is then
// written directly from the quaternion products, each of its rows is
// multiplied by the matching scale component, and the position is placed in
// the fourth row. The fourth column is (0, 0, 0, 1). Building the matrix this
// way avoids multiplying separate scale, rotation and translation matrices.
//
// pose - the pose to convert
// returns the combined transform matrix
float4x4 srt_to_matrix(PoseSRT pose)
{
    const float4 q = quat_yawpitchroll(pose.rotation.y, pose.rotation.x, pose.rotation.z);

    // Pairwise quaternion component products shared by several matrix entries.
    const float xx = q.x * q.x;
    const float yy = q.y * q.y;
    const float zz = q.z * q.z;
    const float xy = q.x * q.y;
    const float zw = q.z * q.w;
    const float zx = q.z * q.x;
    const float yw = q.y * q.w;
    const float yz = q.y * q.z;
    const float xw = q.x * q.w;

    // Rotation rows, each scaled by its own axis factor.
    const float3 row0 = float3(
        1.0f - (2.0f * (yy + zz)),
        2.0f * (xy + zw),
        2.0f * (zx - yw)) * pose.scale.x;

    const float3 row1 = float3(
        2.0f * (xy - zw),
        1.0f - (2.0f * (zz + xx)),
        2.0f * (yz + xw)) * pose.scale.y;

    const float3 row2 = float3(
        2.0f * (zx + yw),
        2.0f * (yz - xw),
        1.0f - (2.0f * (yy + xx))) * pose.scale.z;

    // Arguments fill the matrix row by row; the last row carries the translation.
    return float4x4(
        float4(row0, 0.0f),
        float4(row1, 0.0f),
        float4(row2, 0.0f),
        float4(pose.position, 1.0f));
}
// Read-only input: one PoseSRT per element, bound to register t0.
StructuredBuffer<PoseSRT> InputBuffer : register(t0);
// Read/write output: CS stores one float4x4 per element at the same index it
// read from InputBuffer. Bound to register u0.
RWStructuredBuffer<float4x4> OutputBuffer : register(u0);
// Dispatch settings, bound to register b0.
cbuffer cbSettings : register(b0)
{
// Number of elements to process; CS returns early for thread indices >= this value.
uint elementCount;
}
// Compute shader entry point: converts one pose into one matrix per thread.
//
// Thread groups are 128 x 1 x 1. The thread's x dispatch index selects the
// element; threads whose index is at or beyond elementCount do nothing, so
// they never read or write the buffers.
[numthreads(128,1,1)]
void CS(uint3 tid : SV_DispatchThreadID)
{
    const uint index = tid.x;

    if (index < elementCount)
    {
        OutputBuffer[index] = srt_to_matrix(InputBuffer[index]);
    }
}