我建立一个粒子系统和面临困难与计算patricle位置CUDA内核性能。
__global__
void updateParticle(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
int particleLifetime = (time - device_particleCreatedTime[threadId]) / 1000;
double distanceX = 0.5 * device_particleAcceleration[threadId * 2 + 0] * (particleLifetime * particleLifetime) / 5000.0;
double distanceY = 0.5 * device_particleAcceleration[threadId * 2 + 1] * (particleLifetime * particleLifetime) / 5000.0;
device_particleCoordinates[threadId * 2 + 0] = device_particleStartCoordinates[threadId * 2 + 0] + distanceX;
device_particleCoordinates[threadId * 2 + 1] = device_particleStartCoordinates[threadId * 2 + 1] + distanceY;
}
}
内核被称为是这样的:
int blockSize = 32;
int nBlocks = maxParticleCount / 32 + 1;
updateParticle << <nBlocks, blockSize >> >(particles.size(), time, gravity, device_particleCoordinates,
device_particleStartCoordinates, device_particleAcceleration, device_particleCreatedTime);
glDrawArrays(GL_POINTS, 0, particles.size());
HANDLE_ERROR(cudaMemcpy(particleCoordinatesFlat.data(), device_particleCoordinates, particles.size() * 2 * sizeof(GLfloat), cudaMemcpyDeviceToHost));
device_particleCoordinates被链接到一个OpenGL缓冲器,使得坐标直接修改。
性能不是很好,我认为这是由于内核调用。是否存在可能影响性能的任何明显的错误?
正如在评论中已经建议,这个内核可能不是你认为它是性能瓶颈。至少,你没有提供数据支持这一想法。然而一些建议仍可以应该提高这个内核的运行。
GLfloat
相当于float
。在这种情况下,特别是由于这个内核(device_particleCoordinates
)的主输出是float
数量时,其令人怀疑了在double
精度完成的任何中间计算所提供多少好处。你可以尝试将所有操作float
算术。下面是一个例子修改过的内核(未经测试),证明这些想法:
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
float2 dpA = *(reinterpret_cast<float2 *>(device_particleAcceleration)+threadId);
float spl2 = 0.0001f * particleLifetime*particleLifetime;
float distanceX = dpA.x * spl2;
float distanceY = dpA.y * spl2;
float2 dpC = *(reinterpret_cast<float2 *>(device_particleStartCoordinates)+threadId);
dpC.x += distanceX;
dpC.y += distanceY;
*(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
}
}
根据我的测试,这些变化会降低内核执行时间从大约69us(updateParticle
)约54us(updateParticle1
)为〜百万粒子:
$ cat t388.cu
#include <GL/gl.h>
const int ppt = 4;
__global__
void updateParticle(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
int particleLifetime = (time - device_particleCreatedTime[threadId]) / 1000;
double distanceX = 0.5 * device_particleAcceleration[threadId * 2 + 0] * (particleLifetime * particleLifetime) / 5000.0;
double distanceY = 0.5 * device_particleAcceleration[threadId * 2 + 1] * (particleLifetime * particleLifetime) / 5000.0;
device_particleCoordinates[threadId * 2 + 0] = device_particleStartCoordinates[threadId * 2 + 0] + distanceX;
device_particleCoordinates[threadId * 2 + 1] = device_particleStartCoordinates[threadId * 2 + 1] + distanceY;
}
}
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
float2 dpA = *(reinterpret_cast<float2 *>(device_particleAcceleration)+threadId);
float spl2 = 0.0001f * particleLifetime*particleLifetime;
float distanceX = dpA.x * spl2;
float distanceY = dpA.y * spl2;
float2 dpC = *(reinterpret_cast<float2 *>(device_particleStartCoordinates)+threadId);
dpC.x += distanceX;
dpC.y += distanceY;
*(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
}
}
__global__
void updateParticle2(const int num_particles, const double time, const double gravity,
GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
if (threadId < num_particles)
{
float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
float2 dpA = *(reinterpret_cast<const float2 *>(device_particleAcceleration)+threadId);
float spl2 = 0.0001f * particleLifetime*particleLifetime;
float distanceX = dpA.x * spl2;
float distanceY = dpA.y * spl2;
float2 dpC = *(reinterpret_cast<const float2 *>(device_particleStartCoordinates)+threadId);
dpC.x += distanceX;
dpC.y += distanceY;
*(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
}
}
__global__
void updateParticle3(const int num_particles, const double time, const double gravity,
GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = 0; i < ppt; i++)
{
float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
float2 dpA = *(reinterpret_cast<const float2 *>(device_particleAcceleration)+threadId);
float spl2 = 0.0001f * particleLifetime*particleLifetime;
float distanceX = dpA.x * spl2;
float distanceY = dpA.y * spl2;
float2 dpC = *(reinterpret_cast<const float2 *>(device_particleStartCoordinates)+threadId);
dpC.x += distanceX;
dpC.y += distanceY;
*(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
threadId += gridDim.x*blockDim.x;
}
}
int main(){
int num_p = 1048576;
float *dpC, *dpSC, *dpA;
int *dpCT;
cudaMalloc(&dpC, num_p*2*sizeof(dpC[0]));
cudaMalloc(&dpSC, num_p*2*sizeof(dpSC[0]));
cudaMalloc(&dpA, num_p*2*sizeof(dpA[0]));
cudaMalloc(&dpCT, num_p*sizeof(dpCT[0]));
updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_60 -o t388 t388.cu
$ nvprof ./t388
==32419== NVPROF is profiling process 32419, command: ./t388
==32419== Profiling application: ./t388
==32419== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 30.11% 141.41us 2 70.703us 68.991us 72.416us updateParticle(int, double, double, float*, float*, float*, int*)
23.53% 110.50us 2 55.247us 54.816us 55.679us updateParticle2(int, double, double, float*, float const *, float const *, int const *)
23.31% 109.47us 2 54.735us 54.335us 55.136us updateParticle3(int, double, double, float*, float const *, float const *, int const *)
23.06% 108.29us 2 54.144us 53.952us 54.336us updateParticle1(int, double, double, float*, float*, float*, int*)
API calls: 97.56% 291.86ms 4 72.966ms 273.40us 291.01ms cudaMalloc
1.53% 4.5808ms 384 11.929us 313ns 520.98us cuDeviceGetAttribute
0.49% 1.4735ms 4 368.37us 226.07us 580.91us cuDeviceTotalMem
0.22% 670.21us 4 167.55us 89.800us 369.11us cuDeviceGetName
0.13% 392.94us 1 392.94us 392.94us 392.94us cudaDeviceSynchronize
0.05% 150.44us 8 18.804us 10.502us 67.034us cudaLaunchKernel
0.01% 21.862us 4 5.4650us 4.0570us 7.0660us cuDeviceGetPCIBusId
0.00% 10.010us 8 1.2510us 512ns 2.9310us cuDeviceGet
0.00% 6.6950us 3 2.2310us 435ns 3.8940us cuDeviceGetCount
0.00% 2.3460us 4 586ns 486ns 727ns cuDeviceGetUuid
$
装饰用const ... __restrict__
(updateParticle2
)指针似乎没有提供此测试情况下,任何额外的好处。计算每线程4个颗粒(updateParticle3
),而不是1也似乎没有对处理时间的显著影响。
P100特斯拉,CUDA 10.0,7.5 CentOS的
除了罗伯特Crovella的建议,也可以考虑:
__restrict__
- 假设他们指向的内存不同的区域。该__restrict__
关键字,通过支持NVCC,允许编译器做出各种非常有用的优化否则它不能。欲了解更多关于为什么__restrict__
是(在C ++一般)看到有用:
What does the restrict keyword mean in C++?