我正在制作一个基于粒子的代码,其中每个粒子的邻居列表已经在初始条件下生成并且在整个模拟过程中保持不变,但是每个粒子的位置都会发生变化,因此经过一段时间后,"邻居"之间可能会彼此远离很远。这些邻居列表对应于粒子在初始位置下形成的"键"(bond),因此保持不变。
我将最大邻居数限制为
MAX_NEIGHBOR
并且使用 N
粒子将有一个数组 Neighbors[N*MAX_NEIGHBOR]
。对于第 i
个粒子,其邻居的索引将对应于 Neighbors[i*MAX_NEIGHBOR : (i+1)*MAX_NEIGHBOR]
(python 范围表示法)。
比如说,要为每个粒子计算某个物理量(
Quantity[N]
),基本上可以考虑这种方法。
int index = blockIdx.x; // each block for one particle, and the threads will take care its neighbors
if (index >= N) return; // note: >= — with "> N" the out-of-range index N slips through (off-by-one)
int neighborIndex = Neighbors[MAX_NEIGHBOR*index + threadIdx.x]; // coalesced
double quantity = Quantity[neighborIndex]; // cannot be coalesced
显然,对
Quantity
的访问没有合并。由于 Neighbors[N*MAX_NEIGHBOR]
在模拟过程中是恒定的,我希望我能以某种方式做得更好,但如何在技术上做到这一点。
我可以考虑生成另一个数组
NeighborsQuantity[N*MAX_NEIGHBOR]
,这样我就可以以合并的方式访问它们,但是在生成这个NeighborsQuantity
时需要读取另一个内核中的非合并内存。
在一些文档中有一些关于纹理内存的评论,但是通过 CUDA programming guide 似乎纹理内存不支持
double3
类型的变量。
以下是我使用的内核。在这里,我访问相邻粒子的
Pos
和 Vol
。也读了Vel
,但到目前为止没有任何作用。我把它放在那里只是为了准备下一个更复杂模型的实现。
// Bond-based peridynamic force kernel: one thread per particle.
// Launch with a 1D grid covering numParticles (blocks * threads >= numParticles).
// Writes DVel_Dt[i] = (sum of bond forces on particle i) / Rho[i].
__global__ void calcBondAccelD(double4 *DVel_Dt,    // output: per-particle dVel/dt
                               double4 *oldPos,     // inputs : oldPos[numParticles]
                               double4 *oldVel,     //          oldVel[numParticles]
                               double *Rho,         //          Rho[numParticles]
                               double *Vol,         //          Vol[numParticles]
                               double *BondStretch, // BondStretch[numParticles*MAX_NEIGHBOR]
                               double *Mu,          // Mu[numParticles*MAX_NEIGHBOR]
                               int *Type,           // Type[numParticles]
                               int *Neighbors,      // Neighbors[numParticles*MAX_NEIGHBOR]
                               int *numNeighbors,   // numNeighbors[numParticles]
                               int numParticles) {
  int sortedIndex = blockIdx.x * blockDim.x + threadIdx.x; // one particle per thread
  if (sortedIndex >= numParticles) return;
  // Bonds act only on "main" particles (types 0 and 1).
  if ((Type[sortedIndex] != 0) && (Type[sortedIndex] != 1)) return;

  double3 pos = make_double3(oldPos[sortedIndex]);
  double3 vel = make_double3(oldVel[sortedIndex]); // unused for now; kept for the next model

  // Loop-invariant micro-modulus hoisted out of the neighbor loop
  // (the original recomputed it on every iteration):
  // _c = 18*K / (pi * delta^4), the classic bond-based peridynamic constant.
  const double _delta = params.horizon;
  const double _c = 18. * params.modBulk / (CUDART_PI * _delta * _delta * _delta * _delta);

  // Hoist the row base and the neighbor count (the original re-read
  // numNeighbors[sortedIndex] in the loop condition on every iteration).
  const int base = MAX_NEIGHBOR * sortedIndex;
  const int nNeigh = numNeighbors[sortedIndex];

  double3 force = make_double3(0.0);
  for (int i = 0; i < nNeigh; i++) {
    int neighborIndex = Neighbors[base + i];                     // coalesced row read
    double3 relPos = make_double3(oldPos[neighborIndex]) - pos;  // gather: non-coalesced
    double dist = length(relPos);
    // NOTE(review): assumes dist > 0 for every bonded pair (bonds connect
    // distinct particles); a coincident pair would produce NaN here — confirm.
    force += _c * BondStretch[base + i] * Mu[base + i]
           * Vol[neighborIndex] * relPos / dist;                 // Vol gather: non-coalesced
  }
  // dV/dt = force density / mass density; written back to the same slot.
  DVel_Dt[sortedIndex] = make_double4(force / Rho[sortedIndex], 0.0);
}
我承认这是人们能想到的最幼稚的实现。虽然这个内核在 3090 Ti 上对于我当前具有 100,000~1,000,000 个粒子的应用程序来说足够实用,但我希望它更快。
我正在尝试使用共享内存等的一些变体,但最天真的一个仍然在我的变体中表现最好。
我的一个变体是这样的,我试图在其中加入并行归约。虽然内存访问仍未合并,但速度略有加快。不过,鉴于邻居列表是"固定"的,我认为在内存访问方面应该还能做得更好。
对改进这个内核有什么建议吗?
// Bond-based peridynamic force kernel, block-per-particle variant:
// one particle per block, one neighbor slot per thread, block-wide reduction.
// Launch requirements: blockDim.x == 256 (power of two), MAX_NEIGHBOR <= 256,
// gridDim.x >= numParticles. Shared memory: 256 * sizeof(double3) (static).
//
// Fixes over the previous version:
//  * The tree reduction now runs all the way down to stride 1. The original
//    stopped at stride 32 and then read only _s[0], silently discarding the
//    partial sums held by threads 1..31 — a wrong result, not just slow.
//  * __syncthreads() is outside the divergent branch. The original placed it
//    inside "if (threadIdx.x < 128) { ... }" etc., which is undefined behavior.
//  * BondStretch/Mu indexing uses MAX_NEIGHBOR instead of a hardcoded 256.
//  * Neighbors/oldPos/Vol are gathered only for threadIdx.x < numNeighbors[];
//    the original dereferenced stale neighbor slots for all 256 threads.
__global__ void calcBondAccelD2(double4 *DVel_Dt,    // output: per-particle dVel/dt
                                double4 *oldPos,     // inputs : oldPos[numParticles]
                                double4 *oldVel,     //          oldVel[numParticles]
                                double *Rho,         //          Rho[numParticles]
                                double *Vol,         //          Vol[numParticles]
                                double *BondStretch, // BondStretch[numParticles*MAX_NEIGHBOR]
                                double *Mu,          // Mu[numParticles*MAX_NEIGHBOR]
                                int *Type,           // Type[numParticles]
                                int *Neighbors,      // Neighbors[numParticles*MAX_NEIGHBOR]
                                int *numNeighbors,   // numNeighbors[numParticles]
                                int numParticles) {
  __shared__ double3 sForce[256]; // per-thread partial forces for the reduction

  int sortedIndex = blockIdx.x; // one particle per block
  // Both early returns are uniform across the block (they depend only on
  // blockIdx), so no thread reaches the barriers while others have exited.
  if (sortedIndex >= numParticles) return;
  if ((Type[sortedIndex] != 0) && (Type[sortedIndex] != 1)) return; // bonds: main particles only

  double3 pos = make_double3(oldPos[sortedIndex]);
  int nNeigh = numNeighbors[sortedIndex];

  double3 force = make_double3(0.0);
  if (threadIdx.x < nNeigh) {
    const int slot = MAX_NEIGHBOR * sortedIndex + threadIdx.x;
    int neighborIndex = Neighbors[slot];                        // coalesced
    double3 relPos = make_double3(oldPos[neighborIndex]) - pos; // gather: non-coalesced
    double dist = length(relPos);
    // _c = 18*K / (pi * delta^4), bond-based peridynamic micro-modulus.
    double _delta = params.horizon;
    double _c = 18. * params.modBulk / (CUDART_PI * _delta * _delta * _delta * _delta);
    force = _c * BondStretch[slot] * Mu[slot]                   // coalesced row reads
          * Vol[neighborIndex] * relPos / dist;                 // Vol gather: non-coalesced
  }
  sForce[threadIdx.x] = force; // inactive slots contribute zero
  __syncthreads();

  // Complete tree reduction over all 256 slots. The barrier is reached by
  // every thread on every pass; no guard on nNeigh is needed because
  // inactive slots were initialized to zero above.
  for (int stride = 128; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride)
      sForce[threadIdx.x] += sForce[threadIdx.x + stride];
    __syncthreads();
  }

  if (threadIdx.x == 0)
    // dV/dt = total force density / mass density.
    DVel_Dt[sortedIndex] = make_double4(sForce[0] / Rho[sortedIndex], 0.0);
}