我使用 CUDA 的并行雅可比算法存在问题,该算法不收敛。我有一个可以工作的串行版本,不幸的是我无法在并行实现中发现任何明显的错误。请让我知道任何提示! 您可以尝试一下代码,这将表明该算法对于并行不收敛
constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;
void fillMatrix(float* M, const int size) {
for (int i = 0; i < size; ++i) {
M[i * size + i] = size + 1;
for (int j = 0; j<size; ++j) {
if (j != i) {
M[j + size * i] = 1;
}
}
}
}
void fillVector(float* M, const int size) {
for (int i = 0; i < size; ++i) {
M[i] = i;
}
}
void solveJacobi(const float* A, const float* b, float* xNew, float* xOld, const int size) {
for (int i = 0; i < size; ++i) {
xOld[i] = xNew[i];
xNew[i] = 0.0;
for (int j = 0; j < size; ++j) {
if (j != i) {
xNew[i] -= A[i * size + j] * xOld[j];
}
}
xNew[i] += b[i];
xNew[i] /= A[i * size + i];
}
}
void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
*norm = 0;
for (int i = 0; i < size; ++i) {
*norm += (xOld[i] - xNew[i])*(xOld[i] - xNew[i]);
}
*norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, float* xOld, const int size){
int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
if ((idx < size)){
xOld[idx] = xNew[idx];
xNew[idx] = 0;
for(int i=0; i < size; ++i){
if(i != idx){
xNew[idx] -= A[idx * size + i] * xOld[i];
}
}
xNew[idx] += b[idx];
xNew[idx] /= A[idx * size + idx];
}
}
int main() {
float* h_A = new float[N * N];
float* h_b = new float[N];
float* h_bStar = new float[N];
float* h_xOld = new float[N];
float* h_xNew = new float[N];
float* h_norm = new float{std::numeric_limits<float>::max()};
fillMatrix(h_A, N);
fillVector(h_b, N);
fillVector(h_xOld,N);
fillVector(h_xNew,N);
float* d_A;
float* d_b;
float* d_xOld;
float* d_xNew;
cudaMalloc(&d_A, N*N*sizeof(float));
cudaMalloc(&d_b, N*sizeof(float));
cudaMalloc(&d_xOld, N*sizeof(float));
cudaMalloc(&d_xNew, N*sizeof(float));
cudaMemcpy(d_A, h_A, N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice);
int iterationCounter{ 0 };
while (*h_norm > epsilon && iterationCounter < 10000) {
++iterationCounter;
//solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
cudaMemcpy(d_xOld, h_xOld, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_xNew, h_xNew, N*sizeof(float), cudaMemcpyHostToDevice);
// Launch kernel
solveJacobiCuda<<<(N+block_size-1)/block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
cudaCheckErrors("kernel launch failure");
cudaMemcpy(h_xNew, d_xNew, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_xOld, d_xOld, N*sizeof(float), cudaMemcpyDeviceToHost);
computeNorm(h_xNew, h_xOld, N, h_norm);
}
for(int i=0; i<N; ++i){
h_bStar[i] = 0;
for(int j=0; j<N; ++j){
h_bStar[i] += h_A[i*N + j]*h_xNew[j];
}
}
cout << "Jacobi finished" << endl;
cout << "N_iterations = " << iterationCounter << endl;
cout << "Final norm = " << *h_norm << endl;
cout << "b*[0] = " << h_bStar[0] << endl;
cout << "b*[N] = " << h_bStar[N-1] << endl;
return 0;
}
顺序
solveJacobi()
和并行 solveJacobiCuda()
都不正确,因为从 xNew
到 xOld
的副本错误地与矩阵向量积“循环融合”。
在顺序情况下,这实际上会导致更快的收敛,因为
xOld
过时的部分的使用基本上创建了一个比普通雅可比更强大的Gauss-Seidel类型算法。
在并行情况下,问题更加严重,因为内核融合现在会导致 xOld
上出现
竞争条件,这反过来又使结果不确定,同时甚至无法提供更快的高斯-赛德尔收敛。
虽然可以通过循环/内核裂变的正确性来解决问题,但这仍然会影响性能,因为将元素从
xNew
复制到 xOld
首先效率非常低。对于顺序和并行情况,在迭代之间交换指针 xNew
和 xOld
比复制内存中的实际值要高效得多。此外,可以通过将矩阵向量乘积的元素累加到局部变量中并仅在核函数末尾将结果写入到 xNew
来避免将 xNew
初始化为零。
正确实施:
#include <cstdio>
#include <iostream>
using std::cout;
using std::endl;
constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;
// proper CUDA runtime error checking
// from https://stackoverflow.com/a/14038590/10107454
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void fillMatrix(float* M, const int size) {
for (int i = 0; i < size; ++i) {
M[i * size + i] = size + 1;
for (int j = 0; j<size; ++j) {
if (j != i) {
M[j + size * i] = 1;
}
}
}
}
void fillVector(float* M, const int size) {
for (int i = 0; i < size; ++i) {
M[i] = i;
}
}
void solveJacobi(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
for (int i = 0; i < size; ++i) {
float accumulator = 0.0f;
for (int j = 0; j < size; ++j) {
if (j != i) {
accumulator -= A[i * size + j] * xOld[j];
}
}
accumulator += b[i];
xNew[i] = accumulator / A[i * size + i];
}
}
void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
*norm = 0;
for (int i = 0; i < size; ++i) {
const float diff = xOld[i] - xNew[i];
*norm += diff * diff;
}
*norm = sqrt(*norm);
}
__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, const float* xOld, const int size){
int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
if ((idx < size)){
float accumulator = 0;
for(int i=0; i < size; ++i){
if(i != idx){
accumulator -= A[idx * size + i] * xOld[i];
}
}
accumulator += b[idx];
xNew[idx] = accumulator / A[idx * size + idx];
}
}
int main() {
float* h_A = new float[N * N];
float* h_b = new float[N];
float* h_bStar = new float[N];
float* h_xOld = new float[N];
float* h_xNew = new float[N];
float* h_norm = new float{std::numeric_limits<float>::max()};
fillMatrix(h_A, N);
fillVector(h_b, N);
// fillVector(h_xNew,N); not needed
fillVector(h_xOld,N);
float* d_A;
float* d_b;
float* d_xOld;
float* d_xNew;
gpuErrchk(cudaMalloc(&d_A, N*N*sizeof(float)));
gpuErrchk(cudaMalloc(&d_b, N*sizeof(float)));
gpuErrchk(cudaMalloc(&d_xOld, N*sizeof(float)));
gpuErrchk(cudaMalloc(&d_xNew, N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_xOld, h_xOld, N*sizeof(float), cudaMemcpyHostToDevice));
int iterationCounter{ 0 };
while (*h_norm > epsilon && iterationCounter < 10000) {
++iterationCounter;
// solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
// Launch kernel
solveJacobiCuda<<<(N+block_size-1)/block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaMemcpy(h_xNew, d_xNew, N*sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_xOld, d_xOld, N*sizeof(float), cudaMemcpyDeviceToHost));
std::swap(d_xOld, d_xNew);
// TODO this needs to be done on the GPU as well to achieve good performance.
computeNorm(h_xNew, h_xOld, N, h_norm);
std::swap(h_xNew, h_xOld);
}
std::swap(h_xNew, h_xOld);
for(int i=0; i<N; ++i){
h_bStar[i] = 0;
for(int j=0; j<N; ++j){
h_bStar[i] += h_A[i*N + j]*h_xNew[j];
}
}
cout << "Jacobi finished" << endl;
cout << "N_iterations = " << iterationCounter << endl;
cout << "Final norm = " << *h_norm << endl;
cout << "b*[0] = " << h_bStar[0] << endl;
cout << "b*[N] = " << h_bStar[N-1] << endl;
return 0;
}
要并行化
computeNorm
作为下一步,我建议使用 cub::DeviceReduce::TransformReduce()
。根据矩阵和 GPU 的大小,我还会尝试进一步并行化 solveJacobiCuda
,例如每行使用一个完整的块,例如cooperative_groups::reduce
并行化此版本中连续的列上的循环。