使用 CUDA 的 Jacobi 算法

Question

我使用 CUDA 的并行雅可比算法存在问题，该算法不收敛。我有一个可以工作的串行版本，不幸的是我无法在并行实现中发现任何明显的错误。请让我知道任何提示！您可以尝试一下代码，这将表明该算法对于并行不收敛

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;


void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j<size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}

void solveJacobi(const float* A, const float* b, float* xNew, float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        xOld[i] = xNew[i];
        xNew[i] = 0.0;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                xNew[i] -= A[i * size + j] * xOld[j];
            }
        }
        xNew[i] += b[i];
        xNew[i] /= A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        *norm += (xOld[i] - xNew[i])*(xOld[i] - xNew[i]);
    }
    *norm = sqrt(*norm);
}

__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, float* xOld, const int size){
    int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
    if ((idx < size)){
        xOld[idx] = xNew[idx];
        xNew[idx] = 0;
        for(int i=0; i < size; ++i){
            if(i != idx){
                xNew[idx] -= A[idx * size + i] * xOld[i];
            }
        }
        xNew[idx] += b[idx];
        xNew[idx] /= A[idx * size + idx];
    }
}

int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    fillVector(h_xOld,N);
    fillVector(h_xNew,N);
    
    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;

    cudaMalloc(&d_A, N*N*sizeof(float));
    cudaMalloc(&d_b, N*sizeof(float));
    cudaMalloc(&d_xOld, N*sizeof(float));
    cudaMalloc(&d_xNew, N*sizeof(float));

    cudaMemcpy(d_A, h_A, N*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice);

    
    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;

        //solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        cudaMemcpy(d_xOld, h_xOld, N*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_xNew, h_xNew, N*sizeof(float), cudaMemcpyHostToDevice);
        // Launch kernel
        solveJacobiCuda<<<(N+block_size-1)/block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        cudaCheckErrors("kernel launch failure");
        cudaMemcpy(h_xNew, d_xNew, N*sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_xOld, d_xOld, N*sizeof(float), cudaMemcpyDeviceToHost);

        computeNorm(h_xNew, h_xOld, N, h_norm);
    }

    for(int i=0; i<N; ++i){
        h_bStar[i] = 0;
        for(int j=0; j<N; ++j){
            h_bStar[i] += h_A[i*N + j]*h_xNew[j];
        }
    }
        
    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N] = " << h_bStar[N-1] << endl;

    return 0;
}

Answer 1

问题

顺序

solveJacobi()

和并行

solveJacobiCuda()

都不正确，因为从

xNew

到

xOld

的副本错误地与矩阵向量积“循环融合”。

在顺序情况下，这实际上会导致更快的收敛，因为

xOld

过时的部分的使用基本上创建了一个比普通雅可比更强大的Gauss-Seidel类型算法。

在并行情况下，问题更加严重，因为内核融合现在会导致 xOld 上出现

 竞争条件

，这反过来又使结果不确定，同时甚至无法提供更快的高斯-赛德尔收敛。
这种竞争条件可以通过网格同步来避免，网格同步可以在单个内核中使用 CUDA 的协作组 API 来实现，或者通过将内核拆分为同一 CUDA 流上的两个内核来更轻松地实现。

解决方案

虽然可以通过循环/内核裂变的正确性来解决问题，但这仍然会影响性能，因为将元素从

xNew

复制到

xOld

首先效率非常低。对于顺序和并行情况，在迭代之间交换指针

xNew

和

xOld

比复制内存中的实际值要高效得多。此外，可以通过将矩阵向量乘积的元素累加到局部变量中并仅在核函数末尾将结果写入到

xNew

来避免将

xNew

初始化为零。

正确实施：

#include <cstdio>
#include <iostream>

using std::cout;
using std::endl;

constexpr int N{ 1024 };
constexpr int block_size = 256;
constexpr float epsilon = 1e-4;

// proper CUDA runtime error checking
// from https://stackoverflow.com/a/14038590/10107454
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

void fillMatrix(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i * size + i] = size + 1;
        for (int j = 0; j<size; ++j) {
            if (j != i) {
                M[j + size * i] = 1;
            }
        }
    }
}

void fillVector(float* M, const int size) {
    for (int i = 0; i < size; ++i) {
        M[i] = i;
    }
}

void solveJacobi(const float* A, const float* b, float* xNew, const float* xOld, const int size) {
    for (int i = 0; i < size; ++i) {
        float accumulator = 0.0f;
        for (int j = 0; j < size; ++j) {
            if (j != i) {
                accumulator -= A[i * size + j] * xOld[j];
            }
        }
        accumulator += b[i];
        xNew[i] = accumulator / A[i * size + i];
    }
}

void computeNorm(const float* xNew, const float* xOld, const int size, float* norm) {
    *norm = 0;
    for (int i = 0; i < size; ++i) {
        const float diff = xOld[i] - xNew[i];
        *norm += diff * diff;
    }
    *norm = sqrt(*norm);
}

__global__ void solveJacobiCuda(const float* A, const float* b, float* xNew, const float* xOld, const int size){
    int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
    if ((idx < size)){
        float accumulator = 0;
        for(int i=0; i < size; ++i){
            if(i != idx){
                accumulator -= A[idx * size + i] * xOld[i];
            }
        }
        accumulator += b[idx];
        xNew[idx] = accumulator / A[idx * size + idx];
    }
}

int main() {
    float* h_A = new float[N * N];
    float* h_b = new float[N];
    float* h_bStar = new float[N];
    float* h_xOld = new float[N];
    float* h_xNew = new float[N];
    float* h_norm = new float{std::numeric_limits<float>::max()};

    fillMatrix(h_A, N);
    fillVector(h_b, N);
    // fillVector(h_xNew,N); not needed
    fillVector(h_xOld,N);
    
    float* d_A;
    float* d_b;
    float* d_xOld;
    float* d_xNew;

    gpuErrchk(cudaMalloc(&d_A, N*N*sizeof(float)));
    gpuErrchk(cudaMalloc(&d_b, N*sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xOld, N*sizeof(float)));
    gpuErrchk(cudaMalloc(&d_xNew, N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_A, h_A, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_xOld, h_xOld, N*sizeof(float), cudaMemcpyHostToDevice));

    
    int iterationCounter{ 0 };
    while (*h_norm > epsilon && iterationCounter < 10000) {
        ++iterationCounter;

        // solveJacobi(h_A, h_b, h_xNew, h_xOld, N);
        
        // Launch kernel
        solveJacobiCuda<<<(N+block_size-1)/block_size, block_size>>>(d_A, d_b, d_xNew, d_xOld, N);
        gpuErrchk(cudaGetLastError());
        gpuErrchk(cudaMemcpy(h_xNew, d_xNew, N*sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_xOld, d_xOld, N*sizeof(float), cudaMemcpyDeviceToHost));
        std::swap(d_xOld, d_xNew);

        // TODO this needs to be done on the GPU as well to achieve good performance. 
        computeNorm(h_xNew, h_xOld, N, h_norm);
        std::swap(h_xNew, h_xOld);
    }
    std::swap(h_xNew, h_xOld);

    for(int i=0; i<N; ++i){
        h_bStar[i] = 0;
        for(int j=0; j<N; ++j){
            h_bStar[i] += h_A[i*N + j]*h_xNew[j];
        }
    }
        
    cout << "Jacobi finished" << endl;
    cout << "N_iterations = " << iterationCounter << endl;
    cout << "Final norm = " << *h_norm << endl;
    cout << "b*[0] = " << h_bStar[0] << endl;
    cout << "b*[N] = " << h_bStar[N-1] << endl;

    return 0;
}

要并行化

computeNorm

作为下一步，我建议使用

cub::DeviceReduce::TransformReduce()

。根据矩阵和 GPU 的大小，我还会尝试进一步并行化

solveJacobiCuda

，例如每行使用一个完整的块，例如

cooperative_groups::reduce

并行化此版本中连续的列上的循环。

使用 CUDA 的 Jacobi 算法

问题描述投票：0回答：1

1个回答

问题

解决方案

最新问题

使用 CUDA 的 Jacobi 算法

问题描述 投票：0回答：1

1个回答

问题

解决方案

最新问题

问题描述投票：0回答：1