我目前正在开发一个项目,需要在 C++ 中使用 MPI 并行化高斯消除算法。我已经实现了该算法,但在调用
gaussianMPI
函数的并行版本时遇到性能问题。令人惊讶的是,gaussian
的顺序版本比并行版本运行得更快。
我正在考虑的方法之一是从主线程调用
gaussianMPI
函数,而不是创建单独的 MPI 进程。但是,我不确定如何执行此操作,或者是否可以通过 MPI 实现此操作。
我希望在主线程中调用函数本身,但根据文档,这对于 MPI 来说似乎是不可能的。在我的例子中,每个线程都会调用函数
gaussianMPI
。
有人可以建议如何优化并行高斯消除算法或提出替代方法来提高其性能吗? 我需要将 MPI 并行性的范围限制在 for 循环内,但不知道该怎么做。
这是我的代码
#include <iostream>
#include <mpi.h>
#include <ctime>
#include <cstdlib>
using namespace std;
const int N = 3; // Dimension of the matrix
const int rowsPerProc = 1; // For each process, N/size rows
void gaussianMPI(double A[N][N + 1], int rank, int local_rows) {
int i, j, k;
double koef;
double aa[1][N + 1];
// Forward elimination
for (k = 0; k < N - 1; k++) {
MPI_Bcast(A, (N-1) * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Scatter(A, local_rows * (N + 1), MPI_DOUBLE, aa, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < local_rows; i++) {
if (rank * local_rows + i > k) {
koef = aa[i][k] / A[k][k];
for (j = k; j < N + 1; j++) {
aa[i][j] -= koef * A[k][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Gather(aa, local_rows * (N + 1), MPI_DOUBLE, A, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
// Back substitution
for (k = N - 1; k >= 0; k--) {
if (rank== 0) {
A[k][N] /= A[k][k];
A[k][k] = 1.0;
}
MPI_Bcast(A, N * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Scatter(A, local_rows * (N + 1), MPI_DOUBLE, aa, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < local_rows; i++) {
if (rank * local_rows + i < k) {
aa[i][N] -= A[rank * local_rows + i][k] * A[k][N];
aa[i][k] = 0.0;
}
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Gather(aa, local_rows * (N + 1), MPI_DOUBLE, A, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
// Output results on rank 0 process
if (rank == 0) {
for (int i = 0; i < N; ++i) {
cout << "x[" << i << "] = " << A[i][N] << endl;
}
}
}
void gaussian(double A[N][N + 1]) {
int i, j, k;
double koef;
// Forward elimination
for (k = 0; k < N - 1; k++) {
for (i = k + 1; i < N; i++) {
koef = A[i][k] / A[k][k];
for (j = k; j < N + 1; j++) {
A[i][j] -= koef * A[k][j];
}
}
}
// Back substitution
for (k = N - 1; k >= 0; k--) {
A[k][N] /= A[k][k];
A[k][k] = 1.0;
for (i = 0; i < k; i++) {
A[i][N] -= A[i][k] * A[k][N];
A[i][k] = 0.0;
}
}
for (int i = 0; i < N; ++i) {
cout << "x[" << i << "] = " << A[i][N] << endl;
}
}
int main() {
double A[N][N + 1] = { {1, 2, -1, 2}, {2, -3, 2, 2}, {3, 1, 1, 8} };
double B[N][N + 1] = { {1, 2, -1, 2}, {2, -3, 2, 2}, {3, 1, 1, 8} };
int rank, size;
double start_time, end_time;
MPI_Init(0, 0);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
int local_rows = N / size;
if (rank == 0) start_time = clock() / (double)CLOCKS_PER_SEC;
gaussianMPI(B, rank, local_rows);
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
end_time = clock() / (double)CLOCKS_PER_SEC;
cout << "Total time taken by the program is " << end_time - start_time << " seconds" << endl;
}
MPI_Finalize();
if (rank == 0) {
start_time = clock() / (double)CLOCKS_PER_SEC;
gaussian(A);
end_time = clock() / (double)CLOCKS_PER_SEC;
cout << "Total time taken by the program is " << end_time - start_time << " seconds" << endl;
}
return 0;
}
您的代码无法提供良好性能的原因有多种。
N=3
来说工作基本上没什么,你的沟通相当于数百万次操作。