在带有 OpenMP 和 Eigen 的嵌套 for 循环中使用 Reduction

问题描述 投票:0回答:1

我正在尝试并行化以下具有 while 循环和嵌套 for 循环的函数(迭代求解器)。代码看起来像:

static const int nx = 128;
static const int ny = 128;
static const int nz = 128;
//Initialize
Eigen::Tensor<std::complex<double>, 3> xTerm(nx,ny,nz);  
Eigen::Tensor<std::complex<double>, 3> yTerm(nx,ny,nz);  
Eigen::Tensor<std::complex<double>, 3> zTerm(nx,ny,nz);  

Eigen::Tensor<std::complex<double>, 3> RHS(nx,ny,nz); 
 
double UpdatedSol_max, OldSol_max;
double it_error;

    // Begin counter for the number of iterations it takes to calculate phi
    int count = 0;
    // Begin while loop
    do{         
        // Calculate some thing
        
        
        #pragma omp parallel for collapse(3)
        for(int i = 0; i< nx; i++){
            for(int j = 0; j< ny; j++){
                for(int k = 0; k< nz; k++){ 
                    RHS(k,i,j) = Source(k,i,j) - xTerm(k,i,j) - yTerm(k,i,j) - zTerm(k,i,j);
                }
            }
        }

        
        // Some calculations
        // Increase iteration count by 1
        count = count + 1; 
        
        // Calculate error
        it_error = fabs((UpdatedSol_max-OldSol_max)/UpdatedSol_max);    
        
     }while( it_error > err_max && count  <= max_iter && UpdatedSol_max > err_max); 

     return count;

所以,一个简单的

#pragma omp parallel for collapse(3)
实际上会使并行代码变慢,所以我尝试使用 reduction,因为在嵌套的 for 循环中有一些添加,这是我的尝试:

#pragma omp parallel for reduction(+: Terms)
        for(int i = 0; i< nx; i++){
            for(int j = 0; j< ny; j++){
                for(int k = 0; k< nz; k++){ 
                    Terms(k,i,j) += xTerm(k,i,j) + yTerm(k,i,j) + zTerm(k,i,j);
                    RHS(k,i,j) = Source(k,i,j) - Terms(k,i,j);

                }
            }
        }

但是这会返回错误:

 error: user defined reduction not found for ‘Terms’

我尝试通过添加要减少的元素数来修复:

 #pragma omp parallel for reduction(+: Terms[:nx])
但我仍然收到错误:

error: ‘Terms’ does not have pointer or array type

我有点疑惑,我不能在这里使用 Eigen 数组结构吗?我怎样才能更好地重写这个?谢谢

编辑: 我包括一个可复制的小例子,供人们自己编译和测试。这是一个迭代求解器,所以一些东西被初始化为零以使其更容易编译:

static const int nx = 128;
static const int ny = 128;
static const int nz = 128;

using namespace Eigen;
int main(){
// Driver for the reproducible example: builds zero-initialized inputs,
// runs the iterative solver once, and reports the wall-clock time.
double err_max = 1.e-8;      // relative-error tolerance for the solver
int phi_iter_max = 500;      // iteration cap
int Soliter = 0;             // number of iterations actually performed

// Let Eigen's own internal parallelism use all available OpenMP threads.
Eigen::initParallel();
Eigen::setNbThreads(omp_get_max_threads());

    // All fields are nx*ny*nz. They are zeroed (or set to a constant) only
    // so the example compiles and runs; in the real code they hold data.
    Eigen::Tensor<std::complex<double>, 3> invnk(nx,ny,nz);
    invnk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndxk(nx,ny,nz);
    dndxk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndyk(nx,ny,nz);
    dndyk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndzk(nx,ny,nz);
    dndzk.setZero();
    Eigen::Tensor<std::complex<double>, 3> Sol(nx,ny,nz);
    // BUG FIX: the original called `dphikdt.setConstant(1.0f)` on a variable
    // that is never declared, so the example did not compile. The tensor
    // declared above is `Sol`.
    Sol.setConstant(1.0);
    Eigen::Tensor<std::complex<double>, 3> Source(nx,ny,nz);
    Source.setZero();
    Eigen::Tensor<double, 3> KX(nx,ny,nz);
    KX.setZero();
    Eigen::Tensor<double, 3> KY(nx,ny,nz);
    KY.setZero();
    Eigen::Tensor<double, 3> KZ(nx,ny,nz);
    KZ.setZero();
    Eigen::Tensor<double, 3> factor(nx,ny,nz);
    factor.setZero();

    // Time the solver with OpenMP's wall clock (clock() would report CPU
    // time summed over threads, which is misleading for parallel code).
    double start_time1, run_time1;
    start_time1 = omp_get_wtime();

    Soliter = Solver3D(invnk, dndxk, dndyk, dndzk, Sol, Source, KX, KY, KZ, factor, err_max,  phi_iter_max);

    run_time1 = omp_get_wtime() - start_time1;

    cout << "Parallel Time in SEC: " <<  run_time1 << "s\n";

    return 0;
}

迭代求解器函数如下所示:

// Iterative solver: each pass rebuilds the right-hand side from the gradient
// terms, rescales it by the per-mode `factor`, and stops once the relative
// change in the solution's max-norm drops below err_max (or the iteration
// cap is hit).
//
// Parameters (all tensors nx*ny*nz):
//   invnk, dndxk, dndyk, dndzk : input spectral fields (unused in this
//                                stripped-down example; kept for the real code)
//   phik     : solution tensor, overwritten in place every iteration
//   Source   : source term of the equation
//   kx, ky, kz : wavenumber grids (unused in this stripped-down example)
//   factor   : per-mode scaling applied to the RHS
//   err_max  : relative-error tolerance
//   max_iter : iteration cap (note `count <= max_iter` permits max_iter+1 passes)
// Returns: the number of iterations performed.
int Solver3D(Eigen::Tensor<std::complex<double>, 3>& invnk, Eigen::Tensor<std::complex<double>, 3>& dndxk, Eigen::Tensor<std::complex<double>, 3>& dndyk, Eigen::Tensor<std::complex<double>, 3>& dndzk,Eigen::Tensor<std::complex<double>, 3>& phik, Eigen::Tensor<std::complex<double>, 3>& Source,Eigen::Tensor<double, 3>& kx, Eigen::Tensor<double, 3>& ky, Eigen::Tensor<double, 3>& kz, Eigen::Tensor<double, 3>&  factor, double err_max, int max_iter){

    // Work buffers for the three gradient contributions and the RHS.
    // NOTE(review): the tensors are allocated (nx,ny,nz) but indexed (k,i,j)
    // with k < nz; this only lines up because nx == ny == nz here — confirm
    // the intended index order before using unequal dimensions.
    Eigen::Tensor<std::complex<double>, 3> xTerm(nx,ny,nz);
    xTerm.setZero();
    Eigen::Tensor<std::complex<double>, 3> yTerm(nx,ny,nz);
    yTerm.setZero();
    Eigen::Tensor<std::complex<double>, 3> zTerm(nx,ny,nz);
    zTerm.setZero();

    Eigen::Tensor<std::complex<double>, 3> RHS(nx,ny,nz);
    RHS.setZero();

    double UpdatedSol_max , OldSol_max;
    double it_error;

    // Counts how many iterations it takes the solution to converge.
    int count = 0;

    do{

        // Rebuild the RHS. Eigen tensors are column-major, so the FIRST
        // index varies fastest in memory: with (k,i,j) indexing the loops
        // must run j outermost, i middle, k innermost for the inner loop to
        // walk contiguous memory (the original i,j,k order strided badly).
        #pragma omp parallel for collapse(3)
        for(int j = 0; j< ny; j++){
            for(int i = 0; i< nx; i++){
                for(int k = 0; k< nz; k++){
                    RHS(k,i,j) = Source(k,i,j) - xTerm(k,i,j) - yTerm(k,i,j) - zTerm(k,i,j);
                }
            }
        }

        // Max-norm of the previous solution (0-dim tensor -> scalar).
        // BUG FIX: the original referenced a non-existent `Sol`; the
        // parameter is named `phik`, so the function did not compile.
        Eigen::Tensor<double, 0> AbsMaxAsTensor = phik.abs().maximum();
        OldSol_max = AbsMaxAsTensor(0);

        // Update the solution in place.
        phik = RHS * factor;

        // Max-norm of the updated solution.
        Eigen::Tensor<double, 0> AbsMaxAsTensor1 = phik.abs().maximum();
        UpdatedSol_max = AbsMaxAsTensor1(0);

        count = count + 1;

        // Relative change of the max-norm. Guard against a zero updated
        // norm (as in the all-zero example), which previously produced a
        // NaN via 0/0; either way the loop terminates, but 0.0 is honest.
        it_error = (UpdatedSol_max != 0.0)
                     ? fabs((UpdatedSol_max-OldSol_max)/UpdatedSol_max)
                     : 0.0;

     }while( it_error > err_max && count  <= max_iter && UpdatedSol_max > err_max);
     return count;

}

我运行以下命令来获取“并行”时间:

g++ ParallelTest.cpp -lfftw3 -lfftw3_threads -fopenmp -lm -Ofast -o test.out
（原命令里的 `-o /test.out` 会尝试写入根目录，应为 `test.out`；另外 `-Ofast` 已经包含并超越 `-O3`，无需同时给出。）

下面的“串行”时间使用 clock_t 计时（注意 clock() 统计的是 CPU 时间而非挂钟时间，多线程下会高估耗时）：

g++ ParallelTest.cpp -lfftw3 -lfftw3_threads -lm -O3 -Ofast -o /test.out

Edit 1: 由于似乎很难重现我的示例，我在主代码中加入了如下循环，给出不同线程数下的运行时间：

for (int nThreads =1; nThreads <= 16; nThreads++){
        
        double start_time1, run_time1;
        start_time1 = omp_get_wtime();
        
        omp_set_num_threads(nThreads); 
         Soliter = Solver3D(invnk, dndxk, dndyk, dndzk, Sol, Source, KX, KY, KZ, factor, err_max,  phi_iter_max);
      
        run_time1 = omp_get_wtime() - start_time1;
       
        cout << "Threads: " << nThreads << "   Parallel Time in SEC: " <<  run_time1 << "s\n";

返回:

Threads: 1   Parallel Time in SEC: 0.444323s
Threads: 2   Parallel Time in SEC: 0.292054s
Threads: 3   Parallel Time in SEC: 0.261521s
Threads: 4   Parallel Time in SEC: 0.260612s
Threads: 5   Parallel Time in SEC: 0.25353s
Threads: 6   Parallel Time in SEC: 0.258204s
Threads: 7   Parallel Time in SEC: 0.275174s
Threads: 8   Parallel Time in SEC: 0.280042s
Threads: 9   Parallel Time in SEC: 0.274214s
Threads: 10   Parallel Time in SEC: 0.271887s
Threads: 11   Parallel Time in SEC: 0.274815s
Threads: 12   Parallel Time in SEC: 0.273562s
Threads: 13   Parallel Time in SEC: 0.283543s
Threads: 14   Parallel Time in SEC: 0.302721s
Threads: 15   Parallel Time in SEC: 0.317697s
Threads: 16   Parallel Time in SEC: 0.290503s

我的机器有 6 个物理内核、12 个逻辑处理器，所以一直测到 16 个线程只是为了观察超出硬件并发度之后的表现。

c++ openmp eigen reduction
1个回答
-1
投票

首先:如果

collapse(3)
使循环变慢,请尝试另一个编译器。一些编译器不擅长折叠循环中的索引计算。如果外循环有足够的迭代次数,您也可以尝试
collapse(2)
或什至不使用它。说到这里:总共有多少次
i,j,k
迭代?这个数字可能必须至少有几千。

更多关于那些循环。您正在按

(k,i,j)
编制索引,但您的循环排列方式不同。因为(正如@PierU 指出的那样)特征数组是列优先的,所以你应该用
j
outer,
i
middle 和
k
inner 来排序你的循环。

关于你试图将其表述为归约（reduction）：这里其实并不存在归约。归约要求被归约的量（通常是一个标量）同时出现在赋值的左侧和右侧（例如 sum += ...）。

关于你的编译错误：OpenMP 只能对标量或 C 风格数组做归约，这就解释了 “user defined reduction not found” 这条错误消息。要对任意 C++ 容器做归约，你可以提取它的

.data()
成员。我不确定 Eigen 是否支持这一点。或者,Eigen 需要在您此处拥有的任何对象上支持
operator+
。但如前所述,您没有减少。我只是想我会解释那个错误信息是关于什么的。

© www.soinside.com 2019 - 2024. All rights reserved.