OpenMP task construct does not scale with the number of threads

Problem description

I am using the OpenMP task construct to parallelize a binary-tree application. However, the runtime results show that the single-threaded implementation outperforms the multi-threaded ones. How can I explain these results?

The implementation is as follows:

  size_t N = 1 << 16;
  size_t *D = new size_t [N];
  size_t threads = 1;  // 1, 2, 4, 8, 16

  std::vector<std::vector<int>> mat1(128); // a 2D matrix
  std::vector<std::vector<int>> mat2(128); // a 2D matrix

  for(size_t i = 0; i < mat1.size(); ++i){
    mat1[i].resize(128);
    mat2[i].resize(128);
  }

  #pragma omp parallel num_threads(threads)
  {
    #pragma omp single
    {
      for(size_t i = 1; i < N; ++i) {

        size_t p = i / 2;
        size_t l = i * 2;
        size_t r = l + 1;

        if(l < N && r < N) {
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            // perform matrix multiplication
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                size_t sum = 0;
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }     
          }
        }
        else {
          #pragma omp task firstprivate(i) depend(in:D[p])
          {           
            // perform matrix multiplication
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                size_t sum = 0;
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }     
          }
        }
      }
    }
  }

  delete [] D;
}

However, if I replace the matrix multiplication with a sleep call, the runtime does change with the number of threads. The figure below shows the runtime performance of the matrix-multiplication version.

[Figure: runtime performance graph of matrix multiplication]

The figure below shows the runtime performance of the sleep version.

[Figure: runtime performance graph of sleep]
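For reference (not part of the original post), a minimal sketch of what the sleep-based task body could look like; the sleep duration is an assumption, and the snippet requires <thread> and <chrono>:

  #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
  {
    // same dependency structure as before, but the work is a fixed-length sleep,
    // so the per-task cost is constant and cannot be optimized away by the compiler
    std::this_thread::sleep_for(std::chrono::milliseconds(1)); // duration is illustrative
  }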

If I replace the matrix multiplication with a lightweight operation, as shown below:

size_t sum = i * 1000; // i is the private value for each task

I observe the same runtime behavior as with the matrix multiplication, i.e., OpenMP fails to scale. The runtime performance graph is shown below.

[Figure: runtime performance graph of the lightweight operation]

The implementation was compiled with gcc-12 at -O3 and run on an Ubuntu 19.10 (Eoan Ermine) machine with 80 Intel Xeon Gold 6138 CPUs (2.00 GHz) and 256 GB of RAM.

I expected the runtime versus thread count to form a convex curve, with the minimum at 4 or 8 threads.

2 Answers

Answer 1 (0 votes):

I tested your code and found two issues:

  1. Since you never use the results of the computation, the compiler (I used g++ -O3 -fopenmp) simply skips the loops (i.e., does not execute them at all), so the timings are meaningless. I added a few statements to avoid this (an alternative is sketched right after this list).
  2. But then I got a segmentation fault: I am not a C++ expert, but are you sure your matrices are declared correctly? The outer vectors have a size of 128, but the sizes of the inner vectors are never specified (edit: I see you have now corrected the code). I replaced them with classical C 2D arrays, and now the code runs fine... and takes much more time.
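Not part of the original answer: a lightweight alternative to a global result array is to write the per-task result to a volatile local, so the store (and therefore the loops producing it) cannot be removed as dead code; this works in practice with GCC and Clang. A minimal sketch of the task body:

  size_t sum = 0;
  for(size_t x = 0; x < 128; ++x) {
    for(size_t y = 0; y < 128; ++y) {
      for(size_t z = 0; z < 128; ++z) {
        sum += mat1[x][z] * mat2[y][z];
      }
    }
  }
  volatile size_t sink = sum; // volatile store is an observable side effect, so the computation is kept
  (void)sink;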

Here is the modified code:

#include <cstdio>
#include <cstddef>
#include <vector>
#include <omp.h>

int main() {

  size_t N = 1 << 10; //16;
  size_t *D = new size_t [N];
  size_t threads = 2;  // 1, 2, 4, 8, 16
  size_t *sum = new size_t [N](); // global, zero-initialized sum array so the calculations are not skipped

  //std::vector<std::vector<int>> mat1(128); // a 2D matrix
  //std::vector<std::vector<int>> mat2(128); // a 2D matrix

  // classical C arrays
  size_t mat1[128][128], mat2[128][128];
  for (int i = 0; i < 128; i++) {
    for (int j = 0; j < 128; j++) {
      mat1[i][j] = 0;
      mat2[i][j] = 0;
    }
  }

  double tic = omp_get_wtime();

  // number of threads set by the environment variable OMP_NUM_THREADS
  #pragma omp parallel 
  {
    #pragma omp single
    {
      for(size_t i = 1; i < N; ++i) {

        size_t p = i / 2;
        size_t l = i * 2;
        size_t r = l + 1;

        if(l < N && r < N) {
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            // perform matrix multiplication
            //size_t sum = 0;
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum[i] += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }  
          }
        }
        else {
          #pragma omp task firstprivate(i) depend(in:D[p])
          {           
            // perform matrix multiplication
            //size_t sum = 0;
            for(size_t x = 0; x < 128; ++x) {
              for(size_t y = 0; y < 128; ++y) {
                for(size_t z = 0; z < 128; ++z) {
                  //sum += mat1[x][z] * mat2[z][y];
                  sum[i] += mat1[x][z] * mat2[y][z]; // mat2 is transposed for better memory access pattern
                }
              }
            }
          }
        }
      }
    }
  }

  double toc = omp_get_wtime();
  printf("%lf\n",toc-tic);

  delete [] D;
  delete [] sum;
}

As I said, it is much slower, and it does scale (the machine has 10 cores):

% g++ -O3 -fopenmp mmtask.cpp
% setenv OMP_NUM_THREADS 1 && ./a.out
1.803429
% setenv OMP_NUM_THREADS 2 && ./a.out
1.011750
% setenv OMP_NUM_THREADS 4 && ./a.out
0.475696
% setenv OMP_NUM_THREADS 8 && ./a.out
0.301431
% setenv OMP_NUM_THREADS 16 && ./a.out
0.357968

Answer 2 (0 votes):

Based on the previous discussion and comments, I found that OpenMP tasks scale only under two conditions:

  1. The computational load of each task must be large enough (>100 us). Since the overhead of the task construct is not negligible, the runtime of small tasks is dominated by task creation and scheduling. (A sketch for measuring the per-task cost is shown further below, just before the complete implementation.)
  2. When a task only performs local computation whose result is never used, the compiler optimizes the computation away, for example:
for(size_t x = 0; x < 128; ++x) {
  for(size_t y = 0; y < 128; ++y) {
    size_t sum = 0;
    for(size_t z = 0; z < 128; ++z) {
      sum += mat1[x][z] * mat2[y][z]; // the compiler optimizes this out, since sum is never used
    }
  }
}    

To avoid this optimization, we should store the results in global data, as PierU suggested:

size_t *sum = new size_t[N];

for(size_t x = 0; x < 128; ++x) {
  for(size_t y = 0; y < 128; ++y) {
    for(size_t z = 0; z < 128; ++z) {
      sum[i] += mat1[x][z] * mat2[y][z]; 
    }
  }
}    
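To check condition 1, one can time a single instance of the task body in isolation, outside of any OpenMP region. Below is a minimal sketch using the same 128x128 sizes as the examples above (the initial matrix values are illustrative); compile with -fopenmp so omp_get_wtime() is available:

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  const size_t n = 128; // matrix size used in the examples above
  std::vector<std::vector<int>> mat1(n, std::vector<int>(n, 1));
  std::vector<std::vector<int>> mat2(n, std::vector<int>(n, 1));
  size_t sum = 0;

  double t0 = omp_get_wtime();
  for(size_t x = 0; x < n; ++x)
    for(size_t y = 0; y < n; ++y)
      for(size_t z = 0; z < n; ++z)
        sum += mat1[x][z] * mat2[y][z];
  double t1 = omp_get_wtime();

  // If this time is well above ~100 us, task creation and scheduling overhead
  // should be small compared to the work done by each task.
  std::printf("one task body: %f us (sum=%zu)\n", (t1 - t0) * 1e6, sum);
  return 0;
}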

Below, I post the complete implementation used to measure the runtime performance of the OpenMP task construct on the binary-tree structure. The implementation has been modified according to the previous comments. It was compiled with gcc-9 at -O3 and run on an Ubuntu 19.10 (Eoan Ermine) machine with 80 Intel Xeon Gold 6138 CPUs (2.00 GHz) and 256 GB of RAM.

#include <omp.h>
#include <iostream>
#include <chrono>
#include <vector>
#include <string>
#include <stdlib.h>

int main(int argc, char** argv) {
  
  size_t num_layers = 18;
  unsigned num_threads = strtol(argv[1], NULL, 10);
  size_t msize = strtol(argv[2], NULL, 10);
  
  size_t N = 1 << num_layers;

  std::vector<std::vector<int>> mat1(msize);
  std::vector<std::vector<int>> mat2(msize);

  for(int i = 0; i < msize; ++i) {
    mat1[i].resize(msize);
    mat2[i].resize(msize);
  }
  
  for(int i = 0; i < msize; ++i) {
    for(int j = 0; j < msize; ++j) {
      mat1[i][j] = i+j;
      mat2[i][j] = i-j;
    }
  }

  auto beg = std::chrono::high_resolution_clock::now();
  
  size_t *D = new size_t [N];

  #pragma omp parallel num_threads(num_threads)
  {
    #pragma omp single
    {
      for(size_t i = 1; i<N; ++i) {

        size_t p = i / 2;
        size_t l = i * 2;
        size_t r = l + 1;

        if(l < N && r < N) {
          #pragma omp task firstprivate(i) depend(out:D[l], D[r]) depend(in:D[p])
          {
            D[i] = 0; 
            for (int x = 0; x < mat1.size(); x++) {
              for (int y = 0; y < mat1.size(); y++) {
                for (int z = 0; z < mat1.size(); z++) {
                  D[i] += mat1[x][z] * mat2[y][z];
                }
              }
            }
          }
        }
        else {
          #pragma omp task firstprivate(i) depend(in:D[p])
          {
             
            D[i] = 0; 
            for (int x = 0; x < mat1.size(); x++) {
              for (int y = 0; y < mat1.size(); y++) {
                for (int z = 0; z < mat1.size(); z++) {
                  D[i] += mat1[x][z] * mat2[y][z];
                }
              }
            }
          }
        }
      }
    }
  }

  delete [] D;

  auto end = std::chrono::high_resolution_clock::now();

  std::cout << "Elapsed: " << std::chrono::duration_cast<std::chrono::microseconds>(end - beg).count() << " us\n";

  return 0;
}
