MKL BLAS 的 zgemv 没有以多线程方式运行

问题描述 投票:0回答:0

我在配备两个 AMD EPYC 7443 24 核处理器和 1007GB RAM 的计算机上运行一个非常简单的 MKL BLAS 矩阵-矩阵和矩阵-向量乘法。

代码、编译命令和测试结果见文末。

如下所示,BLAS 显然只对矩阵-矩阵(mat-mat)运算启用了多线程,而没有对矩阵-向量(mat-vec)运算启用多线程。

如何使 mat-vec 操作多线程? 我做错了什么?

代码如下:

!> Benchmark driver for MKL threading behaviour.
!>
!> Runs one complex double-precision matrix-matrix product (blas95 `gemm`)
!> and one matrix-vector product (blas95 `gemv`) for each requested MKL
!> thread count 1, 2, 4, ..., 2**max_pow. The program prints nothing
!> itself; timings are expected to be collected externally (for example
!> with MKL_VERBOSE=1 set in the environment).
program main

  use blas95, only: gemm, gemv

  implicit none

  integer, parameter :: lp = kind(1.0d0)   ! double-precision kind
  integer, parameter :: max_pow = 5        ! largest thread count is 2**max_pow

  ! In the Fortran interface mkl_set_num_threads_local is an integer
  ! function (it returns the previous thread-local setting), so it is
  ! declared and invoked as a function rather than with `call`.
  integer, external :: mkl_set_num_threads_local
  external :: mkl_set_dynamic

  integer :: m, n, i, prev_nt
  complex(kind=lp), dimension(:), allocatable :: x, y
  complex(kind=lp), dimension(:,:), allocatable :: A, B, C

  m = 2**12
  n = 2**12

  allocate(A(m,n))
  allocate(B(m,n), C(m,n))
  allocate(x(n), y(m))

  ! Give every operand a defined value: passing freshly allocated
  ! (undefined) arrays to BLAS reads undefined data and can make the
  ! timings depend on whatever happens to be in memory.
  A = (1.0_lp, 0.0_lp)
  B = (1.0_lp, 0.0_lp)
  x = (1.0_lp, 0.0_lp)
  C = (0.0_lp, 0.0_lp)
  y = (0.0_lp, 0.0_lp)

  ! Disable dynamic adjustment of the thread count once, up front; the
  ! setting does not change between iterations.
  call mkl_set_dynamic(0)

  ! Matrix-matrix products with 1, 2, 4, ..., 2**max_pow threads.
  do i = 0, max_pow
     prev_nt = mkl_set_num_threads_local(2**i)   ! previous setting is not needed
     call gemm(A, B, C)
  end do

  ! Matrix-vector products with the same sequence of thread counts.
  do i = 0, max_pow
     prev_nt = mkl_set_num_threads_local(2**i)
     call gemv(A, x, y)
  end do

  deallocate(A, B, C, x, y)

end program main

这是我的编译命令:

gfortran -Ofast -I$MKLROOT/include -I$BLASROOT/include/intel64/lp64  main.F90 -L$MKLROOT/lib/intel64 -o main -lgomp -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core $BLASROOT/lib/intel64/libmkl_blas95_lp64.a

这是输出:

MKL_VERBOSE oneMKL 2022.0 Product build 20211112 for Intel(R) 64 architecture Intel(R) Architecture processors, Lnx 1.79GHz lp64 gnu_thread
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 10.94s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:1
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 5.90s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:2
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 3.76s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:4
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 1.59s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:8
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 925.07ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:16
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 606.32ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:32
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 12.23ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:1
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.68ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:2
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.66ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:4
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.62ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:8
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.64ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:16
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.60ms CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:32

下面是只进行 mat-vec 运算、但使用更大的矩阵和向量时的测试结果:

MKL_VERBOSE oneMKL 2022.0 Product build 20211112 for Intel(R) 64 architecture Intel(R) Architecture processors, Lnx 1.79GHz lp64 gnu_thread
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.89s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:1
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.87s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:2
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:4
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:8
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:16
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0  NThr:32
multithreading fortran blas intel-mkl
© www.soinside.com 2019 - 2024. All rights reserved.