How to efficient parallel matrix multiplication for small external indices and large internal indices in Fortran

问题描述 投票:0回答:0

我有一个矩阵乘法,

A[a,b]*B[b,c]=C[a,c]
a
c
的外部尺寸小,内部
b
大。在这种情况下如何高效并行? (如果
a
c
很大,那就直截了当,拆分外部维度)

我尝试了以下代码

Program test_dgemm

    Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64
    integer, parameter :: dp = selected_real_kind(15, 307)
  
    Real( dp ), Dimension( :, : ), Allocatable :: a
    Real( dp ), Dimension( :, : ), Allocatable :: b
    Real( dp ), Dimension( :, : ), Allocatable :: c
  
    Integer :: na, nb, nc, m_iter
    Integer :: i, j, k
    Integer( li ) :: start, finish, rate
    Integer :: numthreads

    real(dp) :: sum_time1, sum_time2

    Write( *, * ) 'numthreads'
    Read( *, * ) numthreads

    call omp_set_num_threads(numthreads)  

    Write( *, * ) 'na, nb, nc?'
    Read( *, * ) na, nb, nc
    Allocate( a ( 1:na, 1:nb ) ) 
    Allocate( b ( 1:nb, 1:nc ) )  
    Allocate( c( 1:na, 1:nc ) )
  
  
  !A[a,b] * B[b,c] = C[a,c]
    Call Random_number( a )
    Call Random_number( b )
    c  = 0.0_dp
 
    m_iter = 100
  



    sum_time1 = 0.0  
    do m = 1, m_iter    
        c  = 0.0_dp 
        Call System_clock( start, rate )

        do i = 1, na
            do j = 1, nc
                do k = 1, nb
                c(i,j) = c(i,j) + a(i,k)*b(k,j)
                end do  
            end do 
        end do    

        Call System_clock( finish, rate )
        sum_time1 = sum_time1 + Real( finish - start, dp ) / rate  
    end do  
    Write( *, * ) 'Time for loop-1', sum_time1 / m_iter



    sum_time1 = 0.0  
    do m = 1, m_iter     
        c  = 0.0_dp 

        Call System_clock( start, rate )

        do i = 1, na
            do j = 1, nc
                !$omp parallel
                !$omp do private(k) schedule(dynamic) reduction(+:c)
                do k = 1, nb
                    c(i,j) = c(i,j) + a(i,k)*b(k,j)
                end do  
                !$omp end do
                !$omp end parallel
            end do 
        end do    


        Call System_clock( finish, rate )
        sum_time1 = sum_time1 + Real( finish - start, dp ) / rate  
    end do  



    Write( *, * ) 'Time for loop-1-omp', sum_time1 / m_iter

end

gfortran
,
numthreads=2
, 不,注意,数控? 1 1000000 1

我有

 Time for loop-1   3.2360499999999977E-003
 Time for loop-1-omp   1.5933990999999998E-002

omp 没有帮助。我可能会尝试定义一些中间矩阵,例如 A1[a,1:b/2] B1[1:b/2,c], A2[a,b/2+1:b] B2[b/2 +1:b,c] 并且每个线程处理一个,然后对所有乘积矩阵求和。有没有更简单的方法修改上面的omp部分?

fortran openmp gfortran
© www.soinside.com 2019 - 2024. All rights reserved.