我有一个矩阵乘法,
A[a,b]*B[b,c]=C[a,c]
。 a
和 c
的外部尺寸小,内部 b
大。在这种情况下如何高效并行? (如果a
和c
很大,那就直截了当,拆分外部维度)
我尝试了以下代码
Program test_dgemm
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
integer, parameter :: dp = selected_real_kind(15, 307)
Real( dp ), Dimension( :, : ), Allocatable :: a
Real( dp ), Dimension( :, : ), Allocatable :: b
Real( dp ), Dimension( :, : ), Allocatable :: c
Integer :: na, nb, nc, m_iter
Integer :: i, j, k
Integer( li ) :: start, finish, rate
Integer :: numthreads
real(dp) :: sum_time1, sum_time2
Write( *, * ) 'numthreads'
Read( *, * ) numthreads
call omp_set_num_threads(numthreads)
Write( *, * ) 'na, nb, nc?'
Read( *, * ) na, nb, nc
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc ) )
Allocate( c( 1:na, 1:nc ) )
!A[a,b] * B[b,c] = C[a,c]
Call Random_number( a )
Call Random_number( b )
c = 0.0_dp
m_iter = 100
sum_time1 = 0.0
do m = 1, m_iter
c = 0.0_dp
Call System_clock( start, rate )
do i = 1, na
do j = 1, nc
do k = 1, nb
c(i,j) = c(i,j) + a(i,k)*b(k,j)
end do
end do
end do
Call System_clock( finish, rate )
sum_time1 = sum_time1 + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for loop-1', sum_time1 / m_iter
sum_time1 = 0.0
do m = 1, m_iter
c = 0.0_dp
Call System_clock( start, rate )
do i = 1, na
do j = 1, nc
!$omp parallel
!$omp do private(k) schedule(dynamic) reduction(+:c)
do k = 1, nb
c(i,j) = c(i,j) + a(i,k)*b(k,j)
end do
!$omp end do
!$omp end parallel
end do
end do
Call System_clock( finish, rate )
sum_time1 = sum_time1 + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for loop-1-omp', sum_time1 / m_iter
end
在
gfortran
, numthreads=2
,
不,注意,数控?
1 1000000 1
我有
Time for loop-1 3.2360499999999977E-003
Time for loop-1-omp 1.5933990999999998E-002
omp 没有帮助。我可能会尝试定义一些中间矩阵,例如 A1[a,1:b/2] B1[1:b/2,c], A2[a,b/2+1:b] B2[b/2 +1:b,c] 并且每个线程处理一个,然后对所有乘积矩阵求和。有没有更简单的方法修改上面的omp部分?