I wrote a discrete element model that computes the interactions between particles. As a first step I used OpenMP and did some scaling analysis on a supercomputer cluster, and I am now ready to move up to MPI because I have determined that I can make effective use of many nodes. However, I know nothing about MPI, and the examples I find online are either too simple or too complex. I just want to dispatch each loop iteration to a new processor on a different node, and when that iteration finishes, the next one gets assigned to the newly freed processor. (For reference, I plan to run on 4 nodes with 64 CPUs each. Currently, the OpenMP version runs fine on 1 node with 64 CPUs.)
My code has only one parallel region in OpenMP. What I want to do is split the parallel nested loop (over i and j) across multiple nodes (using all CPUs on each node), but I don't know how. Should I pre-compute the exact number of loop elements to send to each processor? Is there a way to schedule each loop iteration so that it mimics OpenMP's guided scheduling behaviour? Should I use Bcast or Scatter? Should they be used in both loops or only once? These are the questions I am asking myself.
The relevant (OpenMP) part of the code is:
subroutine stepper (tstep)

    use omp_lib
    use m_allocate, only: allocate
    use m_deallocate, only: deallocate
    use m_KdTree, only: KdTree, KdTreeSearch
    use dArgDynamicArray_Class, only: dArgDynamicArray
    use m_strings, only: str

    implicit none

    include "parameter.h"
    include "CB_variables.h"
    include "CB_const.h"
    include "CB_bond.h"
    include "CB_forcings.h"
    include "CB_options.h"

    integer :: i, j, k
    integer, intent(in) :: tstep

    type(KdTree) :: tree
    type(KdTreeSearch) :: search
    type(dArgDynamicArray) :: da

    ! Build the tree
    tree = KdTree(x, y)

    ! reinitialize force arrays for contact and bonds
    do i = 1, n
        mc(i)  = 0d0
        mb(i)  = 0d0
        fcx(i) = 0d0
        fcy(i) = 0d0
        fbx(i) = 0d0
        fby(i) = 0d0
        ! and total force arrays
        m(i)   = 0d0
        tfx(i) = 0d0
        tfy(i) = 0d0
    end do

    ! put yourself in the referential of the ith particle
    ! loop through all j particles and compute interactions
    !$omp parallel do schedule(guided) &
    !$omp private(i,j,da) &
    !$omp reduction(+:fcx,fcy,fbx,fby,mc,mb)
    do i = n, 1, -1
        ! find all the particles j near i
        da = search%kNearest(tree, x, y, xQuery = x(i), yQuery = y(i), &
                             radius = r(i) + rtree)

        ! loop over the nearest neighbors except the first because this is particle i
        do k = 1, size(da%i%values) - 1
            j = da%i%values(k + 1)

            ! only compute lower triangular matrices
            if (i .ge. j) then
                cycle
            end if

            ! compute relative position and velocity
            call rel_pos_vel (j, i)

            ! bond initialization
            if ( cohesion .eqv. .true. ) then
                if ( tstep .eq. 1 ) then
                    if ( deltan(j, i) .ge. -5d-1 ) then ! can be fancier
                        bond (j, i) = 1
                    end if
                    call bond_properties (j, i)
                end if
            end if

            ! verify if two particles are colliding
            if ( deltan(j,i) .gt. 0 ) then
                call contact_forces (j, i)
                !call bond_creation (j, i) ! to implement

                ! update contact force on particle i by particle j
                fcx(i) = fcx(i) - fcn(j,i) * cosa(j,i)
                fcy(i) = fcy(i) - fcn(j,i) * sina(j,i)

                ! update moment on particle i by particle j due to tangent contact
                mc(i) = mc(i) - r(i) * fct(j,i) - mcc(j,i)

                ! Newton's third law
                ! update contact force on particle j by particle i
                fcx(j) = fcx(j) + fcn(j,i) * cosa(j,i)
                fcy(j) = fcy(j) + fcn(j,i) * sina(j,i)

                ! update moment on particle j by particle i due to tangent contact
                mc(j) = mc(j) - r(j) * fct(j,i) + mcc(j,i)
            else
                call reset_contact (j, i)
            end if

            ! compute forces from bonds between particles i and j
            if ( bond (j, i) .eq. 1 ) then
                call bond_forces (j, i)
                call bond_breaking (j, i)

                if ( bond (j, i) .eq. 1 ) then
                    ! update force on particle i by j due to bond
                    fbx(i) = fbx(i) - fbn(j,i) * cosa(j,i) + &
                                      fbt(j,i) * sina(j,i)
                    fby(i) = fby(i) - fbn(j,i) * sina(j,i) - &
                                      fbt(j,i) * cosa(j,i)

                    ! update moment on particle i by j due to bond
                    mb(i) = mb(i) - r(i) * fbt(j,i) - mbb(j, i)

                    ! Newton's third law
                    ! update force on particle j by i due to bond
                    fbx(j) = fbx(j) + fbn(j,i) * cosa(j,i) - &
                                      fbt(j,i) * sina(j,i)
                    fby(j) = fby(j) + fbn(j,i) * sina(j,i) + &
                                      fbt(j,i) * cosa(j,i)

                    ! update moment on particle j by i due to bond
                    mb(j) = mb(j) - r(j) * fbt(j,i) + mbb(j, i)
                end if
            else
                call reset_bond (j, i)
            end if

            ! compute sheltering height of particle j on particle i for air and water drag
            call sheltering(j, i)
        end do

        ! compute the total forcing from winds, currents and coriolis on particle i
        call forcing (i)
        call coriolis(i)
    end do
    !$omp end parallel do

    ! deallocate tree memory
    call tree%deallocate()

    ! sum all forces together on particle i
    do i = 1, n
        tfx(i) = fcx(i) + fbx(i) + fax(i) + fwx(i) + fcorx(i)
        tfy(i) = fcy(i) + fby(i) + fay(i) + fwy(i) + fcory(i)

        ! sum all moments on particle i together
        m(i) = mc(i) + mb(i) + ma(i) + mw(i)
    end do

    ! forces on side particles for experiments
    !call experiment_forces

    ! integration in time
    call velocity
    call euler

end subroutine stepper
Note that I know how to compile and run the code; my problem is the actual logical transition from OpenMP to MPI. Obviously the following does not compile or run, because I don't understand how to use MPI correctly. Here is what I have tried so far:
subroutine stepper (tstep)

    use mpi
    ... ! same as above

    integer :: p, np, ierr
    integer, parameter :: master = 0
    double precision :: fcx_temp(n), fcy_temp(n), fbx_temp(n), fby_temp(n), &
                        mc_temp(n), mb_temp(n)

    ! Build the tree
    tree = KdTree(x, y)

    ! reinitialize force arrays for contact and bonds
    do i = 1, n
        mc(i)  = 0d0
        mb(i)  = 0d0
        fcx(i) = 0d0
        fcy(i) = 0d0
        fbx(i) = 0d0
        fby(i) = 0d0
        ! and total force arrays
        m(i)   = 0d0
        tfx(i) = 0d0
        tfy(i) = 0d0
    end do

    ! put yourself in the referential of the ith particle
    ! loop through all j particles and compute interactions
    call MPI_INIT(ierr)
    call MPI_COMM_RANK(MPI_COMM_WORLD, p, ierr)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, np, ierr)
    call MPI_BCAST(n, 1, MPI_INTEGER, master, MPI_COMM_WORLD, ierr)

    do i = n, 1, -1
        ! find all the particles j near i
        da = search%kNearest(tree, x, y, xQuery = x(i), yQuery = y(i), &
                             radius = r(i) + rtree)

        ! loop over the nearest neighbors except the first because this is particle i
        do k = 1, size(da%i%values) - 1
            j = da%i%values(k + 1)

            ! only compute lower triangular matrices
            if (i .ge. j) then
                cycle
            end if

            ... ! same as above with: variables -> variables_temp
        end do

        ... ! same as above

        call MPI_REDUCE(fcx_temp, fcx, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fcy_temp, fcy, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fbx_temp, fbx, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fby_temp, fby, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(mc_temp, mc, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(mb_temp, mb, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
    end do

    call MPI_FINALIZE(ierr)

    ... ! same as above

end subroutine stepper
Thanks!
The biggest difference between OpenMP and MPI is that with OpenMP you initially only have to think about dividing the work, because all threads have access to the same memory. (Later, for performance fine-tuning, you have to think harder, but the basic point stands: in OpenMP all threads share the same memory.)
In MPI, the division of work usually follows from a division of the data: you have to decide which process owns which subset of the data. So your loop over n elements becomes a loop over roughly n/nprocs elements, and each process decides which elements it gets.
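To make that concrete, here is a minimal sketch (not a drop-in replacement for your routine) of a static block decomposition of the i loop, assuming every rank keeps full copies of the particle arrays and only the index range is split; rank, nprocs, istart, iend and chunk are illustrative names, not from your code. Note that the count argument of the reduction must be the array length n (not 1), and that MPI_Init/MPI_Finalize belong at program start-up and shut-down, not inside the time-stepping routine:

    integer :: rank, nprocs, ierr
    integer :: istart, iend, chunk

    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

    ! contiguous block of i-iterations for this rank (the last rank may get fewer)
    chunk  = (n + nprocs - 1) / nprocs
    istart = rank * chunk + 1
    iend   = min(n, (rank + 1) * chunk)

    do i = istart, iend
        ! ... same loop body as in the OpenMP version, accumulating into fcx, fcy, ...
    end do

    ! sum the partial force/moment arrays over all ranks; every rank ends up with the full result
    call MPI_Allreduce(MPI_IN_PLACE, fcx, n, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
    call MPI_Allreduce(MPI_IN_PLACE, fcy, n, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
    ! ... likewise for fbx, fby, mc and mb ...

This is a static split, not the guided/dynamic dispatch you describe; a dynamic scheme needs a manager rank handing out iteration chunks on request, which is considerably more code.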
The big problem is that now that you have divided the data, a process may need data owned by its neighbours: the first and last points owned by a process have nearest neighbours that belong to the neighbouring processes. This is called "boundary exchange" or "halo regions" or something like that, and it is the standard example in basically every MPI course.
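For completeness, a minimal halo-exchange sketch, assuming a 1D decomposition in which each rank owns a contiguous slice of nloc values and needs one ghost value from each neighbour; nloc, xloc, left and right are illustrative names and do not come from your code:

    integer :: left, right, ierr
    double precision, allocatable :: xloc(:)

    allocate(xloc(0:nloc+1))   ! owned values in 1..nloc, ghost cells at 0 and nloc+1

    ! neighbouring ranks; MPI_PROC_NULL turns the exchange into a no-op at the domain ends
    left  = merge(rank - 1, MPI_PROC_NULL, rank > 0)
    right = merge(rank + 1, MPI_PROC_NULL, rank < nprocs - 1)

    ! send my first owned value left, receive my right ghost cell
    call MPI_Sendrecv(xloc(1),      1, MPI_DOUBLE_PRECISION, left,  0, &
                      xloc(nloc+1), 1, MPI_DOUBLE_PRECISION, right, 0, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

    ! send my last owned value right, receive my left ghost cell
    call MPI_Sendrecv(xloc(nloc),   1, MPI_DOUBLE_PRECISION, right, 1, &
                      xloc(0),      1, MPI_DOUBLE_PRECISION, left,  1, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

In your case the particles are not sorted by position, so a simple left/right halo does not map onto the code directly. For a first MPI port it is usually easier to replicate the position arrays on every rank and only split the i loop (as in the previous sketch), and to worry about a proper spatial domain decomposition with halo exchange once that works.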