How to convert an OpenMP parallel region to MPI in Fortran

Problem description

I have written a discrete element model that computes the interactions between particles. As a first step I used OpenMP, and after some scaling analysis on a supercomputing cluster I am now ready to move up to MPI, because I have established that I could make efficient use of many nodes. However, I know nothing about MPI, and the examples I have found online are either too simple or too complicated. I just want to dispatch each loop iteration to a new processor on the different nodes, and when an iteration finishes, the next one should be assigned to the processor that has just become free. (For reference, I plan to run on 4 nodes with 64 CPUs each. Currently the OpenMP version runs well on 1 node with 64 CPUs.)

My code has only one parallel region in OpenMP. What I want to do is split the parallel nested loops (over i and j) across several nodes, using all the CPUs on each node, but I don't know how. Should I compute in advance the exact number of loop iterations to send to each processor? Is there a way to mimic the OpenMP guided scheduling behaviour to dispatch each loop iteration? Should I use Bcast or Scatter? Should they be used inside both loops or only once? These are the questions I am asking myself.

The relevant (OpenMP) part of the code is:

subroutine stepper (tstep)

    use omp_lib
    use m_allocate, only: allocate
    use m_deallocate, only: deallocate
    use m_KdTree, only: KdTree, KdTreeSearch
    use dArgDynamicArray_Class, only: dArgDynamicArray
    use m_strings, only: str

    implicit none

    include "parameter.h"
    include "CB_variables.h"
    include "CB_const.h"
    include "CB_bond.h"
    include "CB_forcings.h"
    include "CB_options.h"

    integer :: i, j, k
    integer, intent(in) :: tstep
    type(KdTree) :: tree
    type(KdTreeSearch) :: search
    type(dArgDynamicArray) :: da

    ! Build the tree
    tree = KdTree(x, y)
    
    ! reinitialize force arrays for contact and bonds
    do i = 1, n
        mc(i)    = 0d0
        mb(i)    = 0d0
        fcx(i)   = 0d0
        fcy(i)   = 0d0
        fbx(i)   = 0d0
        fby(i)   = 0d0
        ! and total force arrays
        m(i)   = 0d0
        tfx(i) = 0d0
        tfy(i) = 0d0
    end do
    
    ! put yourself in the referential of the ith particle
    ! loop through all j particles and compute interactions

    !$omp parallel do schedule(guided) &
    !$omp private(i,j,da) &
    !$omp reduction(+:fcx,fcy,fbx,fby,mc,mb)
    do i = n, 1, -1
        ! Find all the particles j near i
        da = search%kNearest(tree, x, y, xQuery = x(i), yQuery = y(i), &
                            radius = r(i) + rtree)
        ! loop over the nearest neighbors except the first because this is the particle i
        do k = 1, size(da%i%values) - 1
            j = da%i%values(k + 1)

            ! only compute lower triangular matrices
            if (i .ge. j) then
                cycle
            end if

            ! compute relative position and velocity
            call rel_pos_vel (j, i)

            ! bond initialization
            if ( cohesion .eqv. .true. ) then
                if ( tstep .eq. 1 ) then
                    if ( deltan(j, i) .ge. -5d-1 ) then ! can be fancier
                        bond (j, i) = 1
                    end if
                    call bond_properties (j, i)
                end if
            end if

            ! verify if two particles are colliding
            if ( deltan(j,i) .gt. 0 ) then
                
                call contact_forces (j, i)
                !call bond_creation (j, i) ! to implement

                ! update contact force on particle i by particle j
                fcx(i) = fcx(i) - fcn(j,i) * cosa(j,i)
                fcy(i) = fcy(i) - fcn(j,i) * sina(j,i)

                ! update moment on particle i by particle j due to tangent contact
                mc(i) = mc(i) - r(i) * fct(j,i) - mcc(j,i)

                ! Newton's third law
                ! update contact force on particle j by particle i
                fcx(j) = fcx(j) + fcn(j,i) * cosa(j,i)
                fcy(j) = fcy(j) + fcn(j,i) * sina(j,i)

                ! update moment on particle j by particle i due to tangent contact
                mc(j) = mc(j) - r(j) * fct(j,i) + mcc(j,i)

            else
            
                call reset_contact (j, i)

            end if

            ! compute forces from bonds between particle i and j
            if ( bond (j, i) .eq. 1 ) then

                call bond_forces (j, i)
                call bond_breaking (j, i)

                if ( bond (j, i) .eq. 1 ) then
                    ! update force on particle i by j due to bond
                    fbx(i) = fbx(i) - fbn(j,i) * cosa(j,i) +    &
                                        fbt(j,i) * sina(j,i)
                    fby(i) = fby(i) - fbn(j,i) * sina(j,i) -    &
                                        fbt(j,i) * cosa(j,i)

                    ! update moment on particle i by j due to bond
                    mb(i) = mb(i) - r(i) * fbt(j,i) - mbb(j, i)

                    ! Newton's third law
                    ! update force on particle j by i due to bond
                    fbx(j) = fbx(j) + fbn(j,i) * cosa(j,i) -    &
                                        fbt(j,i) * sina(j,i)
                    fby(j) = fby(j) + fbn(j,i) * sina(j,i) +    &
                                        fbt(j,i) * cosa(j,i)
    
                    ! update moment on particle j by i due to bond
                    mb(j) = mb(j) - r(j) * fbt(j,i) + mbb(j, i)
                end if

            else

                call reset_bond (j, i)

            end if

            ! compute sheltering height for particle j on particle i for air and water drag
            call sheltering(j, i)

        end do

        ! compute the total forcing from winds, currents and coriolis on particle i
        call forcing (i)
        call coriolis(i)

    end do
    !$omp end parallel do

    ! deallocate tree memory
    call tree%deallocate()

    ! sum all forces together on particle i
    do i = 1, n
        tfx(i) = fcx(i) + fbx(i) + fax(i) + fwx(i) + fcorx(i)
        tfy(i) = fcy(i) + fby(i) + fay(i) + fwy(i) + fcory(i)

        ! sum all moments on particle i together
        m(i) =  mc(i) + mb(i) + ma(i) + mw(i)
    end do

    ! forces on side particles for experiments
    !call experiment_forces

    ! integration in time
    call velocity
    call euler

end subroutine stepper

Note that I know how to compile and run the code; my question is about the actual logic of the conversion from OpenMP to MPI. Obviously the code below does not compile or run, because I do not understand how to use MPI properly. Here is what I have tried so far:

subroutine stepper (tstep)

    use mpi
    
    ... ! same as above

    integer :: p, np , ierr
    integer, parameter :: master = 0
    double precision :: fcx_temp(n), fcy_temp(n), fbx_temp(n), fby_temp(n), mc_temp(n), mb_temp(n)

    ! Build the tree
    tree = KdTree(x, y)
    
    ! reinitialize force arrays for contact and bonds
    do i = 1, n
        mc(i)    = 0d0
        mb(i)    = 0d0
        fcx(i)   = 0d0
        fcy(i)   = 0d0
        fbx(i)   = 0d0
        fby(i)   = 0d0
        ! and total force arrays
        m(i)   = 0d0
    tfx(i) = 0d0
        tfy(i) = 0d0
    end do
    
    ! put yourself in the referential of the ith particle
    ! loop through all j particles and compute interactions

    call MPI_INIT(ierr) 
    call MPI_COMM_RANK(MPI_COMM_WORLD, p, ierr)
    call MPI_COMM_SIZE(MPI_COMM_WORLD, np, ierr)

    call MPI_BCAST(n, 1, MPI_INTEGER, master, MPI_COMM_WORLD, ierr)

    do i = n, 1, -1

        ! Find all the particles j near i
        da = search%kNearest(tree, x, y, xQuery = x(i), yQuery = y(i), &
                            radius = r(i) + rtree)
        ! loop over the nearest neighbors except the first because this is the particle i
        do k = 1, size(da%i%values) - 1
            j = da%i%values(k + 1)

            ! only compute lower triangular matrices
            if (i .ge. j) then
                cycle
            end if

        ... ! same as above with: variables -> variables_temp

        end do

        ... ! same as above

        call MPI_REDUCE(fcx_temp, fcx, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fcy_temp, fcy, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fbx_temp, fbx, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(fby_temp, fby, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(mc_temp, mc, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)
        call MPI_REDUCE(mb_temp, mb, 1, MPI_DOUBLE_PRECISION, MPI_SUM, master, MPI_COMM_WORLD, ierr)

    end do
    
    call MPI_FINALIZE(ierr)

    ... ! same as above

end subroutine stepper

Thanks!

fortran mpi openmp reduce do-loops
1 Answer

The biggest difference between OpenMP and MPI is that with OpenMP, to a first approximation, you only have to think about dividing the work, because all threads have access to the same memory. Later, for performance fine-tuning, you have to think harder, but the starting point is simply that in OpenMP all threads share the same memory.

In MPI, the division of work usually follows from a division of the data: you have to decide which process gets which subset of the data. Your loop over n elements therefore becomes a loop over n/nprocs elements, and each process decides which elements it gets.
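
To make the "loop over n/nprocs elements" idea concrete, here is a minimal, self-contained sketch of a 1D block decomposition with a reduction of a force array. It is not the asker's model: the value of n, the chunk formula and the placeholder work are illustrative, and the reduction uses MPI_Allreduce so that every rank ends up with the full array (MPI_Reduce would do if only one rank needs it).

! Minimal sketch: each rank owns a contiguous block of the i loop, accumulates
! its contributions into a full-size local array, and an MPI_Allreduce sums the
! per-rank partial results, playing the role of the OpenMP reduction clause.
program block_decomposition_sketch
    use mpi
    implicit none

    integer, parameter :: n = 1000      ! total number of particles (example value)
    integer :: rank, nprocs, ierr
    integer :: istart, iend, chunk, i
    double precision :: fcx(n)          ! partial force sums, full-size on every rank

    call MPI_Init(ierr)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

    ! 1D block decomposition: this rank owns iterations istart..iend
    chunk  = (n + nprocs - 1) / nprocs
    istart = rank * chunk + 1
    iend   = min(n, (rank + 1) * chunk)

    fcx = 0d0
    do i = istart, iend
        ! ... compute interactions of particle i with its neighbours,
        !     accumulating into fcx(i), fcx(j), etc. ...
        fcx(i) = fcx(i) + 1d0           ! placeholder work
    end do

    ! Sum the partial contributions from all ranks; afterwards every rank
    ! holds the same complete force array.
    call MPI_Allreduce(MPI_IN_PLACE, fcx, n, MPI_DOUBLE_PRECISION, &
                       MPI_SUM, MPI_COMM_WORLD, ierr)

    if (rank == 0) print *, 'sum of forces =', sum(fcx)

    call MPI_Finalize(ierr)
end program block_decomposition_sketch

Keeping fcx full-size on every rank means that Newton's-third-law updates to particles owned by another rank are still captured by the reduction. Note also that MPI_Init and MPI_Finalize must be called exactly once per program run, at the very beginning and end, not inside a subroutine such as stepper that is called every time step, and that the count argument of MPI_Reduce/MPI_Allreduce is the number of array elements (n here), not 1.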

The big problem is that, once you have divided the data, a process may need data from its neighbours: the first and last points owned by a process have nearest neighbours that belong to the neighbouring processes. This is known as "boundary exchange" or "halo regions" or something similar, and it is a standard example in basically every MPI course.
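
As an illustration of such a halo region, here is a minimal sketch of a boundary exchange for a 1D block decomposition; the array u, the size nlocal and the single ghost cell per side are illustrative assumptions, not part of the question's model.

! Minimal halo-exchange sketch: each rank stores its local block in u(1:nlocal)
! plus two ghost cells u(0) and u(nlocal+1) that mirror the edge values of the
! neighbouring ranks.
program halo_exchange_sketch
    use mpi
    implicit none

    integer, parameter :: nlocal = 100   ! local block size (example value)
    integer :: rank, nprocs, left, right, ierr
    double precision :: u(0:nlocal+1)

    call MPI_Init(ierr)
    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)

    ! Neighbouring ranks; MPI_PROC_NULL makes the exchange a no-op at the ends
    left  = rank - 1
    right = rank + 1
    if (left  < 0)       left  = MPI_PROC_NULL
    if (right >= nprocs) right = MPI_PROC_NULL

    u = dble(rank)                       ! placeholder local data

    ! Send my last element to the right neighbour, receive its last element
    ! from the left neighbour into my left ghost cell
    call MPI_Sendrecv(u(nlocal), 1, MPI_DOUBLE_PRECISION, right, 0, &
                      u(0),      1, MPI_DOUBLE_PRECISION, left,  0, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
    ! Send my first element to the left neighbour, receive the right
    ! neighbour's first element into my right ghost cell
    call MPI_Sendrecv(u(1),        1, MPI_DOUBLE_PRECISION, left,  1, &
                      u(nlocal+1), 1, MPI_DOUBLE_PRECISION, right, 1, &
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

    call MPI_Finalize(ierr)
end program halo_exchange_sketch

In a particle model the "halo" would be the set of particles lying within one interaction radius of the edge of a rank's block, so positions and velocities of several particles are exchanged rather than a single ghost value, but the communication pattern is the same.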
