! Module for MPI communication of field halos
! This module uses Fortran 2003 features : move_alloc intrinsic, pointer bounds remapping, allocatable type fields
module transfert_mpi_mod
  use abort_mod, only : dynamico_abort, abort_acc
  use profiling_mod, only : enter_profile, exit_profile, register_id
  use domain_mod, only : ndomain, ndomain_glo, domain, domain_glo, domloc_glo_ind, domglo_rank, domglo_loc_ind
  use field_mod, only : t_field, field_T, field_U, field_Z
  use transfert_request_mod
  implicit none
  private

  ! Describes how to pack/unpack a message from a local domain to another
  type t_local_submessage
    integer :: src_ind_loc, dest_ind_loc ! index of local and remote domain
    integer, allocatable :: displ_src(:) ! List of indexes to copy from domain src_ind_loc
    integer, allocatable :: displ_dest(:) ! List of indexes to copy to domain dest_ind_loc
    integer, allocatable :: sign(:) ! Sign change to be applied for vector requests
  end type

  ! Describes how to pack/unpack a message from a domain to another, and contains the MPI buffer
  type t_submessage
    integer :: ind_loc, remote_ind_glo ! index of local and remote domain
    integer, allocatable :: displs(:) ! List of indexes to copy from field to buffer for each level
    integer, allocatable :: sign(:) ! Sign change to be applied for vector requests
    real, allocatable :: buff(:,:,:) ! MPI buffer buff(iim*jjm[*3],dim3,dim4)
  end type

  ! Describes how to exchange data for a field.
  type t_message
    type (t_field), pointer :: field(:) => null() ! Field to exchange
    type (t_request), pointer :: request(:) => null() ! Type of message to send
    type (t_local_submessage), pointer :: message_local(:) ! Local halo copies
    type (t_submessage), pointer :: message_in(:) ! Messages to receive from remote ranks and to copy back to the field
    type (t_submessage), pointer :: message_out(:) ! Halos to copy to MPI buffer and to send to remote ranks
    integer, pointer :: mpi_requests_in(:) ! MPI requests used for message_in.
    integer, pointer :: mpi_requests_out(:) ! MPI requests used for message_out.
    ! NOTE : requests are persistent requests initialized in init_message. MPI_Start and MPI_Wait are then used to initiate and complete communications.
    ! ex : Give mpi_requests_in(i) to MPI_Start to start the receive into the buffer contained in message_in(i)
    integer :: send_seq ! Sequence number : send_seq is incremented each time send_message is called
    integer :: wait_seq ! Sequence number : wait_seq is incremented each time wait_message is called
    logical :: ondevice ! Ready to transfer ondevice field
  end type t_message

  public :: t_message, t_request, &
    req_i1, req_e1_scal, req_e1_vect, &
    req_i0, req_e0_scal, req_e0_vect, &
    req_z1_scal, &
    init_transfert, &
    init_message, &
    finalize_message, &
    send_message, &
    wait_message, &
    test_message

  ! ---- Private variables ----
  ! Profiling id for mpi
  integer :: profile_mpi, profile_mpi_copies, profile_mpi_waitall, profile_mpi_barrier
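
  ! Typical calling sequence (illustrative sketch, argument lists abbreviated ;
  ! 'f' stands for a field pointer array and 'msg' for a t_message) :
  !   call init_transfert()              ! once, before any other transfert_mpi routine
  !   call init_message(f, req_i1, msg)  ! build submessages and persistent MPI requests
  !   call send_message(f, msg)          ! pack halos and start the persistent requests
  !   call wait_message(msg)             ! complete the requests and unpack into f
  !   call finalize_message(msg)         ! free buffers and MPI requests
  ! Each send_message must be matched by a wait_message before the next send_message
  ! on the same t_message (checked through send_seq/wait_seq).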
contains
  ! Initialize transfert : must be called before any other transfert_mpi routines
  subroutine init_transfert
    use mpi_mod, only : MPI_THREAD_SINGLE, MPI_THREAD_FUNNELED
    use mpipara, only : mpi_threading_mode
    use profiling_mod, only : register_id
    logical, parameter :: profile_mpi_detail = .true.

    !$omp master
    ! Check requested threads support
    if( mpi_threading_mode /= MPI_THREAD_SINGLE .and. mpi_threading_mode /= MPI_THREAD_FUNNELED ) &
      call dynamico_abort("Only single and funneled threading mode are supported.")
    ! Register profiling ids
    call register_id("MPI", profile_mpi)
    if( profile_mpi_detail ) then
      call register_id("MPI_copies", profile_mpi_copies)
      call register_id("MPI_waitall", profile_mpi_waitall)
      call register_id("MPI_omp_barrier", profile_mpi_barrier)
    else
      profile_mpi_copies = profile_mpi
      profile_mpi_waitall = profile_mpi
      profile_mpi_barrier = profile_mpi
    endif
    ! Initialize requests
    call init_all_requests()
    !$omp end master
    !$omp barrier
  end subroutine

  subroutine init_message(field, request, message, name)
    use mpi_mod
    use mpipara
    type(t_field), pointer, intent(in) :: field(:)
    type(t_request), pointer, intent(in) :: request(:)
    type(t_message), target, intent(out) :: message ! Needs intent out for call to finalize_message
    character(len=*), intent(in), optional :: name
    integer, parameter :: INITIAL_ALLOC_SIZE = 10, GROW_FACTOR = 2
    type(t_submessage) :: submessage_in, submessage_out
    type(t_local_submessage) :: submessage_local
    integer :: dim3, dim4
    integer :: ind, ind_loc, remote_ind_glo, i, ierr
    integer :: message_in_size, message_out_size, message_local_size
    type(t_local_submessage), allocatable :: message_local_tmp(:)
    type(t_submessage), allocatable :: message_in_tmp(:), message_out_tmp(:)
    type(t_submessage), pointer :: submessage
    integer :: field_type

    !$omp barrier
    !$omp master
    ! init off-device
    message%ondevice = .false.
    message%send_seq = 0
    message%wait_seq = 0

    if( request(1)%field_type /= field(1)%field_type ) call dynamico_abort( "init_message : field_type/request mismatch" )
    field_type = request(1)%field_type

    ! Set field%rval4d pointer to always use a 4d array
    do ind = 1, ndomain
      if( field(ind)%ndim == 2 ) field(ind)%rval4d(1:size(field(ind)%rval2d,1),1:1,1:1) => field(ind)%rval2d
      ! This is Fortran 2008 : can be avoided by using a subroutine with rval3d as a 1D dummy argument
      ! (/!\ : using a subroutine might generate a temporary contiguous array)
      if( field(ind)%ndim == 3 ) field(ind)%rval4d(1:size(field(ind)%rval3d,1), &
        1:size(field(ind)%rval3d,2), 1:1) => field(ind)%rval3d
    end do
    dim3 = size(field(1)%rval4d,2)
    dim4 = size(field(1)%rval4d,3)

    message%field => field
    message%request => request

    ! Create lists of inbound/outbound/local messages
    allocate(message_in_tmp(INITIAL_ALLOC_SIZE))
    message_in_size = 0
    allocate(message_out_tmp(INITIAL_ALLOC_SIZE))
    message_out_size = 0
    allocate(message_local_tmp(INITIAL_ALLOC_SIZE))
    message_local_size = 0
    do ind_loc = 1, ndomain
      do remote_ind_glo = 1, ndomain_glo
        if( domglo_rank(remote_ind_glo) == mpi_rank ) then ! If the destination domain is on this rank
          if( request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then ! Add only non-empty messages
            ! Add local message ind_loc -> remote_ind_glo, aggregating submessage_in and submessage_out into submessage_local
            submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), &
                                              ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector )
            submessage_in = make_submessage( field_type, &
                              request(domglo_loc_ind(remote_ind_glo))%points_BtoH(domloc_glo_ind(ind_loc)), &
                              domglo_loc_ind(remote_ind_glo), domloc_glo_ind(ind_loc), dim3, dim4, request(1)%vector )
            submessage_local%src_ind_loc = ind_loc
            submessage_local%dest_ind_loc = domglo_loc_ind(remote_ind_glo)
            submessage_local%displ_src = submessage_out%displs
            submessage_local%displ_dest = submessage_in%displs
            submessage_local%sign = submessage_in%sign
            ! Add to local message list
            call array_append_local_submessage( message_local_tmp, message_local_size, submessage_local )
          endif
        else ! If remote domain
          ! When there is data to send to the remote domain, add a submessage to message%message_out
          if( request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then
            submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), &
                                              ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector )
            call array_append_submessage( message_out_tmp, message_out_size, submessage_out )
          end if
          ! When there is data to receive from the remote domain, add a submessage to message%message_in
          if( request(ind_loc)%points_BtoH(remote_ind_glo)%npoints > 0 ) then
            submessage_in = make_submessage( field_type, request(ind_loc)%points_BtoH(remote_ind_glo), &
                                             ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector )
            call array_append_submessage( message_in_tmp, message_in_size, submessage_in )
          end if
        end if
      end do
    end do

    ! Trim message_xx_tmp and put it in message%message_xx
    allocate(message%message_in(message_in_size));       message%message_in(:) = message_in_tmp(:message_in_size)
    allocate(message%message_out(message_out_size));     message%message_out(:) = message_out_tmp(:message_out_size)
    allocate(message%message_local(message_local_size)); message%message_local(:) = message_local_tmp(:message_local_size)

    ! Create MPI persistent Send/Recv requests.
    ! The tag encodes the (source, destination) pair of global domain indices : the sender uses
    ! remote_ind_glo + ndomain_glo*domloc_glo_ind(ind_loc) and the matching receive computes the
    ! same value from its own side, so each domain pair has a unique tag.
    allocate( message%mpi_requests_in(size(message%message_in)) )
    allocate( message%mpi_requests_out(size(message%message_out)) )
    do i = 1, size(message%message_out)
      submessage => message%message_out(i)
      call MPI_Send_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), &
                          submessage%remote_ind_glo+ndomain_glo*domloc_glo_ind(submessage%ind_loc), &
                          comm_icosa, message%mpi_requests_out(i), ierr )
    end do
    do i = 1, size(message%message_in)
      submessage => message%message_in(i)
      call MPI_Recv_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), &
                          domloc_glo_ind(submessage%ind_loc)+ndomain_glo*submessage%remote_ind_glo, &
                          comm_icosa, message%mpi_requests_in(i), ierr )
    end do
    !$omp end master
    !$omp barrier

  contains
    ! Generate submessage from points
    function make_submessage(field_type, points, ind_loc, remote_ind_glo, dim3, dim4, vector) result(submessage)
      use dimensions, only : swap_dimensions, iim, u_pos, z_pos
      integer, intent(in) :: field_type
      type(t_points), intent(in) :: points
      integer, intent(in) :: ind_loc, remote_ind_glo, dim3, dim4
      logical, intent(in) :: vector
      integer :: k
      type(t_submessage) :: submessage

      call swap_dimensions(ind_loc)
      submessage%ind_loc = ind_loc
      submessage%remote_ind_glo = remote_ind_glo
      allocate( submessage%buff( points%npoints, dim3, dim4 ) )
      allocate( submessage%displs( points%npoints ) )
      submessage%displs(:) = points%i + (points%j-1)*iim
      if( field_type == field_U ) submessage%displs = submessage%displs + u_pos( points%elt )
      if( field_type == field_Z ) submessage%displs = submessage%displs + z_pos( points%elt )
      allocate( submessage%sign( points%npoints ) )
      if( vector ) then ! For U fields only
        submessage%sign(:) = (/( domain(ind_loc)%edge_assign_sign(points%elt(k)-1, points%i(k), points%j(k)), &
                                 k=1, points%npoints )/)
      else
        submessage%sign(:) = 1
      endif
    end function

    ! Add element to array, and reallocate if necessary
    subroutine array_append_submessage( a, a_size, elt )
      type(t_submessage), allocatable, intent(inout) :: a(:)
      integer, intent(inout) :: a_size
      type(t_submessage), intent(in) :: elt
      type(t_submessage), allocatable :: a_tmp(:)
      integer, parameter :: GROW_FACTOR = 2

      if( size( a ) <= a_size ) then
        allocate( a_tmp( a_size * GROW_FACTOR ) )
        a_tmp(1:a_size) = a(1:a_size)
        call move_alloc(a_tmp, a)
      end if
      a_size = a_size + 1
      a(a_size) = elt
    end subroutine
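
    ! The same growable-array helper is duplicated below for t_local_submessage :
    ! Fortran has no generic containers, so the code is simply repeated for each type.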
    ! Add element to array, and reallocate if necessary
    subroutine array_append_local_submessage( a, a_size, elt )
      type(t_local_submessage), allocatable, intent(inout) :: a(:)
      integer, intent(inout) :: a_size
      type(t_local_submessage), intent(in) :: elt
      type(t_local_submessage), allocatable :: a_tmp(:)
      integer, parameter :: GROW_FACTOR = 2

      if( size( a ) <= a_size ) then
        allocate( a_tmp( a_size * GROW_FACTOR ) )
        a_tmp(1:a_size) = a(1:a_size)
        call move_alloc(a_tmp, a)
      end if
      a_size = a_size + 1
      a(a_size) = elt
    end subroutine
    ! I beg forgiveness from the god of copy-paste, for I have sinned
  end subroutine

  subroutine message_create_ondevice(message)
    use mpi_mod
    use mpipara, only : comm_icosa
    type(t_message), intent(inout) :: message
    type(t_submessage), pointer :: submessage
    integer :: i, ierr

    if( message%ondevice ) call dynamico_abort("Message already on device")

    !$acc enter data copyin(message) async
    !$acc enter data copyin(message%message_in(:)) async
    do i = 1, size( message%message_in )
      !$acc enter data copyin(message%message_in(i)%buff(:,:,:)) async
      !$acc enter data copyin(message%message_in(i)%displs(:)) async
      !$acc enter data copyin(message%message_in(i)%sign(:)) async
    end do
    !$acc enter data copyin(message%message_out(:)) async
    do i = 1, size( message%message_out )
      !$acc enter data copyin(message%message_out(i)%buff(:,:,:)) async
      !$acc enter data copyin(message%message_out(i)%displs(:)) async
      !!$acc enter data copyin(message%message_out(i)%sign(:)) async
    end do
    !$acc enter data copyin(message%message_local(:)) async
    do i = 1, size( message%message_local )
      !$acc enter data copyin(message%message_local(i)%displ_src(:)) async
      !$acc enter data copyin(message%message_local(i)%displ_dest(:)) async
      !$acc enter data copyin(message%message_local(i)%sign(:)) async
    end do
    !$acc enter data copyin(message%field(:)) async
    do i = 1, ndomain
      !$acc enter data copyin(message%field(i)%rval4d(:,:,:)) async
    end do

    ! Re-create the persistent requests on the device addresses of the buffers
    do i = 1, size(message%message_out)
      submessage => message%message_out(i)
      call MPI_Request_free(message%mpi_requests_out(i), ierr)
      !$acc host_data use_device(submessage%buff)
      call MPI_Send_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), &
                          submessage%remote_ind_glo+ndomain_glo*domloc_glo_ind(submessage%ind_loc), &
                          comm_icosa, message%mpi_requests_out(i), ierr )
      !$acc end host_data
    end do
    do i = 1, size(message%message_in)
      submessage => message%message_in(i)
      call MPI_Request_free(message%mpi_requests_in(i), ierr)
      !$acc host_data use_device(submessage%buff)
      call MPI_Recv_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), &
                          domloc_glo_ind(submessage%ind_loc)+ndomain_glo*submessage%remote_ind_glo, &
                          comm_icosa, message%mpi_requests_in(i), ierr )
      !$acc end host_data
    end do

    message%ondevice = .true.
    !!$acc update device(message%ondevice)
  end subroutine
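
  ! Note on device messages : message_create_ondevice above copies the submessage buffers and
  ! index arrays to the device and re-creates the persistent MPI requests on device buffer
  ! addresses (host_data use_device), which presumably requires a GPU-aware MPI library.
  ! message_delete_ondevice below only releases the device copies ; the requests themselves
  ! are freed in finalize_message.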
  subroutine message_delete_ondevice(message)
    type(t_message), intent(inout) :: message
    integer :: i

    if( .not. message%ondevice ) call dynamico_abort("Message not on device")

    do i = 1, size( message%message_in )
      !$acc exit data delete(message%message_in(i)%buff(:,:,:)) async
      !$acc exit data delete(message%message_in(i)%displs(:)) async
      !$acc exit data delete(message%message_in(i)%sign(:)) async
    end do
    !$acc exit data delete(message%message_in(:)) async
    do i = 1, size( message%message_out )
      !$acc exit data delete(message%message_out(i)%buff(:,:,:)) async
      !$acc exit data delete(message%message_out(i)%displs(:)) async
      !!$acc exit data delete(message%message_out(i)%sign(:)) async
    end do
    !$acc exit data delete(message%message_out(:)) async
    do i = 1, size( message%message_local )
      !$acc exit data delete(message%message_local(i)%displ_src(:)) async
      !$acc exit data delete(message%message_local(i)%displ_dest(:)) async
      !$acc exit data delete(message%message_local(i)%sign(:)) async
    end do
    !$acc exit data delete(message%message_local(:)) async
    do i = 1, ndomain
      !$acc exit data delete(message%field(i)%rval4d(:,:,:)) async
    end do
    !$acc exit data delete(message%field(:)) async
    !$acc exit data delete(message) async

    message%ondevice = .false.
  end subroutine

  subroutine finalize_message(message)
    type(t_message), intent(inout) :: message
    integer :: i, ierr

    !$omp barrier
    !$omp master
    if( message%send_seq /= message%wait_seq ) call dynamico_abort("No matching wait_message before finalization")

    if( message%ondevice ) call message_delete_ondevice(message)
    deallocate(message%message_in)
    deallocate(message%message_out)
    deallocate(message%message_local)
    do i = 1, size(message%mpi_requests_in)
      call MPI_Request_free(message%mpi_requests_in(i), ierr)
    end do
    deallocate(message%mpi_requests_in)
    do i = 1, size(message%mpi_requests_out)
      call MPI_Request_free(message%mpi_requests_out(i), ierr)
    end do
    deallocate(message%mpi_requests_out)
    !$omp end master
    !$omp barrier
  end subroutine

  subroutine send_message(field, message)
    use mpi_mod
    use domain_mod, only : assigned_domain
    use omp_para, only : distrib_level
    type(t_field), pointer :: field(:)
    type(t_message), target :: message
    integer :: i, k, d3, d4
    integer :: ierr, d3_begin, d3_end, dim4

    call enter_profile(profile_mpi)

    ! Needed because rval4d is set in init_message
    if( .not. associated( message%field, field ) ) &
      call dynamico_abort("send_message must be called with the same field used in init_message")

    ! Prepare 'message' for on-device copies if field is on device
    !$omp master
    if( field(1)%ondevice .and. .not. message%ondevice ) call message_create_ondevice(message)
    if( field(1)%ondevice .neqv. message%ondevice ) &
      call dynamico_abort("send_message : internal device/host memory synchronization error")
    ! Check that the previous message has been waited on
    if( message%send_seq /= message%wait_seq ) &
      call dynamico_abort("No matching wait_message before new send_message")
    message%send_seq = message%send_seq + 1
    !$omp end master

    call enter_profile(profile_mpi_barrier)
    !$omp barrier
    call exit_profile(profile_mpi_barrier)

    dim4 = size(message%field(1)%rval4d, 3)
    CALL distrib_level( 1, size(message%field(1)%rval4d, 2), d3_begin, d3_end )

    call enter_profile(profile_mpi_copies)
    !$acc data present(message) async if(message%ondevice)
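    ! Two copy phases follow : halos bound for remote ranks are packed into the MPI buffers,
    ! then purely local halos are copied directly from the source field to the destination field.
    ! Levels are distributed over OpenMP threads (distrib_level) and the loops run on the device
    ! through OpenACC when the message is on device.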
    ! Halo to Buffer : copy outbound message to MPI buffers
    !$acc parallel loop default(none) present(message, assigned_domain) async if(message%ondevice)
    do i = 1, size( message%message_out )
      if( assigned_domain( message%message_out(i)%ind_loc ) ) then
        !$acc loop collapse(2)
        do d4 = 1, dim4
          do d3 = d3_begin, d3_end
            !$acc loop
            do k = 1, size(message%message_out(i)%displs)
              message%message_out(i)%buff(k,d3,d4) = &
                message%field(message%message_out(i)%ind_loc)%rval4d(message%message_out(i)%displs(k),d3,d4)
            end do
          end do
        end do
      endif
    end do

    ! Halo to Halo : copy local messages from source field to destination field
    !$acc parallel loop default(none) present(message, assigned_domain) async if(message%ondevice)
    do i = 1, size( message%message_local )
      if( assigned_domain( message%message_local(i)%dest_ind_loc ) ) then
        !$acc loop collapse(2)
        do d4 = 1, dim4
          do d3 = d3_begin, d3_end
            ! Cannot collapse because size(displ_dest) varies with i
            !$acc loop vector
            do k = 1, size(message%message_local(i)%displ_dest)
              message%field(message%message_local(i)%dest_ind_loc)%rval4d(message%message_local(i)%displ_dest(k),d3,d4) = &
                message%message_local(i)%sign(k) * &
                message%field(message%message_local(i)%src_ind_loc)%rval4d(message%message_local(i)%displ_src(k),d3,d4)
            end do
          end do
        end do
      endif
    end do
    !$acc end data
    call exit_profile(profile_mpi_copies)

    call enter_profile(profile_mpi_barrier)
    !$acc wait
    !$omp barrier
    call exit_profile(profile_mpi_barrier)

    !$omp master
    call MPI_Startall( size(message%mpi_requests_out), message%mpi_requests_out, ierr )
    call MPI_Startall( size(message%mpi_requests_in), message%mpi_requests_in, ierr )
    !$omp end master

    call exit_profile(profile_mpi)
  end subroutine

  subroutine test_message(message)
    use mpi_mod
    type(t_message) :: message
    integer :: ierr
    logical :: completed

    !$omp master
    call MPI_Testall( size(message%mpi_requests_out), message%mpi_requests_out, completed, MPI_STATUSES_IGNORE, ierr )
    call MPI_Testall( size(message%mpi_requests_in), message%mpi_requests_in, completed, MPI_STATUSES_IGNORE, ierr )
    !$omp end master
  end subroutine

  subroutine wait_message(message)
    use mpi_mod
    use domain_mod, only : assigned_domain
    use omp_para, only : distrib_level
    type(t_message), target :: message
    integer :: d3_begin, d3_end, dim4
    integer :: i, ierr, k, d3, d4

    ! Check if message has been sent and not received yet
    ! note : barrier needed between this and send_seq increment, and this and wait_seq increment
    ! note : watch out for integer overflow, a = b+1 doesn't imply a>b
    if( message%send_seq /= message%wait_seq+1 ) then
      print *, "WARNING : wait_message called multiple times for one send_message, skipping"
      return ! Don't receive message if already received
    end if

    call enter_profile(profile_mpi)
    CALL distrib_level( 1, size(message%field(1)%rval4d, 2), d3_begin, d3_end )

    call enter_profile(profile_mpi_waitall)
    !$omp master
    call MPI_Waitall( size(message%mpi_requests_out), message%mpi_requests_out, MPI_STATUSES_IGNORE, ierr )
    call MPI_Waitall( size(message%mpi_requests_in), message%mpi_requests_in, MPI_STATUSES_IGNORE, ierr )
    !$omp end master
    call exit_profile(profile_mpi_waitall)

    call enter_profile(profile_mpi_barrier)
    !$omp barrier
    call exit_profile(profile_mpi_barrier)

    call enter_profile(profile_mpi_copies)
    !$acc data present(message) async if(message%ondevice)
    dim4 = size(message%field(1)%rval4d, 3)
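    ! The unpack below multiplies by the per-point sign so that, for vector requests,
    ! received edge values follow the edge orientation of the receiving domain.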
    ! Buffer to Halo : copy inbound message to field
    !$acc parallel loop default(none) present(message, assigned_domain) async if(message%ondevice)
    do i = 1, size( message%message_in )
      if( assigned_domain( message%message_in(i)%ind_loc ) ) then
        !$acc loop collapse(2)
        do d4 = 1, dim4
          do d3 = d3_begin, d3_end
            !$acc loop
            do k = 1, size(message%message_in(i)%displs)
              message%field(message%message_in(i)%ind_loc)%rval4d(message%message_in(i)%displs(k),d3,d4) = &
                message%message_in(i)%sign(k)*message%message_in(i)%buff(k,d3,d4)
            end do
          end do
        end do
      endif
    end do
    !$acc end data
    call exit_profile(profile_mpi_copies)

    !$omp master
    message%wait_seq = message%wait_seq + 1
    !$omp end master

    call enter_profile(profile_mpi_barrier)
    !$omp barrier
    call exit_profile(profile_mpi_barrier)
    call exit_profile(profile_mpi)
  end subroutine
end module