Context Navigation

mod_oasis_reprosum.F90

Last change on this file was 6331, checked in by aclsce, 17 months ago
Moved oasis-mct_5.0 in oasis3-mct/branches directory.
File size: 60.9 KB

Rev	Line
[6331]	1
	2	!> OASIS reproducible sum method from P. Worley
	3
	4	MODULE mod_oasis_reprosum
	5
	6	!-----------------------------------------------------------------------
	7	!
	8	! Purpose:
	9	!> Compute reproducible global sums of a set of arrays across an MPI
	10	!> subcommunicator
	11	!
	12	! Methods:
	13	!> Compute using either or both a scalable, reproducible algorithm and a
	14	!> scalable, nonreproducible algorithm:
	15	!> * Reproducible (scalable):
	16	!> Convert to fixed point (integer vector representation) to enable
	17	!> reproducibility when using MPI_Allreduce
	18	!> * Alternative usually reproducible (scalable):
	19	!> Use parallel double-double algorithm due to Helen He and
	20	!> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
	21	!> * Nonreproducible (scalable):
	22	!> Floating point and MPI_Allreduce based.
	23	!> If computing both reproducible and nonreproducible sums, compare
	24	!> these and report relative difference (if absolute difference
	25	!> less than sum) or absolute difference back to calling routine.
	26	!
	27	! Author: P. Worley (based on suggestions from J. White for fixed
	28	! point algorithm and on He/Ding paper for ddpdd
	29	! algorithm)
	30	!
	31	! Modified by T.Craig for Oasis, May 2017 based on the public version in
	32	! CESM1.2.
	33	!
	34	!-----------------------------------------------------------------------
	35
	36	!-----------------------------------------------------------------------
	37	!- use statements ------------------------------------------------------
	38	!-----------------------------------------------------------------------
	39	#if ( defined noI8 )
	40	! Workaround for when shr_kind_i8 is not supported.
	41	use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i4_p
	42	#else
	43	use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i8_p
	44	#endif
	45	use mod_oasis_data, only: nulprt
	46	use mod_oasis_sys, only: oasis_abort, estr
	47	use mod_oasis_mpi, only: oasis_mpi_barrier
	48	use mod_oasis_timer, only: oasis_timer_start, oasis_timer_stop
	49	use, intrinsic :: iso_c_binding
	50
	51	!-----------------------------------------------------------------------
	52	!- module boilerplate --------------------------------------------------
	53	!-----------------------------------------------------------------------
	54	implicit none
	55	private
	56
	57	!-----------------------------------------------------------------------
	58	!- include statements --------------------------------------------------
	59	!-----------------------------------------------------------------------
	60	#include <mpif.h>
	61
	62	!-----------------------------------------------------------------------
	63	! Public interfaces ----------------------------------------------------
	64	!-----------------------------------------------------------------------
	65	public :: &
	66	oasis_reprosum_setopts, &! set runtime options
	67	oasis_reprosum_calc, &! calculate distributed sum
	68	oasis_reprosum_tolExceeded ! utility function to check relative
	69	! differences against the tolerance
	70
	71	!-----------------------------------------------------------------------
	72	! Public data ----------------------------------------------------------
	73	!-----------------------------------------------------------------------
	74	logical, public :: oasis_reprosum_recompute = .false.
	75
	76	real(r8), public :: oasis_reprosum_reldiffmax = -1.0_r8
	77
	78	!-----------------------------------------------------------------------
	79	! Private interfaces ---------------------------------------------------
	80	!-----------------------------------------------------------------------
	81	private :: &
	82	ddpdd, &! double-double sum routine
	83	split_indices ! split indices among OMP threads
	84
	85	!-----------------------------------------------------------------------
	86	! Private data ---------------------------------------------------------
	87	!-----------------------------------------------------------------------
	88
	89	logical :: repro_sum_use_ddpdd = .false.
	90	logical :: detailed_timing = .false.
	91
	92	!-----------------------------------------------------------------------
	93	! External C bindings --------------------------------------------------
	94	!-----------------------------------------------------------------------
	95
	96	interface
	97	subroutine shr_reprosumx86_fix_start(old_cw) bind(C)
	98	use iso_c_binding, only: c_int
	99	integer(c_int), intent(out) :: old_cw
	100	end subroutine shr_reprosumx86_fix_start
	101
	102	subroutine shr_reprosumx86_fix_end(old_cw) bind(C)
	103	use iso_c_binding, only: c_int
	104	integer(c_int), intent(in) :: old_cw
	105	end subroutine shr_reprosumx86_fix_end
	106	end interface
	107
	108	CONTAINS
	109
	110	!========================================================================
	111	!-----------------------------------------------------------------------
	112	! Purpose:
	113	!> Set runtime options
	114	! Author: P. Worley
	115	!-----------------------------------------------------------------------
	116
	117	subroutine oasis_reprosum_setopts(repro_sum_use_ddpdd_in, &
	118	repro_sum_rel_diff_max_in, &
	119	repro_sum_recompute_in, &
	120	repro_sum_master, &
	121	repro_sum_logunit )
	122
	123	!------------------------------Arguments--------------------------------
	124	logical, intent(in), optional :: repro_sum_use_ddpdd_in
	125	!< Use DDPDD algorithm instead of fixed precision algorithm
	126	real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
	127	!< maximum permissible difference between reproducible and
	128	!< nonreproducible sums
	129	logical, intent(in), optional :: repro_sum_recompute_in
	130	!< recompute using different algorithm when difference between
	131	!< reproducible and nonreproducible sums is too great
	132	logical, intent(in), optional :: repro_sum_master
	133	!< flag indicating whether this process should output
	134	!< log messages
	135	integer, intent(in), optional :: repro_sum_logunit
	136	!< unit number for log messages
	137	!---------------------------Local Workspace-----------------------------
	138	integer llogunit ! unit number for log messages
	139	logical master ! local master?
	140	logical,save :: firstcall = .true. ! first call
	141	character(len=*),parameter :: subname = '(oasis_reprosum_setopts)'
	142	!-----------------------------------------------------------------------
	143
	144	if ( present(repro_sum_master) ) then
	145	master = repro_sum_master
	146	else
	147	master = .false.
	148	endif
	149
	150	if ( present(repro_sum_logunit) ) then
	151	llogunit = repro_sum_logunit
	152	else
	153	llogunit = nulprt
	154	endif
	155
	156	if (.not. firstcall) then
	157	write(llogunit,*) subname,estr,' can only be called once'
	158	call oasis_abort(file=__FILE__,line=__LINE__)
	159	endif
	160	firstcall = .false.
	161
	162	if ( present(repro_sum_use_ddpdd_in) ) then
	163	repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
	164	endif
	165	if ( present(repro_sum_rel_diff_max_in) ) then
	166	oasis_reprosum_reldiffmax = repro_sum_rel_diff_max_in
	167	endif
	168	if ( present(repro_sum_recompute_in) ) then
	169	oasis_reprosum_recompute = repro_sum_recompute_in
	170	endif
	171	if (master) then
	172	if ( repro_sum_use_ddpdd ) then
	173	write(llogunit,*) subname, &
	174	'Using double-double-based (scalable) usually reproducible ', &
	175	'distributed sum algorithm'
	176	else
	177	write(llogunit,*) subname, &
	178	'Using fixed-point-based (scalable) reproducible ', &
	179	'distributed sum algorithm'
	180	endif
	181
	182	if (oasis_reprosum_reldiffmax >= 0._r8) then
	183	write(llogunit,*) subname, &
	184	' with a maximum relative error tolerance of ', &
	185	oasis_reprosum_reldiffmax
	186	if (oasis_reprosum_recompute) then
	187	write(llogunit,*) subname, &
	188	'If tolerance exceeded, sum is recomputed using ', &
	189	'a serial algorithm.'
	190	else
	191	write(llogunit,*) subname, &
	192	'If tolerance exceeded, fixed-precision is sum used ', &
	193	'but a warning is output.'
	194	endif
	195	else
	196	write(llogunit,*) subname, &
	197	'and not comparing with floating point algorithms.'
	198	endif
	199
	200	endif
	201	end subroutine oasis_reprosum_setopts
	202
	203	!========================================================================
	204
	205	!
	206	! Purpose:
	207	!> Compute the global sum of each field in "arr" using the indicated
	208	!> communicator with a reproducible yet scalable implementation based
	209	!> on a fixed point algorithm. An alternative is to use an "almost
	210	!> always reproducible" floating point algorithm.
	211	!
	212	! The accuracy of the fixed point algorithm is controlled by the
	213	! number of "levels" of integer expansion. The algorithm will calculate
	214	! the number of levels that is required for the sum to be essentially
	215	! exact. The optional parameter arr_max_levels can be used to override
	216	! the calculated value. The optional parameter arr_max_levels_out can be
	217	! used to return the values used.
	218	!
	219	! The algorithm also requires an upper bound on
	220	! the maximum summand (in absolute value) for each field, and will
	221	! calculate this internally. However, if the optional parameters
	222	! arr_max_levels and arr_gbl_max are both set, then the algorithm will
	223	! use the values in arr_gbl_max for the upper bounds instead. If these
	224	! are not upper bounds, or if the upper bounds are not tight enough
	225	! to achieve the requisite accuracy, and if the optional parameter
	226	! repro_sum_validate is NOT set to .false., the algorithm will repeat the
	227	! computation with appropriate upper bounds. If only arr_gbl_max is present,
	228	! then the maxima are computed internally (and the specified values are
	229	! ignored). The optional parameter arr_gbl_max_out can be
	230	! used to return the values used.
	231	!
	232	! Finally, the algorithm requires an upper bound on the number of
	233	! local summands across all processes. This will be calculated internally,
	234	! using an MPI collective, but the value in the optional argument
	235	! gbl_max_nsummands will be used instead if (1) it is present, (2)
	236	! it is > 0, and (3) the maximum value and required number of levels
	237	! are also specified. (If the maximum value is calculated, the same
	238	! MPI collective is used to determine the maximum number of local
	239	! summands.) The accuracy of the user-specified value is not checked.
	240	! However, if set to < 1, the value will instead be calculated. If the
	241	! optional parameter gbl_max_nsummands_out is present, then the value
	242	! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
	243	! returned.
	244	!
	245	! If requested (by setting oasis_reprosum_reldiffmax >= 0.0 and passing in
	246	! the optional rel_diff parameter), results are compared with a
	247	! nonreproducible floating point algorithm.
	248	!
	249	! Note that the cost of the algorithm is not strongly correlated with
	250	! the number of levels, which primarily shows up as a (modest) increase
	251	! in cost of the MPI_Allreduce as a function of vector length. Rather the
	252	! cost is more a function of (a) the number of integers required to
	253	! represent an individual summand and (b) the number of MPI_Allreduce
	254	! calls. The number of integers required to represent an individual
	255	! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
	256	! when the number of local summands is not too large. As the number of
	257	! local summands increases, the number of integers required increases.
	258	! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
	259	! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
	260	! correctly). When specifying arr_max_levels and arr_gbl_max
	261	! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
	262	!
	263	! The alternative algorithm is a minor modification of a parallel
	264	! implementation of David Bailey's routine DDPDD by Helen He
	265	! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
	266	! precision summation of double precision values with 10 double
	267	! precision operations. The advantage of this algorithm is that
	268	! it requires a single MPI_Allreduce and is less expensive per summand
	269	! than is the fixed precision algorithm. The disadvantage is that it
	270	! is not guaranteed to be reproducible (though it is reproducible
	271	! much more often than is the standard algorithm). This alternative
	272	! is used when the optional parameter ddpdd_sum is set to .true. It is
	273	! also used if the fixed precision algorithm radix assumption does not
	274	! hold.
	275
	276	!----------------------------------------------------------------------
	277	subroutine oasis_reprosum_calc (arr, arr_gsum, nsummands, dsummands, &
	278	nflds, ddpdd_sum, &
	279	arr_gbl_max, arr_gbl_max_out, &
	280	arr_max_levels, arr_max_levels_out, &
	281	gbl_max_nsummands, gbl_max_nsummands_out,&
	282	gbl_count, repro_sum_validate, &
	283	repro_sum_stats, rel_diff, commid )
	284	!----------------------------------------------------------------------
	285
	286	! Arguments
	287
	288	integer, intent(in) :: nsummands !< number of local summands
	289	integer, intent(in) :: dsummands !< declared first dimension
	290	integer, intent(in) :: nflds !< number of fields
	291	real(r8), intent(in) :: arr(dsummands,nflds)
	292	!< input array
	293
	294	real(r8), intent(out):: arr_gsum(nflds)
	295	!< global means
	296
	297	logical, intent(in), optional :: ddpdd_sum
	298	!< use ddpdd algorithm instead
	299	!< of fixed precision algorithm
	300
	301	real(r8), intent(in), optional :: arr_gbl_max(nflds)
	302	!< upper bound on max(abs(arr))
	303
	304	real(r8), intent(out), optional :: arr_gbl_max_out(nflds)
	305	!< calculated upper bound on
	306	!< max(abs(arr))
	307
	308	integer, intent(in), optional :: arr_max_levels(nflds)
	309	!< maximum number of levels of
	310	!< integer expansion to use
	311
	312	integer, intent(out), optional :: arr_max_levels_out(nflds)
	313	!< output of number of levels of
	314	!< integer expansion to used
	315
	316	integer, intent(in), optional :: gbl_max_nsummands
	317	!< maximum of nsummand over all
	318	!< processes
	319
	320	integer, intent(out), optional :: gbl_max_nsummands_out
	321	!< calculated maximum nsummands
	322	!< over all processes
	323
	324	integer, intent(in), optional :: gbl_count
	325	!< was total number of summands;
	326	!< now is ignored; use
	327	!< gbl_max_nsummands instead
	328
	329	logical, intent(in), optional :: repro_sum_validate
	330	!< flag enabling/disabling testing that gmax and max_levels are
	331	!< accurate/sufficient. Default is enabled.
	332
	333	integer, intent(inout), optional :: repro_sum_stats(5)
	334	!< increment running totals for
	335	!< (1) one-reduction repro_sum
	336	!< (2) two-reduction repro_sum
	337	!< (3) both types in one call
	338	!< (4) nonrepro_sum
	339	!< (5) global max nsummands reduction
	340
	341	real(r8), intent(out), optional :: rel_diff(2,nflds)
	342	!< relative and absolute
	343	!< differences between fixed
	344	!< and floating point sums
	345
	346	integer, intent(in), optional :: commid
	347	!< MPI communicator
	348
	349	! Local workspace
	350
	351	logical :: use_ddpdd_sum ! flag indicating whether to
	352	! use oasis_reprosum_ddpdd or not
	353	logical :: recompute ! flag indicating need to
	354	! determine gmax/gmin before
	355	! computing sum
	356	logical :: validate ! flag indicating need to
	357	! verify gmax and max_levels
	358	! are accurate/sufficient
	359	integer :: omp_nthreads ! number of OpenMP threads
	360	integer :: mpi_comm ! MPI subcommunicator
	361	integer :: tasks ! number of MPI processes
	362	integer :: ierr ! MPI error return
	363	integer :: ifld, isum, ithread ! loop variables
	364	integer :: max_nsummands ! max nsummands over all processes
	365	! or threads (used in both ways)
	366
	367	integer, allocatable :: isum_beg(:), isum_end(:)
	368	! range of summand indices for each
	369	! OpenMP thread
	370	integer, allocatable :: arr_tlmin_exp(:,:)
	371	! per thread local exponent minima
	372	integer, allocatable :: arr_tlmax_exp(:,:)
	373	! per thread local exponent maxima
	374	integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
	375	! summand exponent and working min/max
	376	integer :: arr_lmin_exp(nflds) ! local exponent minima
	377	integer :: arr_lmax_exp(nflds) ! local exponent maxima
	378	integer :: arr_lextremes(0:nflds,2)! local exponent extrema
	379	integer :: arr_gextremes(0:nflds,2)! global exponent extrema
	380
	381	integer :: arr_gmax_exp(nflds) ! global exponents maxima
	382	integer :: arr_gmin_exp(nflds) ! global exponents minima
	383	integer :: arr_max_shift ! maximum safe exponent for
	384	! value < 1 (so that sum does
	385	! not overflow)
	386	integer :: max_levels(nflds) ! maximum number of levels of
	387	! integer expansion to use
	388	integer :: max_level ! maximum value in max_levels
	389	integer :: gbl_max_red ! global max local sum reduction? (0/1)
	390	integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1)
	391	integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1)
	392	integer :: repro_sum_both ! both fast and slow? (0/1)
	393	integer :: nonrepro_sum ! nonrepro_sum? (0/1)
	394
	395	real(r8) :: xmax_nsummands ! dble of max_nsummands
	396	real(r8) :: arr_lsum(nflds) ! local sums
	397	real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using
	398	! fast, nonreproducible,
	399	! floating point alg.
	400	real(r8) :: abs_diff ! absolute difference between
	401	! fixed and floating point
	402	! sums
	403	#ifdef _OPENMP
	404	integer omp_get_max_threads
	405	external omp_get_max_threads
	406	#endif
	407	character(len=*),parameter :: subname = '(oasis_reprosum_calc)'
	408
	409	!-----------------------------------------------------------------------
	410
	411	! check whether should use oasis_reprosum_ddpdd algorithm
	412	use_ddpdd_sum = repro_sum_use_ddpdd
	413	if ( present(ddpdd_sum) ) then
	414	use_ddpdd_sum = ddpdd_sum
	415	endif
	416
	417	! check whether intrinsic-based algorithm will work on this system
	418	! (requires floating point and integer bases to be the same)
	419	! If not, always use ddpdd.
	420	use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
	421
	422	! initialize local statistics variables
	423	gbl_max_red = 0
	424	repro_sum_fast = 0
	425	repro_sum_slow = 0
	426	repro_sum_both = 0
	427	nonrepro_sum = 0
	428
	429	! set MPI communicator
	430	if ( present(commid) ) then
	431	mpi_comm = commid
	432	else
	433	mpi_comm = MPI_COMM_WORLD
	434	endif
	435
	436	if (detailed_timing) then
	437	call oasis_timer_start('oasis_reprosum_prebarrier')
	438	call oasis_mpi_barrier(mpi_comm,subname)
	439	call oasis_timer_stop ('oasis_reprosum_prebarrier')
	440	endif
	441
	442	if ( use_ddpdd_sum ) then
	443
	444	if (detailed_timing) call oasis_timer_start('oasis_reprosum_ddpdd')
	445
	446	call oasis_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
	447	nflds, mpi_comm)
	448	repro_sum_fast = 1
	449
	450	if (detailed_timing) call oasis_timer_stop('oasis_reprosum_ddpdd')
	451
	452	else
	453
	454	if (detailed_timing) call oasis_timer_start('oasis_reprosum_int')
	455
	456	! get number of MPI tasks
	457	call mpi_comm_size(mpi_comm, tasks, ierr)
	458
	459	! get number of OpenMP threads
	460	#ifdef _OPENMP
	461	omp_nthreads = omp_get_max_threads()
	462	#else
	463	omp_nthreads = 1
	464	#endif
	465
	466	! see if have sufficient information to not require max/min allreduce
	467	recompute = .true.
	468	validate = .false.
	469	if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
	470	recompute = .false.
	471
	472	! setting lower bound on max_level*nflds to be 64 to improve OpenMP
	473	! performance for loopb in oasis_reprosum_int
	474	max_level = (64/nflds) + 1
	475	do ifld=1,nflds
	476	if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
	477	(arr_max_levels(ifld) > 0)) then
	478
	479	arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld))
	480	if (max_level < arr_max_levels(ifld)) &
	481	max_level = arr_max_levels(ifld)
	482
	483	else
	484	recompute = .true.
	485	endif
	486	enddo
	487
	488	if (.not. recompute) then
	489
	490	! determine maximum number of summands in local phases of the
	491	! algorithm
	492	if (detailed_timing) call oasis_timer_start("repro_sum_allr_max")
	493	if ( present(gbl_max_nsummands) ) then
	494	if (gbl_max_nsummands < 1) then
	495	call mpi_allreduce (nsummands, max_nsummands, 1, &
	496	MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
	497	gbl_max_red = 1
	498	else
	499	max_nsummands = gbl_max_nsummands
	500	endif
	501	else
	502	call mpi_allreduce (nsummands, max_nsummands, 1, &
	503	MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
	504	gbl_max_red = 1
	505	endif
	506	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_max")
	507
	508	! determine maximum shift. Shift needs to be small enough that summation
	509	! does not exceed maximum number of digits in i8.
	510
	511	! if requested, return max_nsummands before it is redefined
	512	if ( present( gbl_max_nsummands_out) ) then
	513	gbl_max_nsummands_out = max_nsummands
	514	endif
	515
	516	! summing within each thread first
	517	max_nsummands = (max_nsummands/omp_nthreads) + 1
	518	! then over threads and tasks
	519	max_nsummands = max(max_nsummands, tasks*omp_nthreads)
	520
	521	xmax_nsummands = dble(max_nsummands)
	522	arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
	523	if (arr_max_shift < 2) then
	524	write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm'
	525	call oasis_abort(file=__FILE__,line=__LINE__)
	526	endif
	527
	528	! calculate sum
	529	if (present(repro_sum_validate)) then
	530	validate = repro_sum_validate
	531	else
	532	validate = .true.
	533	endif
	534	call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
	535	nflds, arr_max_shift, arr_gmax_exp, &
	536	arr_max_levels, max_level, validate, &
	537	recompute, omp_nthreads, mpi_comm)
	538
	539	! record statistics, etc.
	540	repro_sum_fast = 1
	541	if (recompute) then
	542	repro_sum_both = 1
	543	else
	544	! if requested, return specified levels and upper bounds on maxima
	545	if ( present(arr_max_levels_out) ) then
	546	do ifld=1,nflds
	547	arr_max_levels_out(ifld) = arr_max_levels(ifld)
	548	enddo
	549	endif
	550	if ( present(arr_gbl_max_out) ) then
	551	do ifld=1,nflds
	552	arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
	553	enddo
	554	endif
	555	endif
	556	endif
	557	endif
	558
	559	! do not have sufficient information; calculate global max/min and
	560	! use to compute required number of levels
	561	if (recompute) then
	562
	563	! record statistic
	564	repro_sum_slow = 1
	565
	566	! determine maximum and minimum (non-zero) summand values and
	567	! maximum number of local summands
	568
	569	! allocate thread-specific work space
	570	allocate(arr_tlmax_exp(nflds,omp_nthreads))
	571	allocate(arr_tlmin_exp(nflds,omp_nthreads))
	572	allocate(isum_beg(omp_nthreads))
	573	allocate(isum_end(omp_nthreads))
	574
	575	! split summand index range over OpenMP threads
	576	call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
	577
	578	!$omp parallel do &
	579	!$omp default(shared) &
	580	!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
	581	do ithread=1,omp_nthreads
	582	if (detailed_timing) call oasis_timer_start('repro_sum_loopa')
	583	do ifld=1,nflds
	584	arr_exp_tlmin = MAXEXPONENT(1._r8)
	585	arr_exp_tlmax = MINEXPONENT(1._r8)
	586	do isum=isum_beg(ithread),isum_end(ithread)
	587	if (arr(isum,ifld) .ne. 0.0_r8) then
	588	arr_exp = exponent(arr(isum,ifld))
	589	arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
	590	arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
	591	endif
	592	end do
	593	arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
	594	arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
	595	end do
	596	if (detailed_timing) call oasis_timer_stop('repro_sum_loopa')
	597	end do
	598
	599	do ifld=1,nflds
	600	arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
	601	arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
	602	end do
	603	deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
	604
	605	arr_lextremes(0,:) = -nsummands
	606	arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
	607	arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
	608	if (detailed_timing) call oasis_timer_start("repro_sum_allr_minmax")
	609	call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
	610	MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
	611	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_minmax")
	612	max_nsummands = -arr_gextremes(0,1)
	613	arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
	614	arr_gmin_exp(:) = arr_gextremes(1:nflds,2)
	615
	616	! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
	617	! and arr_gmax_exp still equals MINEXPONENT. In this case, set
	618	! arr_gmin_exp = arr_gmax_exp = MINEXPONENT
	619	do ifld=1,nflds
	620	arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
	621	enddo
	622
	623	! if requested, return upper bounds on observed maxima
	624	if ( present(arr_gbl_max_out) ) then
	625	do ifld=1,nflds
	626	arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
	627	enddo
	628	endif
	629
	630	! if requested, return max_nsummands before it is redefined
	631	if ( present( gbl_max_nsummands_out) ) then
	632	gbl_max_nsummands_out = max_nsummands
	633	endif
	634
	635	! determine maximum shift (same as in previous branch, but with calculated
	636	! max_nsummands). Shift needs to be small enough that summation does not
	637	! exceed maximum number of digits in i8.
	638
	639	! summing within each thread first
	640	max_nsummands = (max_nsummands/omp_nthreads) + 1
	641	! then over threads and tasks
	642	max_nsummands = max(max_nsummands, tasks*omp_nthreads)
	643
	644	xmax_nsummands = dble(max_nsummands)
	645	arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
	646	if (arr_max_shift < 2) then
	647	write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm'
	648	call oasis_abort(file=__FILE__,line=__LINE__)
	649	endif
	650
	651	! determine maximum number of levels required for each field
	652	! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
	653	! + 1 because first truncation probably does not involve a maximal shift
	654	! + 1 to guarantee that the integer division rounds up (not down)
	655	! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
	656	! performance for loopb in oasis_reprosum_int)
	657	max_level = (64/nflds) + 1
	658	do ifld=1,nflds
	659	max_levels(ifld) = 2 + &
	660	((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) &
	661	/ arr_max_shift)
	662	if ( present(arr_max_levels) .and. (.not. validate) ) then
	663	! if validate true, then computation with arr_max_levels failed
	664	! previously
	665	if ( arr_max_levels(ifld) > 0 ) then
	666	max_levels(ifld) = &
	667	min(arr_max_levels(ifld),max_levels(ifld))
	668	endif
	669	endif
	670	if (max_level < max_levels(ifld)) &
	671	max_level = max_levels(ifld)
	672	enddo
	673
	674	! if requested, return calculated levels
	675	if ( present(arr_max_levels_out) ) then
	676	do ifld=1,nflds
	677	arr_max_levels_out(ifld) = max_levels(ifld)
	678	enddo
	679	endif
	680
	681	! calculate sum
	682	validate = .false.
	683	call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
	684	arr_max_shift, arr_gmax_exp, max_levels, &
	685	max_level, validate, recompute, &
	686	omp_nthreads, mpi_comm)
	687
	688	endif
	689
	690	if (detailed_timing) call oasis_timer_stop('oasis_reprosum_int')
	691
	692	endif
	693
	694	! compare fixed and floating point results
	695	if ( present(rel_diff) ) then
	696	if (oasis_reprosum_reldiffmax >= 0.0_r8) then
	697
	698	if (detailed_timing) then
	699	call oasis_timer_start('oasis_nonreprosum_prebarrier')
	700	call oasis_mpi_barrier(mpi_comm,subname)
	701	call oasis_timer_stop ('oasis_nonreprosum_prebarrier')
	702	endif
	703
	704	if (detailed_timing) call oasis_timer_start('nonrepro_sum')
	705	! record statistic
	706	nonrepro_sum = 1
	707	! compute nonreproducible sum
	708	arr_lsum(:) = 0._r8
	709	!$omp parallel do &
	710	!$omp default(shared) &
	711	!$omp private(ifld, isum)
	712	do ifld=1,nflds
	713	do isum=1,nsummands
	714	arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
	715	end do
	716	end do
	717
	718	call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
	719	MPI_REAL8, MPI_SUM, mpi_comm, ierr)
	720
	721	if (detailed_timing) call oasis_timer_stop('nonrepro_sum')
	722
	723	! determine differences
	724	!$omp parallel do &
	725	!$omp default(shared) &
	726	!$omp private(ifld, abs_diff)
	727	do ifld=1,nflds
	728	abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
	729	if (abs(arr_gsum(ifld)) > abs_diff) then
	730	rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
	731	else
	732	rel_diff(1,ifld) = abs_diff
	733	endif
	734	rel_diff(2,ifld) = abs_diff
	735	enddo
	736	else
	737	rel_diff(:,:) = 0.0_r8
	738	endif
	739	endif
	740
	741	! return statistics
	742	if ( present(repro_sum_stats) ) then
	743	repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
	744	repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
	745	repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
	746	repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
	747	repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
	748	endif
	749
	750
	751	end subroutine oasis_reprosum_calc
	752
	753	!========================================================================
	754	!
	755	! Purpose:
	756	!> Compute the global sum of each field in "arr" using the indicated
	757	!> communicator with a reproducible yet scalable implementation based
	758	!> on a fixed point algorithm. The accuracy of the fixed point algorithm
	759	!> is controlled by the number of "levels" of integer expansion, the
	760	!> maximum value of which is specified by max_level.
	761	!
	762	!----------------------------------------------------------------------
	763
	764	subroutine oasis_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
	765	arr_max_shift, arr_gmax_exp, max_levels, &
	766	max_level, validate, recompute, &
	767	omp_nthreads, mpi_comm )
	768
	769	!----------------------------------------------------------------------
	770
	771	! Arguments
	772
	773	integer, intent(in) :: nsummands !< number of local summands
	774	integer, intent(in) :: dsummands !< declared first dimension
	775	integer, intent(in) :: nflds !< number of fields
	776	integer, intent(in) :: arr_max_shift !< maximum safe exponent for
	777	!< value < 1 (so that sum
	778	!< does not overflow)
	779	integer, intent(in) :: arr_gmax_exp(nflds)
	780	!< exponents of global maxima
	781	integer, intent(in) :: max_levels(nflds)
	782	!< maximum number of levels
	783	!< of integer expansion
	784	integer, intent(in) :: max_level !< maximum value in
	785	!< max_levels
	786	integer, intent(in) :: omp_nthreads !< number of OpenMP threads
	787	integer, intent(in) :: mpi_comm !< MPI subcommunicator
	788
	789	real(r8), intent(in) :: arr(dsummands,nflds)
	790	!< input array
	791
	792	logical, intent(in):: validate
	793	!< flag indicating that accuracy of solution generated from
	794	!< arr_gmax_exp and max_levels should be tested
	795
	796	logical, intent(out):: recompute
	797	!< flag indicating that either the upper bounds are inaccurate,
	798	!< or max_levels and arr_gmax_exp do not generate accurate
	799	!< enough sums
	800
	801	real(r8), intent(out):: arr_gsum(nflds) !< global means
	802
	803	! Local workspace
	804
	805	integer, parameter :: max_jlevel = &
	806	1 + (digits(0_i8)/digits(0.0_r8))
	807
	808	integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
	809	! integer vector representing local
	810	! sum (per thread, per field)
	811	integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
	812	! integer vector representing local
	813	! sum
	814	integer(i8) :: i8_arr_level ! integer part of summand for current
	815	! expansion level
	816	integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
	817	! integer vector representing global
	818	! sum
	819	integer(i8) :: IX_8 ! integer representation of current
	820	! jlevels of X_8 ('part' of
	821	! i8_arr_gsum_level)
	822	integer(i8) :: i8_sign ! sign global sum
	823	integer(i8) :: i8_radix ! radix for i8 variables
	824
	825	integer :: max_error(nflds,omp_nthreads)
	826	! accurate upper bound on data?
	827	integer :: not_exact(nflds,omp_nthreads)
	828	! max_levels sufficient to
	829	! capture all digits?
	830	integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
	831	! range of summand indices for each
	832	! OpenMP thread
	833	integer :: ifld, isum, ithread
	834	! loop variables
	835	integer :: arr_exp ! exponent of summand
	836	integer :: arr_shift ! exponent used to generate integer
	837	! for current expansion level
	838	integer :: ilevel ! current integer expansion level
	839	integer :: offset(nflds) ! beginning location in
	840	! i8_arr_{g,l}sum_level for integer
	841	! expansion of current ifld
	842	integer :: voffset ! modification to offset used to
	843	! include validation metrics
	844	integer :: ioffset ! offset(ifld)
	845	integer :: jlevel ! number of floating point 'pieces'
	846	! extracted from a given i8 integer
	847	integer :: ierr ! MPI error return
	848	integer :: LX(max_jlevel) ! exponent of X_8 (see below)
	849	integer :: veclth ! total length of i8_arr_lsum_level
	850	integer :: sum_digits ! lower bound on number of significant
	851	! in integer expansion of sum
	852	integer :: curr_exp ! exponent of partial sum during
	853	! reconstruction from integer vector
	854	integer :: corr_exp ! exponent of current summand in
	855	! reconstruction from integer vector
	856
	857	real(r8) :: arr_frac ! fraction of summand
	858	real(r8) :: arr_remainder ! part of summand remaining after
	859	! current level of integer expansion
	860	real(r8) :: X_8(max_jlevel) ! r8 vector representation of current
	861	! i8_arr_gsum_level
	862	real(r8) :: RX_8 ! r8 representation of difference
	863	! between current i8_arr_gsum_level
	864	! and current jlevels of X_8
	865	! (== IX_8). Also used in final
	866	! scaling step
	867
	868	logical :: first ! flag used to indicate that just
	869	! beginning reconstruction of sum
	870	! from integer vector
	871
	872	character(len=*),parameter :: subname = '(oasis_reprosum_int)'
	873
	874	!-----------------------------------------------------------------------
	875	! Save radix of i8 variables in an i8 variable
	876	i8_radix = radix(IX_8)
	877
	878	! If validating upper bounds, reserve space for validation metrics
	879	! In both cases, reserve an extra level for overflow from the top level
	880	if (validate) then
	881	voffset = 3
	882	else
	883	voffset = 1
	884	endif
	885
	886	! compute offsets for each field
	887	offset(1) = voffset
	888	do ifld=2,nflds
	889	offset(ifld) = offset(ifld-1) &
	890	+ (max_levels(ifld-1) + voffset)
	891	enddo
	892	veclth = offset(nflds) + max_levels(nflds)
	893
	894	! split summand index range over OpenMP threads
	895	call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
	896
	897	! convert local summands to vector of integers and sum
	898	! (Using scale instead of set_exponent because arr_remainder may not be
	899	! "normal" after level 1 calculation)
	900	i8_arr_lsum_level(:) = 0_i8
	901
	902	!$omp parallel do &
	903	!$omp default(shared) &
	904	!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, &
	905	!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
	906	do ithread=1,omp_nthreads
	907	if (detailed_timing) call oasis_timer_start('repro_sum_loopb')
	908	do ifld=1,nflds
	909	ioffset = offset(ifld)
	910
	911	max_error(ifld,ithread) = 0
	912	not_exact(ifld,ithread) = 0
	913
	914	i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
	915	do isum=isum_beg(ithread),isum_end(ithread)
	916	arr_remainder = 0.0_r8
	917
	918	if (arr(isum,ifld) .ne. 0.0_r8) then
	919	arr_exp = exponent(arr(isum,ifld))
	920	arr_frac = fraction(arr(isum,ifld))
	921
	922	! test that global maximum upper bound is an upper bound
	923	if (arr_exp > arr_gmax_exp(ifld)) then
	924	max_error(ifld,ithread) = 1
	925	exit
	926	endif
	927
	928	! calculate first shift
	929	arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
	930
	931	! determine first (probably) nonzero level (assuming initial fraction is
	932	! 'normal' - algorithm still works if this is not true)
	933	! NOTE: this is critical; scale will set to zero if min exponent is too small.
	934	if (arr_shift < 1) then
	935	ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
	936	arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
	937
	938	do while (arr_shift < 1)
	939	arr_shift = arr_shift + arr_max_shift
	940	ilevel = ilevel + 1
	941	enddo
	942	else
	943	ilevel = 1
	944	endif
	945
	946	if (ilevel .le. max_levels(ifld)) then
	947	! apply first shift/truncate, add it to the relevant running
	948	! sum, and calculate the remainder.
	949	arr_remainder = scale(arr_frac,arr_shift)
	950	i8_arr_level = int(arr_remainder,i8)
	951	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
	952	i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
	953	arr_remainder = arr_remainder - i8_arr_level
	954
	955	! while the remainder is non-zero, continue to shift, truncate,
	956	! sum, and calculate new remainder
	957	do while ((arr_remainder .ne. 0.0_r8) &
	958	.and. (ilevel < max_levels(ifld)))
	959	ilevel = ilevel + 1
	960	arr_remainder = scale(arr_remainder,arr_max_shift)
	961	i8_arr_level = int(arr_remainder,i8)
	962	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
	963	i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
	964	arr_remainder = arr_remainder - i8_arr_level
	965	enddo
	966
	967	endif
	968	endif
	969
	970	if (arr_remainder .ne. 0.0_r8) then
	971	not_exact(ifld,ithread) = 1
	972	endif
	973
	974	enddo
	975
	976	! postprocess integer vector to eliminate potential for overlap in the following
	977	! sums over threads and processes: if value larger than or equal to
	978	! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
	979	! vector, resulting in nonoverlapping ranges for each component. Note that
	980	! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
	981	! over threads and processes do not overflow for ilevel==1.
	982	do ilevel=max_levels(ifld),1,-1
	983	RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
	984	IX_8 = int(scale(RX_8,-arr_max_shift),i8)
	985	if (IX_8 .ne. 0_i8) then
	986	i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
	987	i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
	988	IX_8 = IX_8(i8_radix*arr_max_shift)
	989	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
	990	i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
	991	endif
	992	enddo
	993	enddo
	994	if (detailed_timing) call oasis_timer_stop('repro_sum_loopb')
	995	enddo
	996
	997	! sum contributions from different threads
	998	do ifld=1,nflds
	999	ioffset = offset(ifld)
	1000	do ithread = 1,omp_nthreads
	1001	do ilevel = 0,max_levels(ifld)
	1002	i8_arr_lsum_level(ioffset+ilevel) = &
	1003	i8_arr_lsum_level(ioffset+ilevel) &
	1004	+ i8_arr_tlsum_level(ilevel,ifld,ithread)
	1005	enddo
	1006	enddo
	1007	enddo
	1008
	1009	! record if upper bound was inaccurate or if level expansion stopped
	1010	! before full accuracy was achieved
	1011	if (validate) then
	1012	do ifld=1,nflds
	1013	ioffset = offset(ifld)
	1014	i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
	1015	i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
	1016	enddo
	1017	endif
	1018
	1019	! sum integer vector element-wise
	1020	#if ( defined noI8 )
	1021	! Workaround for when i8 is not supported.
	1022	if (detailed_timing) call oasis_timer_start("repro_sum_allr_i4")
	1023	call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
	1024	veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
	1025	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i4")
	1026	#else
	1027	if (detailed_timing) call oasis_timer_start("repro_sum_allr_i8")
	1028	call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
	1029	veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
	1030	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i8")
	1031	#endif
	1032
	1033	! Construct global sum from integer vector representation:
	1034	! 1) arr_max_shift is the shift applied to fraction(arr_gmax) .
	1035	! When shifting back, need to "add back in" true arr_gmax exponent. This was
	1036	! removed implicitly by working only with the fraction .
	1037	! 2) want to add levels into sum in reverse order (smallest to largest). However,
	1038	! even this can generate floating point rounding errors if signs of integers
	1039	! alternate. To avoid this, do some arithmetic with integer vectors so that all
	1040	! components have the same sign. This should keep relative difference between
	1041	! using different integer sizes (e.g. i8 and i4) to machine epsilon
	1042	! 3) assignment to X_8 will usually lose accuracy since maximum integer
	1043	! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
	1044	! correction not very large). Calculate remainder and add in first (since
	1045	! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
	1046	! For r4 (24 digits) may need to correct twice. Code is written in a general
	1047	! fashion, to work no matter how many corrections are necessary (assuming
	1048	! max_jlevel parameter calculation is correct).
	1049
	1050	recompute = .false.
	1051	do ifld=1,nflds
	1052	arr_gsum(ifld) = 0.0_r8
	1053	ioffset = offset(ifld)
	1054
	1055	! if validate is .true., test whether the summand upper bound
	1056	! was exceeded on any of the processes
	1057	if (validate) then
	1058	if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
	1059	recompute = .true.
	1060	endif
	1061	endif
	1062
	1063	if (.not. recompute) then
	1064
	1065	! preprocess integer vector:
	1066	! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
	1067	! to next larger integer in vector, resulting in nonoverlapping ranges for each
	1068	! component. Note that have "ilevel-1=0" level here as described above.
	1069	do ilevel=max_levels(ifld),1,-1
	1070	RX_8 = i8_arr_gsum_level(ioffset+ilevel)
	1071	IX_8 = int(scale(RX_8,-arr_max_shift),i8)
	1072	if (IX_8 .ne. 0_i8) then
	1073	i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
	1074	+ IX_8
	1075	IX_8 = IX_8(i8_radix*arr_max_shift)
	1076	i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) &
	1077	- IX_8
	1078	endif
	1079	enddo
	1080	! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
	1081	! so that all vector components have the same sign (eliminating loss
	1082	! of accuracy arising from difference of large values when
	1083	! reconstructing r8 sum from integer vector)
	1084	ilevel = 0
	1085	do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
	1086	.and. (ilevel < max_levels(ifld)))
	1087	ilevel = ilevel + 1
	1088	enddo
	1089	!
	1090	if (ilevel < max_levels(ifld)) then
	1091	if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
	1092	i8_sign = 1_i8
	1093	else
	1094	i8_sign = -1_i8
	1095	endif
	1096	do jlevel=ilevel,max_levels(ifld)-1
	1097	if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
	1098	.ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
	1099	i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) &
	1100	- i8_sign
	1101	i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
	1102	+ i8_sign(i8_radix*arr_max_shift)
	1103	endif
	1104	enddo
	1105	endif
	1106
	1107	! start with maximum shift, and work up to larger values
	1108	arr_shift = arr_gmax_exp(ifld) &
	1109	- max_levels(ifld)*arr_max_shift
	1110	curr_exp = 0
	1111	first = .true.
	1112	do ilevel=max_levels(ifld),0,-1
	1113
	1114	if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
	1115	jlevel = 1
	1116
	1117	! r8 representation of higher order bits in integer
	1118	X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
	1119	LX(jlevel) = exponent(X_8(jlevel))
	1120
	1121	! calculate remainder
	1122	IX_8 = int(X_8(jlevel),i8)
	1123	RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
	1124
	1125	! repeat using remainder
	1126	do while (RX_8 .ne. 0.0_r8)
	1127	jlevel = jlevel + 1
	1128	X_8(jlevel) = RX_8
	1129	LX(jlevel) = exponent(RX_8)
	1130	IX_8 = IX_8 + int(RX_8,i8)
	1131	RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
	1132	enddo
	1133
	1134	! add in contributions, smaller to larger, rescaling for each
	1135	! addition to guarantee that exponent of working summand is always
	1136	! larger than minexponent
	1137	do while (jlevel > 0)
	1138	if (first) then
	1139	curr_exp = LX(jlevel) + arr_shift
	1140	arr_gsum(ifld) = fraction(X_8(jlevel))
	1141	first = .false.
	1142	else
	1143	corr_exp = curr_exp - (LX(jlevel) + arr_shift)
	1144	arr_gsum(ifld) = fraction(X_8(jlevel)) &
	1145	+ scale(arr_gsum(ifld),corr_exp)
	1146	curr_exp = LX(jlevel) + arr_shift
	1147	endif
	1148	jlevel = jlevel - 1
	1149	enddo
	1150
	1151	endif
	1152
	1153	arr_shift = arr_shift + arr_max_shift
	1154	enddo
	1155
	1156	! apply final exponent correction, scaling first if exponent is too small
	1157	! to apply directly
	1158	corr_exp = curr_exp + exponent(arr_gsum(ifld))
	1159	if (corr_exp .ge. MINEXPONENT(1._r8)) then
	1160	arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
	1161	else
	1162	RX_8 = set_exponent(arr_gsum(ifld), &
	1163	corr_exp-MINEXPONENT(1._r8))
	1164	arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
	1165	endif
	1166
	1167	! if validate is .true. and some precision lost, test whether 'too much'
	1168	! was lost, due to too loose an upper bound, too stringent a limit on number
	1169	! of levels of expansion, cancellation, .... Calculated by comparing lower
	1170	! bound on number of sigificant digits with number of digits in 1.0_r8 .
	1171	if (validate) then
	1172	if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
	1173
	1174	! find first nonzero level and use exponent for this level, then assume all
	1175	! subsequent levels contribute arr_max_shift digits.
	1176	sum_digits = 0
	1177	do ilevel=0,max_levels(ifld)
	1178	if (sum_digits .eq. 0) then
	1179	if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
	1180	X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
	1181	LX(1) = exponent(X_8(1))
	1182	sum_digits = LX(1)
	1183	endif
	1184	else
	1185	sum_digits = sum_digits + arr_max_shift
	1186	endif
	1187	enddo
	1188
	1189	if (sum_digits < digits(1.0_r8)) then
	1190	recompute = .true.
	1191	endif
	1192	endif
	1193	endif
	1194
	1195	endif
	1196
	1197	enddo
	1198
	1199
	1200	end subroutine oasis_reprosum_int
	1201
	1202	!========================================================================
	1203	!
	1204	! Purpose:
	1205	!> Test whether distributed sum exceeds tolerance and print out a
	1206	!> warning message.
	1207	!
	1208	!----------------------------------------------------------------------
	1209
	1210	logical function oasis_reprosum_tolExceeded (name, nflds, master, &
	1211	logunit, rel_diff )
	1212	!----------------------------------------------------------------------
	1213
	1214	! Arguments
	1215
	1216	character(len=*), intent(in) :: name !< distributed sum identifier
	1217	integer, intent(in) :: nflds !< number of fields
	1218	logical, intent(in) :: master !< process that will write
	1219	!< warning messages?
	1220	integer, optional, intent(in) :: logunit!< unit warning messages
	1221	!< written to
	1222	real(r8), intent(in) :: rel_diff(2,nflds)
	1223	!< relative and absolute
	1224	!< differences between fixed
	1225	!< and floating point sums
	1226
	1227	! Local workspace
	1228
	1229	integer :: llogunit ! local log unit
	1230	integer :: ifld ! field index
	1231	integer :: exceeds_limit ! number of fields whose
	1232	! sum exceeds tolerance
	1233	real(r8) :: max_rel_diff ! maximum relative difference
	1234	integer :: max_rel_diff_idx ! field index for max. rel. diff.
	1235	real(r8) :: max_abs_diff ! maximum absolute difference
	1236	integer :: max_abs_diff_idx ! field index for max. abs. diff.
	1237	character(len=*),parameter :: subname = '(oasis_reprosum_tolExceeded)'
	1238
	1239	!-----------------------------------------------------------------------
	1240
	1241	oasis_reprosum_tolExceeded = .false.
	1242	if (oasis_reprosum_reldiffmax < 0.0_r8) return
	1243
	1244	if ( present(logunit) ) then
	1245	llogunit = logunit
	1246	else
	1247	llogunit = nulprt
	1248	endif
	1249
	1250	! check that "fast" reproducible sum is accurate enough.
	1251	exceeds_limit = 0
	1252	max_rel_diff = 0.0_r8
	1253	max_abs_diff = 0.0_r8
	1254	do ifld=1,nflds
	1255	if (rel_diff(1,ifld) > oasis_reprosum_reldiffmax) then
	1256	exceeds_limit = exceeds_limit + 1
	1257	if (rel_diff(1,ifld) > max_rel_diff) then
	1258	max_rel_diff = rel_diff(1,ifld)
	1259	max_rel_diff_idx = ifld
	1260	endif
	1261	if (rel_diff(2,ifld) > max_abs_diff) then
	1262	max_abs_diff = rel_diff(2,ifld)
	1263	max_abs_diff_idx = ifld
	1264	endif
	1265	endif
	1266	enddo
	1267
	1268	if (exceeds_limit > 0) then
	1269	if (master) then
	1270	write(llogunit,*) subname,trim(name), &
	1271	': difference in fixed and floating point sums ', &
	1272	' exceeds tolerance in ', exceeds_limit, &
	1273	' fields.'
	1274	write(llogunit,*) subname,' Maximum relative diff: (rel)', &
	1275	rel_diff(1,max_rel_diff_idx), ' (abs) ', &
	1276	rel_diff(2,max_rel_diff_idx)
	1277	write(llogunit,*) subname,' Maximum absolute diff: (rel)', &
	1278	rel_diff(1,max_abs_diff_idx), ' (abs) ', &
	1279	rel_diff(2,max_abs_diff_idx)
	1280	endif
	1281	oasis_reprosum_tolExceeded = .true.
	1282	endif
	1283
	1284
	1285	end function oasis_reprosum_tolExceeded
	1286
	1287	!========================================================================
	1288	!
	1289	! Purpose:
	1290	!> Compute the global sum of each field in "arr" using the indicated
	1291	!> communicator with a reproducible yet scalable implementation based
	1292	!> on He and Ding's implementation of the double-double algorithm.
	1293	!
	1294	!----------------------------------------------------------------------
	1295
	1296	subroutine oasis_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, &
	1297	nflds, mpi_comm )
	1298	!----------------------------------------------------------------------
	1299
	1300	! Arguments
	1301
	1302	integer, intent(in) :: nsummands !< number of local summands
	1303	integer, intent(in) :: dsummands !< declared first dimension
	1304	integer, intent(in) :: nflds !< number of fields
	1305	real(r8), intent(in) :: arr(dsummands,nflds)
	1306	!< input array
	1307	integer, intent(in) :: mpi_comm !< MPI subcommunicator
	1308
	1309	real(r8), intent(out):: arr_gsum(nflds)
	1310	!< global sums
	1311
	1312
	1313	! Local workspace
	1314
	1315	integer :: old_cw ! for x86 processors, save
	1316	! current arithmetic mode
	1317	integer :: ifld, isum ! loop variables
	1318	integer :: ierr ! MPI error return
	1319
	1320	real(r8) :: e, t1, t2 ! temporaries
	1321	complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double
	1322	! format)
	1323	complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double
	1324	! format)
	1325
	1326	integer, save :: mpi_sumdd
	1327	logical, save :: first_time = .true.
	1328	character(len=*),parameter :: subname = '(oasis_reprosum_ddpdd)'
	1329
	1330	!-----------------------------------------------------------------------
	1331
	1332	call shr_reprosumx86_fix_start (old_cw)
	1333
	1334	if (first_time) then
	1335	call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
	1336	first_time = .false.
	1337	endif
	1338
	1339	do ifld=1,nflds
	1340	arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
	1341
	1342	do isum=1,nsummands
	1343
	1344	! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
	1345	! trick.
	1346	t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
	1347	e = t1 - arr(isum,ifld)
	1348	t2 = ((real(arr_lsum_dd(ifld)) - e) &
	1349	+ (arr(isum,ifld) - (t1 - e))) &
	1350	+ aimag(arr_lsum_dd(ifld))
	1351
	1352	! The result is t1 + t2, after normalization.
	1353	arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
	1354	enddo
	1355
	1356	enddo
	1357
	1358	call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
	1359	MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
	1360	do ifld=1,nflds
	1361	arr_gsum(ifld) = real(arr_gsum_dd(ifld))
	1362	enddo
	1363
	1364	call shr_reprosumx86_fix_end (old_cw)
	1365
	1366	end subroutine oasis_reprosum_ddpdd
	1367
	1368	!-----------------------------------------------------------------------
	1369
	1370	subroutine DDPDD (dda, ddb, len, itype)
	1371	!----------------------------------------------------------------------
	1372	!
	1373	! Purpose:
	1374	! Modification of original codes written by David H. Bailey
	1375	! This subroutine computes ddb(i) = dda(i)+ddb(i)
	1376	!
	1377	!----------------------------------------------------------------------
	1378
	1379	! Arguments
	1380
	1381	integer, intent(in) :: len ! array length
	1382	complex(r8), intent(in) :: dda(len) ! input
	1383	complex(r8), intent(inout) :: ddb(len) ! result
	1384	integer, intent(in) :: itype ! unused
	1385
	1386	! Local workspace
	1387
	1388	real(r8) e, t1, t2
	1389	integer i
	1390	character(len=*),parameter :: subname = '(oasis_reprosum_mod:DDPDD)'
	1391
	1392	!-----------------------------------------------------------------------
	1393
	1394	do i = 1, len
	1395	! Compute dda + ddb using Knuth's trick.
	1396	t1 = real(dda(i)) + real(ddb(i))
	1397	e = t1 - real(dda(i))
	1398	t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
	1399	+ aimag(dda(i)) + aimag(ddb(i))
	1400
	1401	! The result is t1 + t2, after normalization.
	1402	ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
	1403	enddo
	1404
	1405
	1406	end subroutine DDPDD
	1407
	1408	!-----------------------------------------------------------------------
	1409
	1410	subroutine split_indices(total,num_pieces,ibeg,iend)
	1411	!----------------------------------------------------------------------
	1412	!
	1413	! Purpose:
	1414	! Split range into 'num_pieces'
	1415	!
	1416	!----------------------------------------------------------------------
	1417
	1418	! Arguments
	1419
	1420	integer, intent(in) :: total
	1421	integer, intent(in) :: num_pieces
	1422	integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
	1423
	1424	! Local workspace
	1425
	1426	integer :: itmp1, itmp2, ioffset, i
	1427	character(len=*),parameter :: subname = '(oasis_reprosum_mod:split_indices)'
	1428
	1429	!-----------------------------------------------------------------------
	1430
	1431	itmp1 = total/num_pieces
	1432	itmp2 = mod(total,num_pieces)
	1433	ioffset = 0
	1434	do i=1,itmp2
	1435	ibeg(i) = ioffset + 1
	1436	iend(i) = ioffset + (itmp1+1)
	1437	ioffset = iend(i)
	1438	enddo
	1439	do i=itmp2+1,num_pieces
	1440	ibeg(i) = ioffset + 1
	1441	if (ibeg(i) > total) then
	1442	iend(i) = ibeg(i) - 1
	1443	else
	1444	iend(i) = ioffset + itmp1
	1445	ioffset = iend(i)
	1446	endif
	1447	enddo
	1448
	1449	end subroutine split_indices
	1450
	1451	!========================================================================
	1452
	1453	end module mod_oasis_reprosum

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CPL/oasis3-mct/branches/OASIS3-MCT_5.0_branch/lib/psmile/src/mod_oasis_reprosum.F90

Download in other formats: