Context Navigation

mod_oasis_reprosum.F90

Last change on this file was 6331, checked in by aclsce, 17 months ago
Moved oasis-mct_5.0 in oasis3-mct/branches directory.
File size: 60.9 KB

Line
1
2	!> OASIS reproducible sum method from P. Worley
3
4	MODULE mod_oasis_reprosum
5
6	!-----------------------------------------------------------------------
7	!
8	! Purpose:
9	!> Compute reproducible global sums of a set of arrays across an MPI
10	!> subcommunicator
11	!
12	! Methods:
13	!> Compute using either or both a scalable, reproducible algorithm and a
14	!> scalable, nonreproducible algorithm:
15	!> * Reproducible (scalable):
16	!> Convert to fixed point (integer vector representation) to enable
17	!> reproducibility when using MPI_Allreduce
18	!> * Alternative usually reproducible (scalable):
19	!> Use parallel double-double algorithm due to Helen He and
20	!> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm
21	!> * Nonreproducible (scalable):
22	!> Floating point and MPI_Allreduce based.
23	!> If computing both reproducible and nonreproducible sums, compare
24	!> these and report relative difference (if absolute difference
25	!> less than sum) or absolute difference back to calling routine.
26	!
27	! Author: P. Worley (based on suggestions from J. White for fixed
28	! point algorithm and on He/Ding paper for ddpdd
29	! algorithm)
30	!
31	! Modified by T.Craig for Oasis, May 2017 based on the public version in
32	! CESM1.2.
33	!
34	!-----------------------------------------------------------------------
35
36	!-----------------------------------------------------------------------
37	!- use statements ------------------------------------------------------
38	!-----------------------------------------------------------------------
39	#if ( defined noI8 )
40	! Workaround for when shr_kind_i8 is not supported.
41	use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i4_p
42	#else
43	use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i8_p
44	#endif
45	use mod_oasis_data, only: nulprt
46	use mod_oasis_sys, only: oasis_abort, estr
47	use mod_oasis_mpi, only: oasis_mpi_barrier
48	use mod_oasis_timer, only: oasis_timer_start, oasis_timer_stop
49	use, intrinsic :: iso_c_binding
50
51	!-----------------------------------------------------------------------
52	!- module boilerplate --------------------------------------------------
53	!-----------------------------------------------------------------------
54	implicit none
55	private
56
57	!-----------------------------------------------------------------------
58	!- include statements --------------------------------------------------
59	!-----------------------------------------------------------------------
60	#include <mpif.h>
61
62	!-----------------------------------------------------------------------
63	! Public interfaces ----------------------------------------------------
64	!-----------------------------------------------------------------------
65	public :: &
66	oasis_reprosum_setopts, &! set runtime options
67	oasis_reprosum_calc, &! calculate distributed sum
68	oasis_reprosum_tolExceeded ! utility function to check relative
69	! differences against the tolerance
70
71	!-----------------------------------------------------------------------
72	! Public data ----------------------------------------------------------
73	!-----------------------------------------------------------------------
74	logical, public :: oasis_reprosum_recompute = .false.
75
76	real(r8), public :: oasis_reprosum_reldiffmax = -1.0_r8
77
78	!-----------------------------------------------------------------------
79	! Private interfaces ---------------------------------------------------
80	!-----------------------------------------------------------------------
81	private :: &
82	ddpdd, &! double-double sum routine
83	split_indices ! split indices among OMP threads
84
85	!-----------------------------------------------------------------------
86	! Private data ---------------------------------------------------------
87	!-----------------------------------------------------------------------
88
89	logical :: repro_sum_use_ddpdd = .false.
90	logical :: detailed_timing = .false.
91
92	!-----------------------------------------------------------------------
93	! External C bindings --------------------------------------------------
94	!-----------------------------------------------------------------------
95
96	interface
97	subroutine shr_reprosumx86_fix_start(old_cw) bind(C)
98	use iso_c_binding, only: c_int
99	integer(c_int), intent(out) :: old_cw
100	end subroutine shr_reprosumx86_fix_start
101
102	subroutine shr_reprosumx86_fix_end(old_cw) bind(C)
103	use iso_c_binding, only: c_int
104	integer(c_int), intent(in) :: old_cw
105	end subroutine shr_reprosumx86_fix_end
106	end interface
107
108	CONTAINS
109
110	!========================================================================
111	!-----------------------------------------------------------------------
112	! Purpose:
113	!> Set runtime options
114	! Author: P. Worley
115	!-----------------------------------------------------------------------
116
117	subroutine oasis_reprosum_setopts(repro_sum_use_ddpdd_in, &
118	repro_sum_rel_diff_max_in, &
119	repro_sum_recompute_in, &
120	repro_sum_master, &
121	repro_sum_logunit )
122
123	!------------------------------Arguments--------------------------------
124	logical, intent(in), optional :: repro_sum_use_ddpdd_in
125	!< Use DDPDD algorithm instead of fixed precision algorithm
126	real(r8), intent(in), optional :: repro_sum_rel_diff_max_in
127	!< maximum permissible difference between reproducible and
128	!< nonreproducible sums
129	logical, intent(in), optional :: repro_sum_recompute_in
130	!< recompute using different algorithm when difference between
131	!< reproducible and nonreproducible sums is too great
132	logical, intent(in), optional :: repro_sum_master
133	!< flag indicating whether this process should output
134	!< log messages
135	integer, intent(in), optional :: repro_sum_logunit
136	!< unit number for log messages
137	!---------------------------Local Workspace-----------------------------
138	integer llogunit ! unit number for log messages
139	logical master ! local master?
140	logical,save :: firstcall = .true. ! first call
141	character(len=*),parameter :: subname = '(oasis_reprosum_setopts)'
142	!-----------------------------------------------------------------------
143
144	if ( present(repro_sum_master) ) then
145	master = repro_sum_master
146	else
147	master = .false.
148	endif
149
150	if ( present(repro_sum_logunit) ) then
151	llogunit = repro_sum_logunit
152	else
153	llogunit = nulprt
154	endif
155
156	if (.not. firstcall) then
157	write(llogunit,*) subname,estr,' can only be called once'
158	call oasis_abort(file=__FILE__,line=__LINE__)
159	endif
160	firstcall = .false.
161
162	if ( present(repro_sum_use_ddpdd_in) ) then
163	repro_sum_use_ddpdd = repro_sum_use_ddpdd_in
164	endif
165	if ( present(repro_sum_rel_diff_max_in) ) then
166	oasis_reprosum_reldiffmax = repro_sum_rel_diff_max_in
167	endif
168	if ( present(repro_sum_recompute_in) ) then
169	oasis_reprosum_recompute = repro_sum_recompute_in
170	endif
171	if (master) then
172	if ( repro_sum_use_ddpdd ) then
173	write(llogunit,*) subname, &
174	'Using double-double-based (scalable) usually reproducible ', &
175	'distributed sum algorithm'
176	else
177	write(llogunit,*) subname, &
178	'Using fixed-point-based (scalable) reproducible ', &
179	'distributed sum algorithm'
180	endif
181
182	if (oasis_reprosum_reldiffmax >= 0._r8) then
183	write(llogunit,*) subname, &
184	' with a maximum relative error tolerance of ', &
185	oasis_reprosum_reldiffmax
186	if (oasis_reprosum_recompute) then
187	write(llogunit,*) subname, &
188	'If tolerance exceeded, sum is recomputed using ', &
189	'a serial algorithm.'
190	else
191	write(llogunit,*) subname, &
192	'If tolerance exceeded, fixed-precision is sum used ', &
193	'but a warning is output.'
194	endif
195	else
196	write(llogunit,*) subname, &
197	'and not comparing with floating point algorithms.'
198	endif
199
200	endif
201	end subroutine oasis_reprosum_setopts
202
203	!========================================================================
204
205	!
206	! Purpose:
207	!> Compute the global sum of each field in "arr" using the indicated
208	!> communicator with a reproducible yet scalable implementation based
209	!> on a fixed point algorithm. An alternative is to use an "almost
210	!> always reproducible" floating point algorithm.
211	!
212	! The accuracy of the fixed point algorithm is controlled by the
213	! number of "levels" of integer expansion. The algorithm will calculate
214	! the number of levels that is required for the sum to be essentially
215	! exact. The optional parameter arr_max_levels can be used to override
216	! the calculated value. The optional parameter arr_max_levels_out can be
217	! used to return the values used.
218	!
219	! The algorithm also requires an upper bound on
220	! the maximum summand (in absolute value) for each field, and will
221	! calculate this internally. However, if the optional parameters
222	! arr_max_levels and arr_gbl_max are both set, then the algorithm will
223	! use the values in arr_gbl_max for the upper bounds instead. If these
224	! are not upper bounds, or if the upper bounds are not tight enough
225	! to achieve the requisite accuracy, and if the optional parameter
226	! repro_sum_validate is NOT set to .false., the algorithm will repeat the
227	! computation with appropriate upper bounds. If only arr_gbl_max is present,
228	! then the maxima are computed internally (and the specified values are
229	! ignored). The optional parameter arr_gbl_max_out can be
230	! used to return the values used.
231	!
232	! Finally, the algorithm requires an upper bound on the number of
233	! local summands across all processes. This will be calculated internally,
234	! using an MPI collective, but the value in the optional argument
235	! gbl_max_nsummands will be used instead if (1) it is present, (2)
236	! it is > 0, and (3) the maximum value and required number of levels
237	! are also specified. (If the maximum value is calculated, the same
238	! MPI collective is used to determine the maximum number of local
239	! summands.) The accuracy of the user-specified value is not checked.
240	! However, if set to < 1, the value will instead be calculated. If the
241	! optional parameter gbl_max_nsummands_out is present, then the value
242	! used (gbl_max_nsummands if >= 1; calculated otherwise) will be
243	! returned.
244	!
245	! If requested (by setting oasis_reprosum_reldiffmax >= 0.0 and passing in
246	! the optional rel_diff parameter), results are compared with a
247	! nonreproducible floating point algorithm.
248	!
249	! Note that the cost of the algorithm is not strongly correlated with
250	! the number of levels, which primarily shows up as a (modest) increase
251	! in cost of the MPI_Allreduce as a function of vector length. Rather the
252	! cost is more a function of (a) the number of integers required to
253	! represent an individual summand and (b) the number of MPI_Allreduce
254	! calls. The number of integers required to represent an individual
255	! summand is 1 or 2 when using 8-byte integers for 8-byte real summands
256	! when the number of local summands is not too large. As the number of
257	! local summands increases, the number of integers required increases.
258	! The number of MPI_Allreduce calls is either 2 (specifying nothing) or
259	! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max
260	! correctly). When specifying arr_max_levels and arr_gbl_max
261	! incorrectly, 3 or 4 MPI_Allreduce calls will be required.
262	!
263	! The alternative algorithm is a minor modification of a parallel
264	! implementation of David Bailey's routine DDPDD by Helen He
265	! and Chris Ding. Bailey uses the Knuth trick to implement quadruple
266	! precision summation of double precision values with 10 double
267	! precision operations. The advantage of this algorithm is that
268	! it requires a single MPI_Allreduce and is less expensive per summand
269	! than is the fixed precision algorithm. The disadvantage is that it
270	! is not guaranteed to be reproducible (though it is reproducible
271	! much more often than is the standard algorithm). This alternative
272	! is used when the optional parameter ddpdd_sum is set to .true. It is
273	! also used if the fixed precision algorithm radix assumption does not
274	! hold.
275
276	!----------------------------------------------------------------------
277	subroutine oasis_reprosum_calc (arr, arr_gsum, nsummands, dsummands, &
278	nflds, ddpdd_sum, &
279	arr_gbl_max, arr_gbl_max_out, &
280	arr_max_levels, arr_max_levels_out, &
281	gbl_max_nsummands, gbl_max_nsummands_out,&
282	gbl_count, repro_sum_validate, &
283	repro_sum_stats, rel_diff, commid )
284	!----------------------------------------------------------------------
285
286	! Arguments
287
288	integer, intent(in) :: nsummands !< number of local summands
289	integer, intent(in) :: dsummands !< declared first dimension
290	integer, intent(in) :: nflds !< number of fields
291	real(r8), intent(in) :: arr(dsummands,nflds)
292	!< input array
293
294	real(r8), intent(out):: arr_gsum(nflds)
295	!< global means
296
297	logical, intent(in), optional :: ddpdd_sum
298	!< use ddpdd algorithm instead
299	!< of fixed precision algorithm
300
301	real(r8), intent(in), optional :: arr_gbl_max(nflds)
302	!< upper bound on max(abs(arr))
303
304	real(r8), intent(out), optional :: arr_gbl_max_out(nflds)
305	!< calculated upper bound on
306	!< max(abs(arr))
307
308	integer, intent(in), optional :: arr_max_levels(nflds)
309	!< maximum number of levels of
310	!< integer expansion to use
311
312	integer, intent(out), optional :: arr_max_levels_out(nflds)
313	!< output of number of levels of
314	!< integer expansion to used
315
316	integer, intent(in), optional :: gbl_max_nsummands
317	!< maximum of nsummand over all
318	!< processes
319
320	integer, intent(out), optional :: gbl_max_nsummands_out
321	!< calculated maximum nsummands
322	!< over all processes
323
324	integer, intent(in), optional :: gbl_count
325	!< was total number of summands;
326	!< now is ignored; use
327	!< gbl_max_nsummands instead
328
329	logical, intent(in), optional :: repro_sum_validate
330	!< flag enabling/disabling testing that gmax and max_levels are
331	!< accurate/sufficient. Default is enabled.
332
333	integer, intent(inout), optional :: repro_sum_stats(5)
334	!< increment running totals for
335	!< (1) one-reduction repro_sum
336	!< (2) two-reduction repro_sum
337	!< (3) both types in one call
338	!< (4) nonrepro_sum
339	!< (5) global max nsummands reduction
340
341	real(r8), intent(out), optional :: rel_diff(2,nflds)
342	!< relative and absolute
343	!< differences between fixed
344	!< and floating point sums
345
346	integer, intent(in), optional :: commid
347	!< MPI communicator
348
349	! Local workspace
350
351	logical :: use_ddpdd_sum ! flag indicating whether to
352	! use oasis_reprosum_ddpdd or not
353	logical :: recompute ! flag indicating need to
354	! determine gmax/gmin before
355	! computing sum
356	logical :: validate ! flag indicating need to
357	! verify gmax and max_levels
358	! are accurate/sufficient
359	integer :: omp_nthreads ! number of OpenMP threads
360	integer :: mpi_comm ! MPI subcommunicator
361	integer :: tasks ! number of MPI processes
362	integer :: ierr ! MPI error return
363	integer :: ifld, isum, ithread ! loop variables
364	integer :: max_nsummands ! max nsummands over all processes
365	! or threads (used in both ways)
366
367	integer, allocatable :: isum_beg(:), isum_end(:)
368	! range of summand indices for each
369	! OpenMP thread
370	integer, allocatable :: arr_tlmin_exp(:,:)
371	! per thread local exponent minima
372	integer, allocatable :: arr_tlmax_exp(:,:)
373	! per thread local exponent maxima
374	integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax
375	! summand exponent and working min/max
376	integer :: arr_lmin_exp(nflds) ! local exponent minima
377	integer :: arr_lmax_exp(nflds) ! local exponent maxima
378	integer :: arr_lextremes(0:nflds,2)! local exponent extrema
379	integer :: arr_gextremes(0:nflds,2)! global exponent extrema
380
381	integer :: arr_gmax_exp(nflds) ! global exponents maxima
382	integer :: arr_gmin_exp(nflds) ! global exponents minima
383	integer :: arr_max_shift ! maximum safe exponent for
384	! value < 1 (so that sum does
385	! not overflow)
386	integer :: max_levels(nflds) ! maximum number of levels of
387	! integer expansion to use
388	integer :: max_level ! maximum value in max_levels
389	integer :: gbl_max_red ! global max local sum reduction? (0/1)
390	integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1)
391	integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1)
392	integer :: repro_sum_both ! both fast and slow? (0/1)
393	integer :: nonrepro_sum ! nonrepro_sum? (0/1)
394
395	real(r8) :: xmax_nsummands ! dble of max_nsummands
396	real(r8) :: arr_lsum(nflds) ! local sums
397	real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using
398	! fast, nonreproducible,
399	! floating point alg.
400	real(r8) :: abs_diff ! absolute difference between
401	! fixed and floating point
402	! sums
403	#ifdef _OPENMP
404	integer omp_get_max_threads
405	external omp_get_max_threads
406	#endif
407	character(len=*),parameter :: subname = '(oasis_reprosum_calc)'
408
409	!-----------------------------------------------------------------------
410
411	! check whether should use oasis_reprosum_ddpdd algorithm
412	use_ddpdd_sum = repro_sum_use_ddpdd
413	if ( present(ddpdd_sum) ) then
414	use_ddpdd_sum = ddpdd_sum
415	endif
416
417	! check whether intrinsic-based algorithm will work on this system
418	! (requires floating point and integer bases to be the same)
419	! If not, always use ddpdd.
420	use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8))
421
422	! initialize local statistics variables
423	gbl_max_red = 0
424	repro_sum_fast = 0
425	repro_sum_slow = 0
426	repro_sum_both = 0
427	nonrepro_sum = 0
428
429	! set MPI communicator
430	if ( present(commid) ) then
431	mpi_comm = commid
432	else
433	mpi_comm = MPI_COMM_WORLD
434	endif
435
436	if (detailed_timing) then
437	call oasis_timer_start('oasis_reprosum_prebarrier')
438	call oasis_mpi_barrier(mpi_comm,subname)
439	call oasis_timer_stop ('oasis_reprosum_prebarrier')
440	endif
441
442	if ( use_ddpdd_sum ) then
443
444	if (detailed_timing) call oasis_timer_start('oasis_reprosum_ddpdd')
445
446	call oasis_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, &
447	nflds, mpi_comm)
448	repro_sum_fast = 1
449
450	if (detailed_timing) call oasis_timer_stop('oasis_reprosum_ddpdd')
451
452	else
453
454	if (detailed_timing) call oasis_timer_start('oasis_reprosum_int')
455
456	! get number of MPI tasks
457	call mpi_comm_size(mpi_comm, tasks, ierr)
458
459	! get number of OpenMP threads
460	#ifdef _OPENMP
461	omp_nthreads = omp_get_max_threads()
462	#else
463	omp_nthreads = 1
464	#endif
465
466	! see if have sufficient information to not require max/min allreduce
467	recompute = .true.
468	validate = .false.
469	if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then
470	recompute = .false.
471
472	! setting lower bound on max_level*nflds to be 64 to improve OpenMP
473	! performance for loopb in oasis_reprosum_int
474	max_level = (64/nflds) + 1
475	do ifld=1,nflds
476	if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. &
477	(arr_max_levels(ifld) > 0)) then
478
479	arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld))
480	if (max_level < arr_max_levels(ifld)) &
481	max_level = arr_max_levels(ifld)
482
483	else
484	recompute = .true.
485	endif
486	enddo
487
488	if (.not. recompute) then
489
490	! determine maximum number of summands in local phases of the
491	! algorithm
492	if (detailed_timing) call oasis_timer_start("repro_sum_allr_max")
493	if ( present(gbl_max_nsummands) ) then
494	if (gbl_max_nsummands < 1) then
495	call mpi_allreduce (nsummands, max_nsummands, 1, &
496	MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
497	gbl_max_red = 1
498	else
499	max_nsummands = gbl_max_nsummands
500	endif
501	else
502	call mpi_allreduce (nsummands, max_nsummands, 1, &
503	MPI_INTEGER, MPI_MAX, mpi_comm, ierr)
504	gbl_max_red = 1
505	endif
506	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_max")
507
508	! determine maximum shift. Shift needs to be small enough that summation
509	! does not exceed maximum number of digits in i8.
510
511	! if requested, return max_nsummands before it is redefined
512	if ( present( gbl_max_nsummands_out) ) then
513	gbl_max_nsummands_out = max_nsummands
514	endif
515
516	! summing within each thread first
517	max_nsummands = (max_nsummands/omp_nthreads) + 1
518	! then over threads and tasks
519	max_nsummands = max(max_nsummands, tasks*omp_nthreads)
520
521	xmax_nsummands = dble(max_nsummands)
522	arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
523	if (arr_max_shift < 2) then
524	write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm'
525	call oasis_abort(file=__FILE__,line=__LINE__)
526	endif
527
528	! calculate sum
529	if (present(repro_sum_validate)) then
530	validate = repro_sum_validate
531	else
532	validate = .true.
533	endif
534	call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, &
535	nflds, arr_max_shift, arr_gmax_exp, &
536	arr_max_levels, max_level, validate, &
537	recompute, omp_nthreads, mpi_comm)
538
539	! record statistics, etc.
540	repro_sum_fast = 1
541	if (recompute) then
542	repro_sum_both = 1
543	else
544	! if requested, return specified levels and upper bounds on maxima
545	if ( present(arr_max_levels_out) ) then
546	do ifld=1,nflds
547	arr_max_levels_out(ifld) = arr_max_levels(ifld)
548	enddo
549	endif
550	if ( present(arr_gbl_max_out) ) then
551	do ifld=1,nflds
552	arr_gbl_max_out(ifld) = arr_gbl_max(ifld)
553	enddo
554	endif
555	endif
556	endif
557	endif
558
559	! do not have sufficient information; calculate global max/min and
560	! use to compute required number of levels
561	if (recompute) then
562
563	! record statistic
564	repro_sum_slow = 1
565
566	! determine maximum and minimum (non-zero) summand values and
567	! maximum number of local summands
568
569	! allocate thread-specific work space
570	allocate(arr_tlmax_exp(nflds,omp_nthreads))
571	allocate(arr_tlmin_exp(nflds,omp_nthreads))
572	allocate(isum_beg(omp_nthreads))
573	allocate(isum_end(omp_nthreads))
574
575	! split summand index range over OpenMP threads
576	call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
577
578	!$omp parallel do &
579	!$omp default(shared) &
580	!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax)
581	do ithread=1,omp_nthreads
582	if (detailed_timing) call oasis_timer_start('repro_sum_loopa')
583	do ifld=1,nflds
584	arr_exp_tlmin = MAXEXPONENT(1._r8)
585	arr_exp_tlmax = MINEXPONENT(1._r8)
586	do isum=isum_beg(ithread),isum_end(ithread)
587	if (arr(isum,ifld) .ne. 0.0_r8) then
588	arr_exp = exponent(arr(isum,ifld))
589	arr_exp_tlmin = min(arr_exp,arr_exp_tlmin)
590	arr_exp_tlmax = max(arr_exp,arr_exp_tlmax)
591	endif
592	end do
593	arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin
594	arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax
595	end do
596	if (detailed_timing) call oasis_timer_stop('repro_sum_loopa')
597	end do
598
599	do ifld=1,nflds
600	arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:))
601	arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:))
602	end do
603	deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end)
604
605	arr_lextremes(0,:) = -nsummands
606	arr_lextremes(1:nflds,1) = -arr_lmax_exp(:)
607	arr_lextremes(1:nflds,2) = arr_lmin_exp(:)
608	if (detailed_timing) call oasis_timer_start("repro_sum_allr_minmax")
609	call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), &
610	MPI_INTEGER, MPI_MIN, mpi_comm, ierr)
611	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_minmax")
612	max_nsummands = -arr_gextremes(0,1)
613	arr_gmax_exp(:) = -arr_gextremes(1:nflds,1)
614	arr_gmin_exp(:) = arr_gextremes(1:nflds,2)
615
616	! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT
617	! and arr_gmax_exp still equals MINEXPONENT. In this case, set
618	! arr_gmin_exp = arr_gmax_exp = MINEXPONENT
619	do ifld=1,nflds
620	arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld))
621	enddo
622
623	! if requested, return upper bounds on observed maxima
624	if ( present(arr_gbl_max_out) ) then
625	do ifld=1,nflds
626	arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld))
627	enddo
628	endif
629
630	! if requested, return max_nsummands before it is redefined
631	if ( present( gbl_max_nsummands_out) ) then
632	gbl_max_nsummands_out = max_nsummands
633	endif
634
635	! determine maximum shift (same as in previous branch, but with calculated
636	! max_nsummands). Shift needs to be small enough that summation does not
637	! exceed maximum number of digits in i8.
638
639	! summing within each thread first
640	max_nsummands = (max_nsummands/omp_nthreads) + 1
641	! then over threads and tasks
642	max_nsummands = max(max_nsummands, tasks*omp_nthreads)
643
644	xmax_nsummands = dble(max_nsummands)
645	arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1)
646	if (arr_max_shift < 2) then
647	write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm'
648	call oasis_abort(file=__FILE__,line=__LINE__)
649	endif
650
651	! determine maximum number of levels required for each field
652	! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift)
653	! + 1 because first truncation probably does not involve a maximal shift
654	! + 1 to guarantee that the integer division rounds up (not down)
655	! (setting lower bound on max_level*nflds to be 64 to improve OpenMP
656	! performance for loopb in oasis_reprosum_int)
657	max_level = (64/nflds) + 1
658	do ifld=1,nflds
659	max_levels(ifld) = 2 + &
660	((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) &
661	/ arr_max_shift)
662	if ( present(arr_max_levels) .and. (.not. validate) ) then
663	! if validate true, then computation with arr_max_levels failed
664	! previously
665	if ( arr_max_levels(ifld) > 0 ) then
666	max_levels(ifld) = &
667	min(arr_max_levels(ifld),max_levels(ifld))
668	endif
669	endif
670	if (max_level < max_levels(ifld)) &
671	max_level = max_levels(ifld)
672	enddo
673
674	! if requested, return calculated levels
675	if ( present(arr_max_levels_out) ) then
676	do ifld=1,nflds
677	arr_max_levels_out(ifld) = max_levels(ifld)
678	enddo
679	endif
680
681	! calculate sum
682	validate = .false.
683	call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, &
684	arr_max_shift, arr_gmax_exp, max_levels, &
685	max_level, validate, recompute, &
686	omp_nthreads, mpi_comm)
687
688	endif
689
690	if (detailed_timing) call oasis_timer_stop('oasis_reprosum_int')
691
692	endif
693
694	! compare fixed and floating point results
695	if ( present(rel_diff) ) then
696	if (oasis_reprosum_reldiffmax >= 0.0_r8) then
697
698	if (detailed_timing) then
699	call oasis_timer_start('oasis_nonreprosum_prebarrier')
700	call oasis_mpi_barrier(mpi_comm,subname)
701	call oasis_timer_stop ('oasis_nonreprosum_prebarrier')
702	endif
703
704	if (detailed_timing) call oasis_timer_start('nonrepro_sum')
705	! record statistic
706	nonrepro_sum = 1
707	! compute nonreproducible sum
708	arr_lsum(:) = 0._r8
709	!$omp parallel do &
710	!$omp default(shared) &
711	!$omp private(ifld, isum)
712	do ifld=1,nflds
713	do isum=1,nsummands
714	arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld)
715	end do
716	end do
717
718	call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, &
719	MPI_REAL8, MPI_SUM, mpi_comm, ierr)
720
721	if (detailed_timing) call oasis_timer_stop('nonrepro_sum')
722
723	! determine differences
724	!$omp parallel do &
725	!$omp default(shared) &
726	!$omp private(ifld, abs_diff)
727	do ifld=1,nflds
728	abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld))
729	if (abs(arr_gsum(ifld)) > abs_diff) then
730	rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld))
731	else
732	rel_diff(1,ifld) = abs_diff
733	endif
734	rel_diff(2,ifld) = abs_diff
735	enddo
736	else
737	rel_diff(:,:) = 0.0_r8
738	endif
739	endif
740
741	! return statistics
742	if ( present(repro_sum_stats) ) then
743	repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast
744	repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow
745	repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both
746	repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum
747	repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red
748	endif
749
750
751	end subroutine oasis_reprosum_calc
752
753	!========================================================================
754	!
755	! Purpose:
756	!> Compute the global sum of each field in "arr" using the indicated
757	!> communicator with a reproducible yet scalable implementation based
758	!> on a fixed point algorithm. The accuracy of the fixed point algorithm
759	!> is controlled by the number of "levels" of integer expansion, the
760	!> maximum value of which is specified by max_level.
761	!
762	!----------------------------------------------------------------------
763
764	subroutine oasis_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, &
765	arr_max_shift, arr_gmax_exp, max_levels, &
766	max_level, validate, recompute, &
767	omp_nthreads, mpi_comm )
768
769	!----------------------------------------------------------------------
770
771	! Arguments
772
773	integer, intent(in) :: nsummands !< number of local summands
774	integer, intent(in) :: dsummands !< declared first dimension
775	integer, intent(in) :: nflds !< number of fields
776	integer, intent(in) :: arr_max_shift !< maximum safe exponent for
777	!< value < 1 (so that sum
778	!< does not overflow)
779	integer, intent(in) :: arr_gmax_exp(nflds)
780	!< exponents of global maxima
781	integer, intent(in) :: max_levels(nflds)
782	!< maximum number of levels
783	!< of integer expansion
784	integer, intent(in) :: max_level !< maximum value in
785	!< max_levels
786	integer, intent(in) :: omp_nthreads !< number of OpenMP threads
787	integer, intent(in) :: mpi_comm !< MPI subcommunicator
788
789	real(r8), intent(in) :: arr(dsummands,nflds)
790	!< input array
791
792	logical, intent(in):: validate
793	!< flag indicating that accuracy of solution generated from
794	!< arr_gmax_exp and max_levels should be tested
795
796	logical, intent(out):: recompute
797	!< flag indicating that either the upper bounds are inaccurate,
798	!< or max_levels and arr_gmax_exp do not generate accurate
799	!< enough sums
800
801	real(r8), intent(out):: arr_gsum(nflds) !< global means
802
803	! Local workspace
804
805	integer, parameter :: max_jlevel = &
806	1 + (digits(0_i8)/digits(0.0_r8))
807
808	integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads)
809	! integer vector representing local
810	! sum (per thread, per field)
811	integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds)
812	! integer vector representing local
813	! sum
814	integer(i8) :: i8_arr_level ! integer part of summand for current
815	! expansion level
816	integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds)
817	! integer vector representing global
818	! sum
819	integer(i8) :: IX_8 ! integer representation of current
820	! jlevels of X_8 ('part' of
821	! i8_arr_gsum_level)
822	integer(i8) :: i8_sign ! sign global sum
823	integer(i8) :: i8_radix ! radix for i8 variables
824
825	integer :: max_error(nflds,omp_nthreads)
826	! accurate upper bound on data?
827	integer :: not_exact(nflds,omp_nthreads)
828	! max_levels sufficient to
829	! capture all digits?
830	integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads)
831	! range of summand indices for each
832	! OpenMP thread
833	integer :: ifld, isum, ithread
834	! loop variables
835	integer :: arr_exp ! exponent of summand
836	integer :: arr_shift ! exponent used to generate integer
837	! for current expansion level
838	integer :: ilevel ! current integer expansion level
839	integer :: offset(nflds) ! beginning location in
840	! i8_arr_{g,l}sum_level for integer
841	! expansion of current ifld
842	integer :: voffset ! modification to offset used to
843	! include validation metrics
844	integer :: ioffset ! offset(ifld)
845	integer :: jlevel ! number of floating point 'pieces'
846	! extracted from a given i8 integer
847	integer :: ierr ! MPI error return
848	integer :: LX(max_jlevel) ! exponent of X_8 (see below)
849	integer :: veclth ! total length of i8_arr_lsum_level
850	integer :: sum_digits ! lower bound on number of significant
851	! in integer expansion of sum
852	integer :: curr_exp ! exponent of partial sum during
853	! reconstruction from integer vector
854	integer :: corr_exp ! exponent of current summand in
855	! reconstruction from integer vector
856
857	real(r8) :: arr_frac ! fraction of summand
858	real(r8) :: arr_remainder ! part of summand remaining after
859	! current level of integer expansion
860	real(r8) :: X_8(max_jlevel) ! r8 vector representation of current
861	! i8_arr_gsum_level
862	real(r8) :: RX_8 ! r8 representation of difference
863	! between current i8_arr_gsum_level
864	! and current jlevels of X_8
865	! (== IX_8). Also used in final
866	! scaling step
867
868	logical :: first ! flag used to indicate that just
869	! beginning reconstruction of sum
870	! from integer vector
871
872	character(len=*),parameter :: subname = '(oasis_reprosum_int)'
873
874	!-----------------------------------------------------------------------
875	! Save radix of i8 variables in an i8 variable
876	i8_radix = radix(IX_8)
877
878	! If validating upper bounds, reserve space for validation metrics
879	! In both cases, reserve an extra level for overflow from the top level
880	if (validate) then
881	voffset = 3
882	else
883	voffset = 1
884	endif
885
886	! compute offsets for each field
887	offset(1) = voffset
888	do ifld=2,nflds
889	offset(ifld) = offset(ifld-1) &
890	+ (max_levels(ifld-1) + voffset)
891	enddo
892	veclth = offset(nflds) + max_levels(nflds)
893
894	! split summand index range over OpenMP threads
895	call split_indices(nsummands, omp_nthreads, isum_beg, isum_end)
896
897	! convert local summands to vector of integers and sum
898	! (Using scale instead of set_exponent because arr_remainder may not be
899	! "normal" after level 1 calculation)
900	i8_arr_lsum_level(:) = 0_i8
901
902	!$omp parallel do &
903	!$omp default(shared) &
904	!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, &
905	!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8)
906	do ithread=1,omp_nthreads
907	if (detailed_timing) call oasis_timer_start('repro_sum_loopb')
908	do ifld=1,nflds
909	ioffset = offset(ifld)
910
911	max_error(ifld,ithread) = 0
912	not_exact(ifld,ithread) = 0
913
914	i8_arr_tlsum_level(:,ifld,ithread) = 0_i8
915	do isum=isum_beg(ithread),isum_end(ithread)
916	arr_remainder = 0.0_r8
917
918	if (arr(isum,ifld) .ne. 0.0_r8) then
919	arr_exp = exponent(arr(isum,ifld))
920	arr_frac = fraction(arr(isum,ifld))
921
922	! test that global maximum upper bound is an upper bound
923	if (arr_exp > arr_gmax_exp(ifld)) then
924	max_error(ifld,ithread) = 1
925	exit
926	endif
927
928	! calculate first shift
929	arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
930
931	! determine first (probably) nonzero level (assuming initial fraction is
932	! 'normal' - algorithm still works if this is not true)
933	! NOTE: this is critical; scale will set to zero if min exponent is too small.
934	if (arr_shift < 1) then
935	ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift
936	arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp)
937
938	do while (arr_shift < 1)
939	arr_shift = arr_shift + arr_max_shift
940	ilevel = ilevel + 1
941	enddo
942	else
943	ilevel = 1
944	endif
945
946	if (ilevel .le. max_levels(ifld)) then
947	! apply first shift/truncate, add it to the relevant running
948	! sum, and calculate the remainder.
949	arr_remainder = scale(arr_frac,arr_shift)
950	i8_arr_level = int(arr_remainder,i8)
951	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
952	i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
953	arr_remainder = arr_remainder - i8_arr_level
954
955	! while the remainder is non-zero, continue to shift, truncate,
956	! sum, and calculate new remainder
957	do while ((arr_remainder .ne. 0.0_r8) &
958	.and. (ilevel < max_levels(ifld)))
959	ilevel = ilevel + 1
960	arr_remainder = scale(arr_remainder,arr_max_shift)
961	i8_arr_level = int(arr_remainder,i8)
962	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
963	i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level
964	arr_remainder = arr_remainder - i8_arr_level
965	enddo
966
967	endif
968	endif
969
970	if (arr_remainder .ne. 0.0_r8) then
971	not_exact(ifld,ithread) = 1
972	endif
973
974	enddo
975
976	! postprocess integer vector to eliminate potential for overlap in the following
977	! sums over threads and processes: if value larger than or equal to
978	! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in
979	! vector, resulting in nonoverlapping ranges for each component. Note that
980	! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums
981	! over threads and processes do not overflow for ilevel==1.
982	do ilevel=max_levels(ifld),1,-1
983	RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread)
984	IX_8 = int(scale(RX_8,-arr_max_shift),i8)
985	if (IX_8 .ne. 0_i8) then
986	i8_arr_tlsum_level(ilevel-1,ifld,ithread) = &
987	i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8
988	IX_8 = IX_8(i8_radix*arr_max_shift)
989	i8_arr_tlsum_level(ilevel,ifld,ithread) = &
990	i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8
991	endif
992	enddo
993	enddo
994	if (detailed_timing) call oasis_timer_stop('repro_sum_loopb')
995	enddo
996
997	! sum contributions from different threads
998	do ifld=1,nflds
999	ioffset = offset(ifld)
1000	do ithread = 1,omp_nthreads
1001	do ilevel = 0,max_levels(ifld)
1002	i8_arr_lsum_level(ioffset+ilevel) = &
1003	i8_arr_lsum_level(ioffset+ilevel) &
1004	+ i8_arr_tlsum_level(ilevel,ifld,ithread)
1005	enddo
1006	enddo
1007	enddo
1008
1009	! record if upper bound was inaccurate or if level expansion stopped
1010	! before full accuracy was achieved
1011	if (validate) then
1012	do ifld=1,nflds
1013	ioffset = offset(ifld)
1014	i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:))
1015	i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:))
1016	enddo
1017	endif
1018
1019	! sum integer vector element-wise
1020	#if ( defined noI8 )
1021	! Workaround for when i8 is not supported.
1022	if (detailed_timing) call oasis_timer_start("repro_sum_allr_i4")
1023	call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1024	veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr)
1025	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i4")
1026	#else
1027	if (detailed_timing) call oasis_timer_start("repro_sum_allr_i8")
1028	call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, &
1029	veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr)
1030	if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i8")
1031	#endif
1032
1033	! Construct global sum from integer vector representation:
1034	! 1) arr_max_shift is the shift applied to fraction(arr_gmax) .
1035	! When shifting back, need to "add back in" true arr_gmax exponent. This was
1036	! removed implicitly by working only with the fraction .
1037	! 2) want to add levels into sum in reverse order (smallest to largest). However,
1038	! even this can generate floating point rounding errors if signs of integers
1039	! alternate. To avoid this, do some arithmetic with integer vectors so that all
1040	! components have the same sign. This should keep relative difference between
1041	! using different integer sizes (e.g. i8 and i4) to machine epsilon
1042	! 3) assignment to X_8 will usually lose accuracy since maximum integer
1043	! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands
1044	! correction not very large). Calculate remainder and add in first (since
1045	! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits).
1046	! For r4 (24 digits) may need to correct twice. Code is written in a general
1047	! fashion, to work no matter how many corrections are necessary (assuming
1048	! max_jlevel parameter calculation is correct).
1049
1050	recompute = .false.
1051	do ifld=1,nflds
1052	arr_gsum(ifld) = 0.0_r8
1053	ioffset = offset(ifld)
1054
1055	! if validate is .true., test whether the summand upper bound
1056	! was exceeded on any of the processes
1057	if (validate) then
1058	if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then
1059	recompute = .true.
1060	endif
1061	endif
1062
1063	if (.not. recompute) then
1064
1065	! preprocess integer vector:
1066	! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap'
1067	! to next larger integer in vector, resulting in nonoverlapping ranges for each
1068	! component. Note that have "ilevel-1=0" level here as described above.
1069	do ilevel=max_levels(ifld),1,-1
1070	RX_8 = i8_arr_gsum_level(ioffset+ilevel)
1071	IX_8 = int(scale(RX_8,-arr_max_shift),i8)
1072	if (IX_8 .ne. 0_i8) then
1073	i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) &
1074	+ IX_8
1075	IX_8 = IX_8(i8_radix*arr_max_shift)
1076	i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) &
1077	- IX_8
1078	endif
1079	enddo
1080	! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary
1081	! so that all vector components have the same sign (eliminating loss
1082	! of accuracy arising from difference of large values when
1083	! reconstructing r8 sum from integer vector)
1084	ilevel = 0
1085	do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) &
1086	.and. (ilevel < max_levels(ifld)))
1087	ilevel = ilevel + 1
1088	enddo
1089	!
1090	if (ilevel < max_levels(ifld)) then
1091	if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then
1092	i8_sign = 1_i8
1093	else
1094	i8_sign = -1_i8
1095	endif
1096	do jlevel=ilevel,max_levels(ifld)-1
1097	if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) &
1098	.ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then
1099	i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) &
1100	- i8_sign
1101	i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) &
1102	+ i8_sign(i8_radix*arr_max_shift)
1103	endif
1104	enddo
1105	endif
1106
1107	! start with maximum shift, and work up to larger values
1108	arr_shift = arr_gmax_exp(ifld) &
1109	- max_levels(ifld)*arr_max_shift
1110	curr_exp = 0
1111	first = .true.
1112	do ilevel=max_levels(ifld),0,-1
1113
1114	if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1115	jlevel = 1
1116
1117	! r8 representation of higher order bits in integer
1118	X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel)
1119	LX(jlevel) = exponent(X_8(jlevel))
1120
1121	! calculate remainder
1122	IX_8 = int(X_8(jlevel),i8)
1123	RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1124
1125	! repeat using remainder
1126	do while (RX_8 .ne. 0.0_r8)
1127	jlevel = jlevel + 1
1128	X_8(jlevel) = RX_8
1129	LX(jlevel) = exponent(RX_8)
1130	IX_8 = IX_8 + int(RX_8,i8)
1131	RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8)
1132	enddo
1133
1134	! add in contributions, smaller to larger, rescaling for each
1135	! addition to guarantee that exponent of working summand is always
1136	! larger than minexponent
1137	do while (jlevel > 0)
1138	if (first) then
1139	curr_exp = LX(jlevel) + arr_shift
1140	arr_gsum(ifld) = fraction(X_8(jlevel))
1141	first = .false.
1142	else
1143	corr_exp = curr_exp - (LX(jlevel) + arr_shift)
1144	arr_gsum(ifld) = fraction(X_8(jlevel)) &
1145	+ scale(arr_gsum(ifld),corr_exp)
1146	curr_exp = LX(jlevel) + arr_shift
1147	endif
1148	jlevel = jlevel - 1
1149	enddo
1150
1151	endif
1152
1153	arr_shift = arr_shift + arr_max_shift
1154	enddo
1155
1156	! apply final exponent correction, scaling first if exponent is too small
1157	! to apply directly
1158	corr_exp = curr_exp + exponent(arr_gsum(ifld))
1159	if (corr_exp .ge. MINEXPONENT(1._r8)) then
1160	arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp)
1161	else
1162	RX_8 = set_exponent(arr_gsum(ifld), &
1163	corr_exp-MINEXPONENT(1._r8))
1164	arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8))
1165	endif
1166
1167	! if validate is .true. and some precision lost, test whether 'too much'
1168	! was lost, due to too loose an upper bound, too stringent a limit on number
1169	! of levels of expansion, cancellation, .... Calculated by comparing lower
1170	! bound on number of sigificant digits with number of digits in 1.0_r8 .
1171	if (validate) then
1172	if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then
1173
1174	! find first nonzero level and use exponent for this level, then assume all
1175	! subsequent levels contribute arr_max_shift digits.
1176	sum_digits = 0
1177	do ilevel=0,max_levels(ifld)
1178	if (sum_digits .eq. 0) then
1179	if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then
1180	X_8(1) = i8_arr_gsum_level(ioffset+ilevel)
1181	LX(1) = exponent(X_8(1))
1182	sum_digits = LX(1)
1183	endif
1184	else
1185	sum_digits = sum_digits + arr_max_shift
1186	endif
1187	enddo
1188
1189	if (sum_digits < digits(1.0_r8)) then
1190	recompute = .true.
1191	endif
1192	endif
1193	endif
1194
1195	endif
1196
1197	enddo
1198
1199
1200	end subroutine oasis_reprosum_int
1201
1202	!========================================================================
1203	!
1204	! Purpose:
1205	!> Test whether distributed sum exceeds tolerance and print out a
1206	!> warning message.
1207	!
1208	!----------------------------------------------------------------------
1209
1210	logical function oasis_reprosum_tolExceeded (name, nflds, master, &
1211	logunit, rel_diff )
1212	!----------------------------------------------------------------------
1213
1214	! Arguments
1215
1216	character(len=*), intent(in) :: name !< distributed sum identifier
1217	integer, intent(in) :: nflds !< number of fields
1218	logical, intent(in) :: master !< process that will write
1219	!< warning messages?
1220	integer, optional, intent(in) :: logunit!< unit warning messages
1221	!< written to
1222	real(r8), intent(in) :: rel_diff(2,nflds)
1223	!< relative and absolute
1224	!< differences between fixed
1225	!< and floating point sums
1226
1227	! Local workspace
1228
1229	integer :: llogunit ! local log unit
1230	integer :: ifld ! field index
1231	integer :: exceeds_limit ! number of fields whose
1232	! sum exceeds tolerance
1233	real(r8) :: max_rel_diff ! maximum relative difference
1234	integer :: max_rel_diff_idx ! field index for max. rel. diff.
1235	real(r8) :: max_abs_diff ! maximum absolute difference
1236	integer :: max_abs_diff_idx ! field index for max. abs. diff.
1237	character(len=*),parameter :: subname = '(oasis_reprosum_tolExceeded)'
1238
1239	!-----------------------------------------------------------------------
1240
1241	oasis_reprosum_tolExceeded = .false.
1242	if (oasis_reprosum_reldiffmax < 0.0_r8) return
1243
1244	if ( present(logunit) ) then
1245	llogunit = logunit
1246	else
1247	llogunit = nulprt
1248	endif
1249
1250	! check that "fast" reproducible sum is accurate enough.
1251	exceeds_limit = 0
1252	max_rel_diff = 0.0_r8
1253	max_abs_diff = 0.0_r8
1254	do ifld=1,nflds
1255	if (rel_diff(1,ifld) > oasis_reprosum_reldiffmax) then
1256	exceeds_limit = exceeds_limit + 1
1257	if (rel_diff(1,ifld) > max_rel_diff) then
1258	max_rel_diff = rel_diff(1,ifld)
1259	max_rel_diff_idx = ifld
1260	endif
1261	if (rel_diff(2,ifld) > max_abs_diff) then
1262	max_abs_diff = rel_diff(2,ifld)
1263	max_abs_diff_idx = ifld
1264	endif
1265	endif
1266	enddo
1267
1268	if (exceeds_limit > 0) then
1269	if (master) then
1270	write(llogunit,*) subname,trim(name), &
1271	': difference in fixed and floating point sums ', &
1272	' exceeds tolerance in ', exceeds_limit, &
1273	' fields.'
1274	write(llogunit,*) subname,' Maximum relative diff: (rel)', &
1275	rel_diff(1,max_rel_diff_idx), ' (abs) ', &
1276	rel_diff(2,max_rel_diff_idx)
1277	write(llogunit,*) subname,' Maximum absolute diff: (rel)', &
1278	rel_diff(1,max_abs_diff_idx), ' (abs) ', &
1279	rel_diff(2,max_abs_diff_idx)
1280	endif
1281	oasis_reprosum_tolExceeded = .true.
1282	endif
1283
1284
1285	end function oasis_reprosum_tolExceeded
1286
1287	!========================================================================
1288	!
1289	! Purpose:
1290	!> Compute the global sum of each field in "arr" using the indicated
1291	!> communicator with a reproducible yet scalable implementation based
1292	!> on He and Ding's implementation of the double-double algorithm.
1293	!
1294	!----------------------------------------------------------------------
1295
1296	subroutine oasis_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, &
1297	nflds, mpi_comm )
1298	!----------------------------------------------------------------------
1299
1300	! Arguments
1301
1302	integer, intent(in) :: nsummands !< number of local summands
1303	integer, intent(in) :: dsummands !< declared first dimension
1304	integer, intent(in) :: nflds !< number of fields
1305	real(r8), intent(in) :: arr(dsummands,nflds)
1306	!< input array
1307	integer, intent(in) :: mpi_comm !< MPI subcommunicator
1308
1309	real(r8), intent(out):: arr_gsum(nflds)
1310	!< global sums
1311
1312
1313	! Local workspace
1314
1315	integer :: old_cw ! for x86 processors, save
1316	! current arithmetic mode
1317	integer :: ifld, isum ! loop variables
1318	integer :: ierr ! MPI error return
1319
1320	real(r8) :: e, t1, t2 ! temporaries
1321	complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double
1322	! format)
1323	complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double
1324	! format)
1325
1326	integer, save :: mpi_sumdd
1327	logical, save :: first_time = .true.
1328	character(len=*),parameter :: subname = '(oasis_reprosum_ddpdd)'
1329
1330	!-----------------------------------------------------------------------
1331
1332	call shr_reprosumx86_fix_start (old_cw)
1333
1334	if (first_time) then
1335	call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr)
1336	first_time = .false.
1337	endif
1338
1339	do ifld=1,nflds
1340	arr_lsum_dd(ifld) = (0.0_r8,0.0_r8)
1341
1342	do isum=1,nsummands
1343
1344	! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s
1345	! trick.
1346	t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld))
1347	e = t1 - arr(isum,ifld)
1348	t2 = ((real(arr_lsum_dd(ifld)) - e) &
1349	+ (arr(isum,ifld) - (t1 - e))) &
1350	+ aimag(arr_lsum_dd(ifld))
1351
1352	! The result is t1 + t2, after normalization.
1353	arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1354	enddo
1355
1356	enddo
1357
1358	call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, &
1359	MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr)
1360	do ifld=1,nflds
1361	arr_gsum(ifld) = real(arr_gsum_dd(ifld))
1362	enddo
1363
1364	call shr_reprosumx86_fix_end (old_cw)
1365
1366	end subroutine oasis_reprosum_ddpdd
1367
1368	!-----------------------------------------------------------------------
1369
1370	subroutine DDPDD (dda, ddb, len, itype)
1371	!----------------------------------------------------------------------
1372	!
1373	! Purpose:
1374	! Modification of original codes written by David H. Bailey
1375	! This subroutine computes ddb(i) = dda(i)+ddb(i)
1376	!
1377	!----------------------------------------------------------------------
1378
1379	! Arguments
1380
1381	integer, intent(in) :: len ! array length
1382	complex(r8), intent(in) :: dda(len) ! input
1383	complex(r8), intent(inout) :: ddb(len) ! result
1384	integer, intent(in) :: itype ! unused
1385
1386	! Local workspace
1387
1388	real(r8) e, t1, t2
1389	integer i
1390	character(len=*),parameter :: subname = '(oasis_reprosum_mod:DDPDD)'
1391
1392	!-----------------------------------------------------------------------
1393
1394	do i = 1, len
1395	! Compute dda + ddb using Knuth's trick.
1396	t1 = real(dda(i)) + real(ddb(i))
1397	e = t1 - real(dda(i))
1398	t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) &
1399	+ aimag(dda(i)) + aimag(ddb(i))
1400
1401	! The result is t1 + t2, after normalization.
1402	ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 )
1403	enddo
1404
1405
1406	end subroutine DDPDD
1407
1408	!-----------------------------------------------------------------------
1409
1410	subroutine split_indices(total,num_pieces,ibeg,iend)
1411	!----------------------------------------------------------------------
1412	!
1413	! Purpose:
1414	! Split range into 'num_pieces'
1415	!
1416	!----------------------------------------------------------------------
1417
1418	! Arguments
1419
1420	integer, intent(in) :: total
1421	integer, intent(in) :: num_pieces
1422	integer, intent(out) :: ibeg(num_pieces), iend(num_pieces)
1423
1424	! Local workspace
1425
1426	integer :: itmp1, itmp2, ioffset, i
1427	character(len=*),parameter :: subname = '(oasis_reprosum_mod:split_indices)'
1428
1429	!-----------------------------------------------------------------------
1430
1431	itmp1 = total/num_pieces
1432	itmp2 = mod(total,num_pieces)
1433	ioffset = 0
1434	do i=1,itmp2
1435	ibeg(i) = ioffset + 1
1436	iend(i) = ioffset + (itmp1+1)
1437	ioffset = iend(i)
1438	enddo
1439	do i=itmp2+1,num_pieces
1440	ibeg(i) = ioffset + 1
1441	if (ibeg(i) > total) then
1442	iend(i) = ibeg(i) - 1
1443	else
1444	iend(i) = ioffset + itmp1
1445	ioffset = iend(i)
1446	endif
1447	enddo
1448
1449	end subroutine split_indices
1450
1451	!========================================================================
1452
1453	end module mod_oasis_reprosum

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CPL/oasis3-mct/branches/OASIS3-MCT_5.0_branch/lib/psmile/src/mod_oasis_reprosum.F90

Download in other formats: