1 | |
---|
2 | !> OASIS reproducible sum method from P. Worley |
---|
3 | |
---|
4 | MODULE mod_oasis_reprosum |
---|
5 | |
---|
6 | !----------------------------------------------------------------------- |
---|
7 | ! |
---|
8 | ! Purpose: |
---|
9 | !> Compute reproducible global sums of a set of arrays across an MPI |
---|
10 | !> subcommunicator |
---|
11 | ! |
---|
12 | ! Methods: |
---|
13 | !> Compute using either or both a scalable, reproducible algorithm and a |
---|
14 | !> scalable, nonreproducible algorithm: |
---|
15 | !> * Reproducible (scalable): |
---|
16 | !> Convert to fixed point (integer vector representation) to enable |
---|
17 | !> reproducibility when using MPI_Allreduce |
---|
18 | !> * Alternative usually reproducible (scalable): |
---|
19 | !> Use parallel double-double algorithm due to Helen He and |
---|
20 | !> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm |
---|
21 | !> * Nonreproducible (scalable): |
---|
22 | !> Floating point and MPI_Allreduce based. |
---|
23 | !> If computing both reproducible and nonreproducible sums, compare |
---|
24 | !> these and report relative difference (if absolute difference |
---|
25 | !> less than sum) or absolute difference back to calling routine. |
---|
26 | ! |
---|
27 | ! Author: P. Worley (based on suggestions from J. White for fixed |
---|
28 | ! point algorithm and on He/Ding paper for ddpdd |
---|
29 | ! algorithm) |
---|
30 | ! |
---|
31 | ! Modified by T.Craig for Oasis, May 2017 based on the public version in |
---|
32 | ! CESM1.2. |
---|
33 | ! |
---|
34 | !----------------------------------------------------------------------- |
---|
35 | |
---|
36 | !----------------------------------------------------------------------- |
---|
37 | !- use statements ------------------------------------------------------ |
---|
38 | !----------------------------------------------------------------------- |
---|
39 | #if ( defined noI8 ) |
---|
40 | ! Workaround for when shr_kind_i8 is not supported. |
---|
41 | use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i4_p |
---|
42 | #else |
---|
43 | use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i8_p |
---|
44 | #endif |
---|
45 | use mod_oasis_data, only: nulprt |
---|
46 | use mod_oasis_sys, only: oasis_abort, estr |
---|
47 | use mod_oasis_mpi, only: oasis_mpi_barrier |
---|
48 | use mod_oasis_timer, only: oasis_timer_start, oasis_timer_stop |
---|
49 | use, intrinsic :: iso_c_binding |
---|
50 | |
---|
51 | !----------------------------------------------------------------------- |
---|
52 | !- module boilerplate -------------------------------------------------- |
---|
53 | !----------------------------------------------------------------------- |
---|
54 | implicit none |
---|
55 | private |
---|
56 | |
---|
57 | !----------------------------------------------------------------------- |
---|
58 | !- include statements -------------------------------------------------- |
---|
59 | !----------------------------------------------------------------------- |
---|
60 | #include <mpif.h> |
---|
61 | |
---|
62 | !----------------------------------------------------------------------- |
---|
63 | ! Public interfaces ---------------------------------------------------- |
---|
64 | !----------------------------------------------------------------------- |
---|
65 | public :: & |
---|
66 | oasis_reprosum_setopts, &! set runtime options |
---|
67 | oasis_reprosum_calc, &! calculate distributed sum |
---|
68 | oasis_reprosum_tolExceeded ! utility function to check relative |
---|
69 | ! differences against the tolerance |
---|
70 | |
---|
71 | !----------------------------------------------------------------------- |
---|
72 | ! Public data ---------------------------------------------------------- |
---|
73 | !----------------------------------------------------------------------- |
---|
74 | logical, public :: oasis_reprosum_recompute = .false. |
---|
75 | |
---|
76 | real(r8), public :: oasis_reprosum_reldiffmax = -1.0_r8 |
---|
77 | |
---|
78 | !----------------------------------------------------------------------- |
---|
79 | ! Private interfaces --------------------------------------------------- |
---|
80 | !----------------------------------------------------------------------- |
---|
81 | private :: & |
---|
82 | ddpdd, &! double-double sum routine |
---|
83 | split_indices ! split indices among OMP threads |
---|
84 | |
---|
85 | !----------------------------------------------------------------------- |
---|
86 | ! Private data --------------------------------------------------------- |
---|
87 | !----------------------------------------------------------------------- |
---|
88 | |
---|
89 | logical :: repro_sum_use_ddpdd = .false. |
---|
90 | logical :: detailed_timing = .false. |
---|
91 | |
---|
92 | !----------------------------------------------------------------------- |
---|
93 | ! External C bindings -------------------------------------------------- |
---|
94 | !----------------------------------------------------------------------- |
---|
95 | |
---|
96 | interface |
---|
97 | subroutine shr_reprosumx86_fix_start(old_cw) bind(C) |
---|
98 | use iso_c_binding, only: c_int |
---|
99 | integer(c_int), intent(out) :: old_cw |
---|
100 | end subroutine shr_reprosumx86_fix_start |
---|
101 | |
---|
102 | subroutine shr_reprosumx86_fix_end(old_cw) bind(C) |
---|
103 | use iso_c_binding, only: c_int |
---|
104 | integer(c_int), intent(in) :: old_cw |
---|
105 | end subroutine shr_reprosumx86_fix_end |
---|
106 | end interface |
---|
107 | |
---|
108 | CONTAINS |
---|
109 | |
---|
110 | !======================================================================== |
---|
111 | !----------------------------------------------------------------------- |
---|
112 | ! Purpose: |
---|
113 | !> Set runtime options |
---|
114 | ! Author: P. Worley |
---|
115 | !----------------------------------------------------------------------- |
---|
116 | |
---|
117 | subroutine oasis_reprosum_setopts(repro_sum_use_ddpdd_in, & |
---|
118 | repro_sum_rel_diff_max_in, & |
---|
119 | repro_sum_recompute_in, & |
---|
120 | repro_sum_master, & |
---|
121 | repro_sum_logunit ) |
---|
122 | |
---|
123 | !------------------------------Arguments-------------------------------- |
---|
124 | logical, intent(in), optional :: repro_sum_use_ddpdd_in |
---|
125 | !< Use DDPDD algorithm instead of fixed precision algorithm |
---|
126 | real(r8), intent(in), optional :: repro_sum_rel_diff_max_in |
---|
127 | !< maximum permissible difference between reproducible and |
---|
128 | !< nonreproducible sums |
---|
129 | logical, intent(in), optional :: repro_sum_recompute_in |
---|
130 | !< recompute using different algorithm when difference between |
---|
131 | !< reproducible and nonreproducible sums is too great |
---|
132 | logical, intent(in), optional :: repro_sum_master |
---|
133 | !< flag indicating whether this process should output |
---|
134 | !< log messages |
---|
135 | integer, intent(in), optional :: repro_sum_logunit |
---|
136 | !< unit number for log messages |
---|
137 | !---------------------------Local Workspace----------------------------- |
---|
138 | integer llogunit ! unit number for log messages |
---|
139 | logical master ! local master? |
---|
140 | logical,save :: firstcall = .true. ! first call |
---|
141 | character(len=*),parameter :: subname = '(oasis_reprosum_setopts)' |
---|
142 | !----------------------------------------------------------------------- |
---|
143 | |
---|
144 | if ( present(repro_sum_master) ) then |
---|
145 | master = repro_sum_master |
---|
146 | else |
---|
147 | master = .false. |
---|
148 | endif |
---|
149 | |
---|
150 | if ( present(repro_sum_logunit) ) then |
---|
151 | llogunit = repro_sum_logunit |
---|
152 | else |
---|
153 | llogunit = nulprt |
---|
154 | endif |
---|
155 | |
---|
156 | if (.not. firstcall) then |
---|
157 | write(llogunit,*) subname,estr,' can only be called once' |
---|
158 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
159 | endif |
---|
160 | firstcall = .false. |
---|
161 | |
---|
162 | if ( present(repro_sum_use_ddpdd_in) ) then |
---|
163 | repro_sum_use_ddpdd = repro_sum_use_ddpdd_in |
---|
164 | endif |
---|
165 | if ( present(repro_sum_rel_diff_max_in) ) then |
---|
166 | oasis_reprosum_reldiffmax = repro_sum_rel_diff_max_in |
---|
167 | endif |
---|
168 | if ( present(repro_sum_recompute_in) ) then |
---|
169 | oasis_reprosum_recompute = repro_sum_recompute_in |
---|
170 | endif |
---|
171 | if (master) then |
---|
172 | if ( repro_sum_use_ddpdd ) then |
---|
173 | write(llogunit,*) subname, & |
---|
174 | 'Using double-double-based (scalable) usually reproducible ', & |
---|
175 | 'distributed sum algorithm' |
---|
176 | else |
---|
177 | write(llogunit,*) subname, & |
---|
178 | 'Using fixed-point-based (scalable) reproducible ', & |
---|
179 | 'distributed sum algorithm' |
---|
180 | endif |
---|
181 | |
---|
182 | if (oasis_reprosum_reldiffmax >= 0._r8) then |
---|
183 | write(llogunit,*) subname, & |
---|
184 | ' with a maximum relative error tolerance of ', & |
---|
185 | oasis_reprosum_reldiffmax |
---|
186 | if (oasis_reprosum_recompute) then |
---|
187 | write(llogunit,*) subname, & |
---|
188 | 'If tolerance exceeded, sum is recomputed using ', & |
---|
189 | 'a serial algorithm.' |
---|
190 | else |
---|
191 | write(llogunit,*) subname, & |
---|
192 | 'If tolerance exceeded, fixed-precision is sum used ', & |
---|
193 | 'but a warning is output.' |
---|
194 | endif |
---|
195 | else |
---|
196 | write(llogunit,*) subname, & |
---|
197 | 'and not comparing with floating point algorithms.' |
---|
198 | endif |
---|
199 | |
---|
200 | endif |
---|
201 | end subroutine oasis_reprosum_setopts |
---|
202 | |
---|
203 | !======================================================================== |
---|
204 | |
---|
205 | ! |
---|
206 | ! Purpose: |
---|
207 | !> Compute the global sum of each field in "arr" using the indicated |
---|
208 | !> communicator with a reproducible yet scalable implementation based |
---|
209 | !> on a fixed point algorithm. An alternative is to use an "almost |
---|
210 | !> always reproducible" floating point algorithm. |
---|
211 | ! |
---|
212 | ! The accuracy of the fixed point algorithm is controlled by the |
---|
213 | ! number of "levels" of integer expansion. The algorithm will calculate |
---|
214 | ! the number of levels that is required for the sum to be essentially |
---|
215 | ! exact. The optional parameter arr_max_levels can be used to override |
---|
216 | ! the calculated value. The optional parameter arr_max_levels_out can be |
---|
217 | ! used to return the values used. |
---|
218 | ! |
---|
219 | ! The algorithm also requires an upper bound on |
---|
220 | ! the maximum summand (in absolute value) for each field, and will |
---|
221 | ! calculate this internally. However, if the optional parameters |
---|
222 | ! arr_max_levels and arr_gbl_max are both set, then the algorithm will |
---|
223 | ! use the values in arr_gbl_max for the upper bounds instead. If these |
---|
224 | ! are not upper bounds, or if the upper bounds are not tight enough |
---|
225 | ! to achieve the requisite accuracy, and if the optional parameter |
---|
226 | ! repro_sum_validate is NOT set to .false., the algorithm will repeat the |
---|
227 | ! computation with appropriate upper bounds. If only arr_gbl_max is present, |
---|
228 | ! then the maxima are computed internally (and the specified values are |
---|
229 | ! ignored). The optional parameter arr_gbl_max_out can be |
---|
230 | ! used to return the values used. |
---|
231 | ! |
---|
232 | ! Finally, the algorithm requires an upper bound on the number of |
---|
233 | ! local summands across all processes. This will be calculated internally, |
---|
234 | ! using an MPI collective, but the value in the optional argument |
---|
235 | ! gbl_max_nsummands will be used instead if (1) it is present, (2) |
---|
236 | ! it is > 0, and (3) the maximum value and required number of levels |
---|
237 | ! are also specified. (If the maximum value is calculated, the same |
---|
238 | ! MPI collective is used to determine the maximum number of local |
---|
239 | ! summands.) The accuracy of the user-specified value is not checked. |
---|
240 | ! However, if set to < 1, the value will instead be calculated. If the |
---|
241 | ! optional parameter gbl_max_nsummands_out is present, then the value |
---|
242 | ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be |
---|
243 | ! returned. |
---|
244 | ! |
---|
245 | ! If requested (by setting oasis_reprosum_reldiffmax >= 0.0 and passing in |
---|
246 | ! the optional rel_diff parameter), results are compared with a |
---|
247 | ! nonreproducible floating point algorithm. |
---|
248 | ! |
---|
249 | ! Note that the cost of the algorithm is not strongly correlated with |
---|
250 | ! the number of levels, which primarily shows up as a (modest) increase |
---|
251 | ! in cost of the MPI_Allreduce as a function of vector length. Rather the |
---|
252 | ! cost is more a function of (a) the number of integers required to |
---|
253 | ! represent an individual summand and (b) the number of MPI_Allreduce |
---|
254 | ! calls. The number of integers required to represent an individual |
---|
255 | ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands |
---|
256 | ! when the number of local summands is not too large. As the number of |
---|
257 | ! local summands increases, the number of integers required increases. |
---|
258 | ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or |
---|
259 | ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max |
---|
260 | ! correctly). When specifying arr_max_levels and arr_gbl_max |
---|
261 | ! incorrectly, 3 or 4 MPI_Allreduce calls will be required. |
---|
262 | ! |
---|
263 | ! The alternative algorithm is a minor modification of a parallel |
---|
264 | ! implementation of David Bailey's routine DDPDD by Helen He |
---|
265 | ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple |
---|
266 | ! precision summation of double precision values with 10 double |
---|
267 | ! precision operations. The advantage of this algorithm is that |
---|
268 | ! it requires a single MPI_Allreduce and is less expensive per summand |
---|
269 | ! than is the fixed precision algorithm. The disadvantage is that it |
---|
270 | ! is not guaranteed to be reproducible (though it is reproducible |
---|
271 | ! much more often than is the standard algorithm). This alternative |
---|
272 | ! is used when the optional parameter ddpdd_sum is set to .true. It is |
---|
273 | ! also used if the fixed precision algorithm radix assumption does not |
---|
274 | ! hold. |
---|
275 | |
---|
276 | !---------------------------------------------------------------------- |
---|
277 | subroutine oasis_reprosum_calc (arr, arr_gsum, nsummands, dsummands, & |
---|
278 | nflds, ddpdd_sum, & |
---|
279 | arr_gbl_max, arr_gbl_max_out, & |
---|
280 | arr_max_levels, arr_max_levels_out, & |
---|
281 | gbl_max_nsummands, gbl_max_nsummands_out,& |
---|
282 | gbl_count, repro_sum_validate, & |
---|
283 | repro_sum_stats, rel_diff, commid ) |
---|
284 | !---------------------------------------------------------------------- |
---|
285 | |
---|
286 | ! Arguments |
---|
287 | |
---|
288 | integer, intent(in) :: nsummands !< number of local summands |
---|
289 | integer, intent(in) :: dsummands !< declared first dimension |
---|
290 | integer, intent(in) :: nflds !< number of fields |
---|
291 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
292 | !< input array |
---|
293 | |
---|
294 | real(r8), intent(out):: arr_gsum(nflds) |
---|
295 | !< global means |
---|
296 | |
---|
297 | logical, intent(in), optional :: ddpdd_sum |
---|
298 | !< use ddpdd algorithm instead |
---|
299 | !< of fixed precision algorithm |
---|
300 | |
---|
301 | real(r8), intent(in), optional :: arr_gbl_max(nflds) |
---|
302 | !< upper bound on max(abs(arr)) |
---|
303 | |
---|
304 | real(r8), intent(out), optional :: arr_gbl_max_out(nflds) |
---|
305 | !< calculated upper bound on |
---|
306 | !< max(abs(arr)) |
---|
307 | |
---|
308 | integer, intent(in), optional :: arr_max_levels(nflds) |
---|
309 | !< maximum number of levels of |
---|
310 | !< integer expansion to use |
---|
311 | |
---|
312 | integer, intent(out), optional :: arr_max_levels_out(nflds) |
---|
313 | !< output of number of levels of |
---|
314 | !< integer expansion to used |
---|
315 | |
---|
316 | integer, intent(in), optional :: gbl_max_nsummands |
---|
317 | !< maximum of nsummand over all |
---|
318 | !< processes |
---|
319 | |
---|
320 | integer, intent(out), optional :: gbl_max_nsummands_out |
---|
321 | !< calculated maximum nsummands |
---|
322 | !< over all processes |
---|
323 | |
---|
324 | integer, intent(in), optional :: gbl_count |
---|
325 | !< was total number of summands; |
---|
326 | !< now is ignored; use |
---|
327 | !< gbl_max_nsummands instead |
---|
328 | |
---|
329 | logical, intent(in), optional :: repro_sum_validate |
---|
330 | !< flag enabling/disabling testing that gmax and max_levels are |
---|
331 | !< accurate/sufficient. Default is enabled. |
---|
332 | |
---|
333 | integer, intent(inout), optional :: repro_sum_stats(5) |
---|
334 | !< increment running totals for |
---|
335 | !< (1) one-reduction repro_sum |
---|
336 | !< (2) two-reduction repro_sum |
---|
337 | !< (3) both types in one call |
---|
338 | !< (4) nonrepro_sum |
---|
339 | !< (5) global max nsummands reduction |
---|
340 | |
---|
341 | real(r8), intent(out), optional :: rel_diff(2,nflds) |
---|
342 | !< relative and absolute |
---|
343 | !< differences between fixed |
---|
344 | !< and floating point sums |
---|
345 | |
---|
346 | integer, intent(in), optional :: commid |
---|
347 | !< MPI communicator |
---|
348 | |
---|
349 | ! Local workspace |
---|
350 | |
---|
351 | logical :: use_ddpdd_sum ! flag indicating whether to |
---|
352 | ! use oasis_reprosum_ddpdd or not |
---|
353 | logical :: recompute ! flag indicating need to |
---|
354 | ! determine gmax/gmin before |
---|
355 | ! computing sum |
---|
356 | logical :: validate ! flag indicating need to |
---|
357 | ! verify gmax and max_levels |
---|
358 | ! are accurate/sufficient |
---|
359 | integer :: omp_nthreads ! number of OpenMP threads |
---|
360 | integer :: mpi_comm ! MPI subcommunicator |
---|
361 | integer :: tasks ! number of MPI processes |
---|
362 | integer :: ierr ! MPI error return |
---|
363 | integer :: ifld, isum, ithread ! loop variables |
---|
364 | integer :: max_nsummands ! max nsummands over all processes |
---|
365 | ! or threads (used in both ways) |
---|
366 | |
---|
367 | integer, allocatable :: isum_beg(:), isum_end(:) |
---|
368 | ! range of summand indices for each |
---|
369 | ! OpenMP thread |
---|
370 | integer, allocatable :: arr_tlmin_exp(:,:) |
---|
371 | ! per thread local exponent minima |
---|
372 | integer, allocatable :: arr_tlmax_exp(:,:) |
---|
373 | ! per thread local exponent maxima |
---|
374 | integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax |
---|
375 | ! summand exponent and working min/max |
---|
376 | integer :: arr_lmin_exp(nflds) ! local exponent minima |
---|
377 | integer :: arr_lmax_exp(nflds) ! local exponent maxima |
---|
378 | integer :: arr_lextremes(0:nflds,2)! local exponent extrema |
---|
379 | integer :: arr_gextremes(0:nflds,2)! global exponent extrema |
---|
380 | |
---|
381 | integer :: arr_gmax_exp(nflds) ! global exponents maxima |
---|
382 | integer :: arr_gmin_exp(nflds) ! global exponents minima |
---|
383 | integer :: arr_max_shift ! maximum safe exponent for |
---|
384 | ! value < 1 (so that sum does |
---|
385 | ! not overflow) |
---|
386 | integer :: max_levels(nflds) ! maximum number of levels of |
---|
387 | ! integer expansion to use |
---|
388 | integer :: max_level ! maximum value in max_levels |
---|
389 | integer :: gbl_max_red ! global max local sum reduction? (0/1) |
---|
390 | integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) |
---|
391 | integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) |
---|
392 | integer :: repro_sum_both ! both fast and slow? (0/1) |
---|
393 | integer :: nonrepro_sum ! nonrepro_sum? (0/1) |
---|
394 | |
---|
395 | real(r8) :: xmax_nsummands ! dble of max_nsummands |
---|
396 | real(r8) :: arr_lsum(nflds) ! local sums |
---|
397 | real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using |
---|
398 | ! fast, nonreproducible, |
---|
399 | ! floating point alg. |
---|
400 | real(r8) :: abs_diff ! absolute difference between |
---|
401 | ! fixed and floating point |
---|
402 | ! sums |
---|
403 | #ifdef _OPENMP |
---|
404 | integer omp_get_max_threads |
---|
405 | external omp_get_max_threads |
---|
406 | #endif |
---|
407 | character(len=*),parameter :: subname = '(oasis_reprosum_calc)' |
---|
408 | |
---|
409 | !----------------------------------------------------------------------- |
---|
410 | |
---|
411 | ! check whether should use oasis_reprosum_ddpdd algorithm |
---|
412 | use_ddpdd_sum = repro_sum_use_ddpdd |
---|
413 | if ( present(ddpdd_sum) ) then |
---|
414 | use_ddpdd_sum = ddpdd_sum |
---|
415 | endif |
---|
416 | |
---|
417 | ! check whether intrinsic-based algorithm will work on this system |
---|
418 | ! (requires floating point and integer bases to be the same) |
---|
419 | ! If not, always use ddpdd. |
---|
420 | use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) |
---|
421 | |
---|
422 | ! initialize local statistics variables |
---|
423 | gbl_max_red = 0 |
---|
424 | repro_sum_fast = 0 |
---|
425 | repro_sum_slow = 0 |
---|
426 | repro_sum_both = 0 |
---|
427 | nonrepro_sum = 0 |
---|
428 | |
---|
429 | ! set MPI communicator |
---|
430 | if ( present(commid) ) then |
---|
431 | mpi_comm = commid |
---|
432 | else |
---|
433 | mpi_comm = MPI_COMM_WORLD |
---|
434 | endif |
---|
435 | |
---|
436 | if (detailed_timing) then |
---|
437 | call oasis_timer_start('oasis_reprosum_prebarrier') |
---|
438 | call oasis_mpi_barrier(mpi_comm,subname) |
---|
439 | call oasis_timer_stop ('oasis_reprosum_prebarrier') |
---|
440 | endif |
---|
441 | |
---|
442 | if ( use_ddpdd_sum ) then |
---|
443 | |
---|
444 | if (detailed_timing) call oasis_timer_start('oasis_reprosum_ddpdd') |
---|
445 | |
---|
446 | call oasis_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & |
---|
447 | nflds, mpi_comm) |
---|
448 | repro_sum_fast = 1 |
---|
449 | |
---|
450 | if (detailed_timing) call oasis_timer_stop('oasis_reprosum_ddpdd') |
---|
451 | |
---|
452 | else |
---|
453 | |
---|
454 | if (detailed_timing) call oasis_timer_start('oasis_reprosum_int') |
---|
455 | |
---|
456 | ! get number of MPI tasks |
---|
457 | call mpi_comm_size(mpi_comm, tasks, ierr) |
---|
458 | |
---|
459 | ! get number of OpenMP threads |
---|
460 | #ifdef _OPENMP |
---|
461 | omp_nthreads = omp_get_max_threads() |
---|
462 | #else |
---|
463 | omp_nthreads = 1 |
---|
464 | #endif |
---|
465 | |
---|
466 | ! see if have sufficient information to not require max/min allreduce |
---|
467 | recompute = .true. |
---|
468 | validate = .false. |
---|
469 | if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then |
---|
470 | recompute = .false. |
---|
471 | |
---|
472 | ! setting lower bound on max_level*nflds to be 64 to improve OpenMP |
---|
473 | ! performance for loopb in oasis_reprosum_int |
---|
474 | max_level = (64/nflds) + 1 |
---|
475 | do ifld=1,nflds |
---|
476 | if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & |
---|
477 | (arr_max_levels(ifld) > 0)) then |
---|
478 | |
---|
479 | arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) |
---|
480 | if (max_level < arr_max_levels(ifld)) & |
---|
481 | max_level = arr_max_levels(ifld) |
---|
482 | |
---|
483 | else |
---|
484 | recompute = .true. |
---|
485 | endif |
---|
486 | enddo |
---|
487 | |
---|
488 | if (.not. recompute) then |
---|
489 | |
---|
490 | ! determine maximum number of summands in local phases of the |
---|
491 | ! algorithm |
---|
492 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_max") |
---|
493 | if ( present(gbl_max_nsummands) ) then |
---|
494 | if (gbl_max_nsummands < 1) then |
---|
495 | call mpi_allreduce (nsummands, max_nsummands, 1, & |
---|
496 | MPI_INTEGER, MPI_MAX, mpi_comm, ierr) |
---|
497 | gbl_max_red = 1 |
---|
498 | else |
---|
499 | max_nsummands = gbl_max_nsummands |
---|
500 | endif |
---|
501 | else |
---|
502 | call mpi_allreduce (nsummands, max_nsummands, 1, & |
---|
503 | MPI_INTEGER, MPI_MAX, mpi_comm, ierr) |
---|
504 | gbl_max_red = 1 |
---|
505 | endif |
---|
506 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_max") |
---|
507 | |
---|
508 | ! determine maximum shift. Shift needs to be small enough that summation |
---|
509 | ! does not exceed maximum number of digits in i8. |
---|
510 | |
---|
511 | ! if requested, return max_nsummands before it is redefined |
---|
512 | if ( present( gbl_max_nsummands_out) ) then |
---|
513 | gbl_max_nsummands_out = max_nsummands |
---|
514 | endif |
---|
515 | |
---|
516 | ! summing within each thread first |
---|
517 | max_nsummands = (max_nsummands/omp_nthreads) + 1 |
---|
518 | ! then over threads and tasks |
---|
519 | max_nsummands = max(max_nsummands, tasks*omp_nthreads) |
---|
520 | |
---|
521 | xmax_nsummands = dble(max_nsummands) |
---|
522 | arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) |
---|
523 | if (arr_max_shift < 2) then |
---|
524 | write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm' |
---|
525 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
526 | endif |
---|
527 | |
---|
528 | ! calculate sum |
---|
529 | if (present(repro_sum_validate)) then |
---|
530 | validate = repro_sum_validate |
---|
531 | else |
---|
532 | validate = .true. |
---|
533 | endif |
---|
534 | call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, & |
---|
535 | nflds, arr_max_shift, arr_gmax_exp, & |
---|
536 | arr_max_levels, max_level, validate, & |
---|
537 | recompute, omp_nthreads, mpi_comm) |
---|
538 | |
---|
539 | ! record statistics, etc. |
---|
540 | repro_sum_fast = 1 |
---|
541 | if (recompute) then |
---|
542 | repro_sum_both = 1 |
---|
543 | else |
---|
544 | ! if requested, return specified levels and upper bounds on maxima |
---|
545 | if ( present(arr_max_levels_out) ) then |
---|
546 | do ifld=1,nflds |
---|
547 | arr_max_levels_out(ifld) = arr_max_levels(ifld) |
---|
548 | enddo |
---|
549 | endif |
---|
550 | if ( present(arr_gbl_max_out) ) then |
---|
551 | do ifld=1,nflds |
---|
552 | arr_gbl_max_out(ifld) = arr_gbl_max(ifld) |
---|
553 | enddo |
---|
554 | endif |
---|
555 | endif |
---|
556 | endif |
---|
557 | endif |
---|
558 | |
---|
559 | ! do not have sufficient information; calculate global max/min and |
---|
560 | ! use to compute required number of levels |
---|
561 | if (recompute) then |
---|
562 | |
---|
563 | ! record statistic |
---|
564 | repro_sum_slow = 1 |
---|
565 | |
---|
566 | ! determine maximum and minimum (non-zero) summand values and |
---|
567 | ! maximum number of local summands |
---|
568 | |
---|
569 | ! allocate thread-specific work space |
---|
570 | allocate(arr_tlmax_exp(nflds,omp_nthreads)) |
---|
571 | allocate(arr_tlmin_exp(nflds,omp_nthreads)) |
---|
572 | allocate(isum_beg(omp_nthreads)) |
---|
573 | allocate(isum_end(omp_nthreads)) |
---|
574 | |
---|
575 | ! split summand index range over OpenMP threads |
---|
576 | call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) |
---|
577 | |
---|
578 | !$omp parallel do & |
---|
579 | !$omp default(shared) & |
---|
580 | !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) |
---|
581 | do ithread=1,omp_nthreads |
---|
582 | if (detailed_timing) call oasis_timer_start('repro_sum_loopa') |
---|
583 | do ifld=1,nflds |
---|
584 | arr_exp_tlmin = MAXEXPONENT(1._r8) |
---|
585 | arr_exp_tlmax = MINEXPONENT(1._r8) |
---|
586 | do isum=isum_beg(ithread),isum_end(ithread) |
---|
587 | if (arr(isum,ifld) .ne. 0.0_r8) then |
---|
588 | arr_exp = exponent(arr(isum,ifld)) |
---|
589 | arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) |
---|
590 | arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) |
---|
591 | endif |
---|
592 | end do |
---|
593 | arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin |
---|
594 | arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax |
---|
595 | end do |
---|
596 | if (detailed_timing) call oasis_timer_stop('repro_sum_loopa') |
---|
597 | end do |
---|
598 | |
---|
599 | do ifld=1,nflds |
---|
600 | arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) |
---|
601 | arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) |
---|
602 | end do |
---|
603 | deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) |
---|
604 | |
---|
605 | arr_lextremes(0,:) = -nsummands |
---|
606 | arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) |
---|
607 | arr_lextremes(1:nflds,2) = arr_lmin_exp(:) |
---|
608 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_minmax") |
---|
609 | call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & |
---|
610 | MPI_INTEGER, MPI_MIN, mpi_comm, ierr) |
---|
611 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_minmax") |
---|
612 | max_nsummands = -arr_gextremes(0,1) |
---|
613 | arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) |
---|
614 | arr_gmin_exp(:) = arr_gextremes(1:nflds,2) |
---|
615 | |
---|
616 | ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT |
---|
617 | ! and arr_gmax_exp still equals MINEXPONENT. In this case, set |
---|
618 | ! arr_gmin_exp = arr_gmax_exp = MINEXPONENT |
---|
619 | do ifld=1,nflds |
---|
620 | arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) |
---|
621 | enddo |
---|
622 | |
---|
623 | ! if requested, return upper bounds on observed maxima |
---|
624 | if ( present(arr_gbl_max_out) ) then |
---|
625 | do ifld=1,nflds |
---|
626 | arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) |
---|
627 | enddo |
---|
628 | endif |
---|
629 | |
---|
630 | ! if requested, return max_nsummands before it is redefined |
---|
631 | if ( present( gbl_max_nsummands_out) ) then |
---|
632 | gbl_max_nsummands_out = max_nsummands |
---|
633 | endif |
---|
634 | |
---|
635 | ! determine maximum shift (same as in previous branch, but with calculated |
---|
636 | ! max_nsummands). Shift needs to be small enough that summation does not |
---|
637 | ! exceed maximum number of digits in i8. |
---|
638 | |
---|
639 | ! summing within each thread first |
---|
640 | max_nsummands = (max_nsummands/omp_nthreads) + 1 |
---|
641 | ! then over threads and tasks |
---|
642 | max_nsummands = max(max_nsummands, tasks*omp_nthreads) |
---|
643 | |
---|
644 | xmax_nsummands = dble(max_nsummands) |
---|
645 | arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) |
---|
646 | if (arr_max_shift < 2) then |
---|
647 | write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm' |
---|
648 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
649 | endif |
---|
650 | |
---|
651 | ! determine maximum number of levels required for each field |
---|
652 | ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) |
---|
653 | ! + 1 because first truncation probably does not involve a maximal shift |
---|
654 | ! + 1 to guarantee that the integer division rounds up (not down) |
---|
655 | ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP |
---|
656 | ! performance for loopb in oasis_reprosum_int) |
---|
657 | max_level = (64/nflds) + 1 |
---|
658 | do ifld=1,nflds |
---|
659 | max_levels(ifld) = 2 + & |
---|
660 | ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & |
---|
661 | / arr_max_shift) |
---|
662 | if ( present(arr_max_levels) .and. (.not. validate) ) then |
---|
663 | ! if validate true, then computation with arr_max_levels failed |
---|
664 | ! previously |
---|
665 | if ( arr_max_levels(ifld) > 0 ) then |
---|
666 | max_levels(ifld) = & |
---|
667 | min(arr_max_levels(ifld),max_levels(ifld)) |
---|
668 | endif |
---|
669 | endif |
---|
670 | if (max_level < max_levels(ifld)) & |
---|
671 | max_level = max_levels(ifld) |
---|
672 | enddo |
---|
673 | |
---|
674 | ! if requested, return calculated levels |
---|
675 | if ( present(arr_max_levels_out) ) then |
---|
676 | do ifld=1,nflds |
---|
677 | arr_max_levels_out(ifld) = max_levels(ifld) |
---|
678 | enddo |
---|
679 | endif |
---|
680 | |
---|
681 | ! calculate sum |
---|
682 | validate = .false. |
---|
683 | call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & |
---|
684 | arr_max_shift, arr_gmax_exp, max_levels, & |
---|
685 | max_level, validate, recompute, & |
---|
686 | omp_nthreads, mpi_comm) |
---|
687 | |
---|
688 | endif |
---|
689 | |
---|
690 | if (detailed_timing) call oasis_timer_stop('oasis_reprosum_int') |
---|
691 | |
---|
692 | endif |
---|
693 | |
---|
694 | ! compare fixed and floating point results |
---|
695 | if ( present(rel_diff) ) then |
---|
696 | if (oasis_reprosum_reldiffmax >= 0.0_r8) then |
---|
697 | |
---|
698 | if (detailed_timing) then |
---|
699 | call oasis_timer_start('oasis_nonreprosum_prebarrier') |
---|
700 | call oasis_mpi_barrier(mpi_comm,subname) |
---|
701 | call oasis_timer_stop ('oasis_nonreprosum_prebarrier') |
---|
702 | endif |
---|
703 | |
---|
704 | if (detailed_timing) call oasis_timer_start('nonrepro_sum') |
---|
705 | ! record statistic |
---|
706 | nonrepro_sum = 1 |
---|
707 | ! compute nonreproducible sum |
---|
708 | arr_lsum(:) = 0._r8 |
---|
709 | !$omp parallel do & |
---|
710 | !$omp default(shared) & |
---|
711 | !$omp private(ifld, isum) |
---|
712 | do ifld=1,nflds |
---|
713 | do isum=1,nsummands |
---|
714 | arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) |
---|
715 | end do |
---|
716 | end do |
---|
717 | |
---|
718 | call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & |
---|
719 | MPI_REAL8, MPI_SUM, mpi_comm, ierr) |
---|
720 | |
---|
721 | if (detailed_timing) call oasis_timer_stop('nonrepro_sum') |
---|
722 | |
---|
723 | ! determine differences |
---|
724 | !$omp parallel do & |
---|
725 | !$omp default(shared) & |
---|
726 | !$omp private(ifld, abs_diff) |
---|
727 | do ifld=1,nflds |
---|
728 | abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) |
---|
729 | if (abs(arr_gsum(ifld)) > abs_diff) then |
---|
730 | rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) |
---|
731 | else |
---|
732 | rel_diff(1,ifld) = abs_diff |
---|
733 | endif |
---|
734 | rel_diff(2,ifld) = abs_diff |
---|
735 | enddo |
---|
736 | else |
---|
737 | rel_diff(:,:) = 0.0_r8 |
---|
738 | endif |
---|
739 | endif |
---|
740 | |
---|
741 | ! return statistics |
---|
742 | if ( present(repro_sum_stats) ) then |
---|
743 | repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast |
---|
744 | repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow |
---|
745 | repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both |
---|
746 | repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum |
---|
747 | repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red |
---|
748 | endif |
---|
749 | |
---|
750 | |
---|
751 | end subroutine oasis_reprosum_calc |
---|
752 | |
---|
753 | !======================================================================== |
---|
754 | ! |
---|
755 | ! Purpose: |
---|
756 | !> Compute the global sum of each field in "arr" using the indicated |
---|
757 | !> communicator with a reproducible yet scalable implementation based |
---|
758 | !> on a fixed point algorithm. The accuracy of the fixed point algorithm |
---|
759 | !> is controlled by the number of "levels" of integer expansion, the |
---|
760 | !> maximum value of which is specified by max_level. |
---|
761 | ! |
---|
762 | !---------------------------------------------------------------------- |
---|
763 | |
---|
764 | subroutine oasis_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & |
---|
765 | arr_max_shift, arr_gmax_exp, max_levels, & |
---|
766 | max_level, validate, recompute, & |
---|
767 | omp_nthreads, mpi_comm ) |
---|
768 | |
---|
769 | !---------------------------------------------------------------------- |
---|
770 | |
---|
771 | ! Arguments |
---|
772 | |
---|
773 | integer, intent(in) :: nsummands !< number of local summands |
---|
774 | integer, intent(in) :: dsummands !< declared first dimension |
---|
775 | integer, intent(in) :: nflds !< number of fields |
---|
776 | integer, intent(in) :: arr_max_shift !< maximum safe exponent for |
---|
777 | !< value < 1 (so that sum |
---|
778 | !< does not overflow) |
---|
779 | integer, intent(in) :: arr_gmax_exp(nflds) |
---|
780 | !< exponents of global maxima |
---|
781 | integer, intent(in) :: max_levels(nflds) |
---|
782 | !< maximum number of levels |
---|
783 | !< of integer expansion |
---|
784 | integer, intent(in) :: max_level !< maximum value in |
---|
785 | !< max_levels |
---|
786 | integer, intent(in) :: omp_nthreads !< number of OpenMP threads |
---|
787 | integer, intent(in) :: mpi_comm !< MPI subcommunicator |
---|
788 | |
---|
789 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
790 | !< input array |
---|
791 | |
---|
792 | logical, intent(in):: validate |
---|
793 | !< flag indicating that accuracy of solution generated from |
---|
794 | !< arr_gmax_exp and max_levels should be tested |
---|
795 | |
---|
796 | logical, intent(out):: recompute |
---|
797 | !< flag indicating that either the upper bounds are inaccurate, |
---|
798 | !< or max_levels and arr_gmax_exp do not generate accurate |
---|
799 | !< enough sums |
---|
800 | |
---|
801 | real(r8), intent(out):: arr_gsum(nflds) !< global means |
---|
802 | |
---|
803 | ! Local workspace |
---|
804 | |
---|
805 | integer, parameter :: max_jlevel = & |
---|
806 | 1 + (digits(0_i8)/digits(0.0_r8)) |
---|
807 | |
---|
808 | integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) |
---|
809 | ! integer vector representing local |
---|
810 | ! sum (per thread, per field) |
---|
811 | integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) |
---|
812 | ! integer vector representing local |
---|
813 | ! sum |
---|
814 | integer(i8) :: i8_arr_level ! integer part of summand for current |
---|
815 | ! expansion level |
---|
816 | integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) |
---|
817 | ! integer vector representing global |
---|
818 | ! sum |
---|
819 | integer(i8) :: IX_8 ! integer representation of current |
---|
820 | ! jlevels of X_8 ('part' of |
---|
821 | ! i8_arr_gsum_level) |
---|
822 | integer(i8) :: i8_sign ! sign global sum |
---|
823 | integer(i8) :: i8_radix ! radix for i8 variables |
---|
824 | |
---|
825 | integer :: max_error(nflds,omp_nthreads) |
---|
826 | ! accurate upper bound on data? |
---|
827 | integer :: not_exact(nflds,omp_nthreads) |
---|
828 | ! max_levels sufficient to |
---|
829 | ! capture all digits? |
---|
830 | integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) |
---|
831 | ! range of summand indices for each |
---|
832 | ! OpenMP thread |
---|
833 | integer :: ifld, isum, ithread |
---|
834 | ! loop variables |
---|
835 | integer :: arr_exp ! exponent of summand |
---|
836 | integer :: arr_shift ! exponent used to generate integer |
---|
837 | ! for current expansion level |
---|
838 | integer :: ilevel ! current integer expansion level |
---|
839 | integer :: offset(nflds) ! beginning location in |
---|
840 | ! i8_arr_{g,l}sum_level for integer |
---|
841 | ! expansion of current ifld |
---|
842 | integer :: voffset ! modification to offset used to |
---|
843 | ! include validation metrics |
---|
844 | integer :: ioffset ! offset(ifld) |
---|
845 | integer :: jlevel ! number of floating point 'pieces' |
---|
846 | ! extracted from a given i8 integer |
---|
847 | integer :: ierr ! MPI error return |
---|
848 | integer :: LX(max_jlevel) ! exponent of X_8 (see below) |
---|
849 | integer :: veclth ! total length of i8_arr_lsum_level |
---|
850 | integer :: sum_digits ! lower bound on number of significant |
---|
851 | ! in integer expansion of sum |
---|
852 | integer :: curr_exp ! exponent of partial sum during |
---|
853 | ! reconstruction from integer vector |
---|
854 | integer :: corr_exp ! exponent of current summand in |
---|
855 | ! reconstruction from integer vector |
---|
856 | |
---|
857 | real(r8) :: arr_frac ! fraction of summand |
---|
858 | real(r8) :: arr_remainder ! part of summand remaining after |
---|
859 | ! current level of integer expansion |
---|
860 | real(r8) :: X_8(max_jlevel) ! r8 vector representation of current |
---|
861 | ! i8_arr_gsum_level |
---|
862 | real(r8) :: RX_8 ! r8 representation of difference |
---|
863 | ! between current i8_arr_gsum_level |
---|
864 | ! and current jlevels of X_8 |
---|
865 | ! (== IX_8). Also used in final |
---|
866 | ! scaling step |
---|
867 | |
---|
868 | logical :: first ! flag used to indicate that just |
---|
869 | ! beginning reconstruction of sum |
---|
870 | ! from integer vector |
---|
871 | |
---|
872 | character(len=*),parameter :: subname = '(oasis_reprosum_int)' |
---|
873 | |
---|
874 | !----------------------------------------------------------------------- |
---|
875 | ! Save radix of i8 variables in an i8 variable |
---|
876 | i8_radix = radix(IX_8) |
---|
877 | |
---|
878 | ! If validating upper bounds, reserve space for validation metrics |
---|
879 | ! In both cases, reserve an extra level for overflow from the top level |
---|
880 | if (validate) then |
---|
881 | voffset = 3 |
---|
882 | else |
---|
883 | voffset = 1 |
---|
884 | endif |
---|
885 | |
---|
886 | ! compute offsets for each field |
---|
887 | offset(1) = voffset |
---|
888 | do ifld=2,nflds |
---|
889 | offset(ifld) = offset(ifld-1) & |
---|
890 | + (max_levels(ifld-1) + voffset) |
---|
891 | enddo |
---|
892 | veclth = offset(nflds) + max_levels(nflds) |
---|
893 | |
---|
894 | ! split summand index range over OpenMP threads |
---|
895 | call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) |
---|
896 | |
---|
897 | ! convert local summands to vector of integers and sum |
---|
898 | ! (Using scale instead of set_exponent because arr_remainder may not be |
---|
899 | ! "normal" after level 1 calculation) |
---|
900 | i8_arr_lsum_level(:) = 0_i8 |
---|
901 | |
---|
902 | !$omp parallel do & |
---|
903 | !$omp default(shared) & |
---|
904 | !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & |
---|
905 | !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) |
---|
906 | do ithread=1,omp_nthreads |
---|
907 | if (detailed_timing) call oasis_timer_start('repro_sum_loopb') |
---|
908 | do ifld=1,nflds |
---|
909 | ioffset = offset(ifld) |
---|
910 | |
---|
911 | max_error(ifld,ithread) = 0 |
---|
912 | not_exact(ifld,ithread) = 0 |
---|
913 | |
---|
914 | i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 |
---|
915 | do isum=isum_beg(ithread),isum_end(ithread) |
---|
916 | arr_remainder = 0.0_r8 |
---|
917 | |
---|
918 | if (arr(isum,ifld) .ne. 0.0_r8) then |
---|
919 | arr_exp = exponent(arr(isum,ifld)) |
---|
920 | arr_frac = fraction(arr(isum,ifld)) |
---|
921 | |
---|
922 | ! test that global maximum upper bound is an upper bound |
---|
923 | if (arr_exp > arr_gmax_exp(ifld)) then |
---|
924 | max_error(ifld,ithread) = 1 |
---|
925 | exit |
---|
926 | endif |
---|
927 | |
---|
928 | ! calculate first shift |
---|
929 | arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) |
---|
930 | |
---|
931 | ! determine first (probably) nonzero level (assuming initial fraction is |
---|
932 | ! 'normal' - algorithm still works if this is not true) |
---|
933 | ! NOTE: this is critical; scale will set to zero if min exponent is too small. |
---|
934 | if (arr_shift < 1) then |
---|
935 | ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift |
---|
936 | arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) |
---|
937 | |
---|
938 | do while (arr_shift < 1) |
---|
939 | arr_shift = arr_shift + arr_max_shift |
---|
940 | ilevel = ilevel + 1 |
---|
941 | enddo |
---|
942 | else |
---|
943 | ilevel = 1 |
---|
944 | endif |
---|
945 | |
---|
946 | if (ilevel .le. max_levels(ifld)) then |
---|
947 | ! apply first shift/truncate, add it to the relevant running |
---|
948 | ! sum, and calculate the remainder. |
---|
949 | arr_remainder = scale(arr_frac,arr_shift) |
---|
950 | i8_arr_level = int(arr_remainder,i8) |
---|
951 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
952 | i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level |
---|
953 | arr_remainder = arr_remainder - i8_arr_level |
---|
954 | |
---|
955 | ! while the remainder is non-zero, continue to shift, truncate, |
---|
956 | ! sum, and calculate new remainder |
---|
957 | do while ((arr_remainder .ne. 0.0_r8) & |
---|
958 | .and. (ilevel < max_levels(ifld))) |
---|
959 | ilevel = ilevel + 1 |
---|
960 | arr_remainder = scale(arr_remainder,arr_max_shift) |
---|
961 | i8_arr_level = int(arr_remainder,i8) |
---|
962 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
963 | i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level |
---|
964 | arr_remainder = arr_remainder - i8_arr_level |
---|
965 | enddo |
---|
966 | |
---|
967 | endif |
---|
968 | endif |
---|
969 | |
---|
970 | if (arr_remainder .ne. 0.0_r8) then |
---|
971 | not_exact(ifld,ithread) = 1 |
---|
972 | endif |
---|
973 | |
---|
974 | enddo |
---|
975 | |
---|
976 | ! postprocess integer vector to eliminate potential for overlap in the following |
---|
977 | ! sums over threads and processes: if value larger than or equal to |
---|
978 | ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in |
---|
979 | ! vector, resulting in nonoverlapping ranges for each component. Note that |
---|
980 | ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums |
---|
981 | ! over threads and processes do not overflow for ilevel==1. |
---|
982 | do ilevel=max_levels(ifld),1,-1 |
---|
983 | RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) |
---|
984 | IX_8 = int(scale(RX_8,-arr_max_shift),i8) |
---|
985 | if (IX_8 .ne. 0_i8) then |
---|
986 | i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & |
---|
987 | i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 |
---|
988 | IX_8 = IX_8*(i8_radix**arr_max_shift) |
---|
989 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
990 | i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 |
---|
991 | endif |
---|
992 | enddo |
---|
993 | enddo |
---|
994 | if (detailed_timing) call oasis_timer_stop('repro_sum_loopb') |
---|
995 | enddo |
---|
996 | |
---|
997 | ! sum contributions from different threads |
---|
998 | do ifld=1,nflds |
---|
999 | ioffset = offset(ifld) |
---|
1000 | do ithread = 1,omp_nthreads |
---|
1001 | do ilevel = 0,max_levels(ifld) |
---|
1002 | i8_arr_lsum_level(ioffset+ilevel) = & |
---|
1003 | i8_arr_lsum_level(ioffset+ilevel) & |
---|
1004 | + i8_arr_tlsum_level(ilevel,ifld,ithread) |
---|
1005 | enddo |
---|
1006 | enddo |
---|
1007 | enddo |
---|
1008 | |
---|
1009 | ! record if upper bound was inaccurate or if level expansion stopped |
---|
1010 | ! before full accuracy was achieved |
---|
1011 | if (validate) then |
---|
1012 | do ifld=1,nflds |
---|
1013 | ioffset = offset(ifld) |
---|
1014 | i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) |
---|
1015 | i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) |
---|
1016 | enddo |
---|
1017 | endif |
---|
1018 | |
---|
1019 | ! sum integer vector element-wise |
---|
1020 | #if ( defined noI8 ) |
---|
1021 | ! Workaround for when i8 is not supported. |
---|
1022 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_i4") |
---|
1023 | call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & |
---|
1024 | veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) |
---|
1025 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i4") |
---|
1026 | #else |
---|
1027 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_i8") |
---|
1028 | call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & |
---|
1029 | veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) |
---|
1030 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i8") |
---|
1031 | #endif |
---|
1032 | |
---|
1033 | ! Construct global sum from integer vector representation: |
---|
1034 | ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . |
---|
1035 | ! When shifting back, need to "add back in" true arr_gmax exponent. This was |
---|
1036 | ! removed implicitly by working only with the fraction . |
---|
1037 | ! 2) want to add levels into sum in reverse order (smallest to largest). However, |
---|
1038 | ! even this can generate floating point rounding errors if signs of integers |
---|
1039 | ! alternate. To avoid this, do some arithmetic with integer vectors so that all |
---|
1040 | ! components have the same sign. This should keep relative difference between |
---|
1041 | ! using different integer sizes (e.g. i8 and i4) to machine epsilon |
---|
1042 | ! 3) assignment to X_8 will usually lose accuracy since maximum integer |
---|
1043 | ! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands |
---|
1044 | ! correction not very large). Calculate remainder and add in first (since |
---|
1045 | ! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). |
---|
1046 | ! For r4 (24 digits) may need to correct twice. Code is written in a general |
---|
1047 | ! fashion, to work no matter how many corrections are necessary (assuming |
---|
1048 | ! max_jlevel parameter calculation is correct). |
---|
1049 | |
---|
1050 | recompute = .false. |
---|
1051 | do ifld=1,nflds |
---|
1052 | arr_gsum(ifld) = 0.0_r8 |
---|
1053 | ioffset = offset(ifld) |
---|
1054 | |
---|
1055 | ! if validate is .true., test whether the summand upper bound |
---|
1056 | ! was exceeded on any of the processes |
---|
1057 | if (validate) then |
---|
1058 | if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then |
---|
1059 | recompute = .true. |
---|
1060 | endif |
---|
1061 | endif |
---|
1062 | |
---|
1063 | if (.not. recompute) then |
---|
1064 | |
---|
1065 | ! preprocess integer vector: |
---|
1066 | ! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' |
---|
1067 | ! to next larger integer in vector, resulting in nonoverlapping ranges for each |
---|
1068 | ! component. Note that have "ilevel-1=0" level here as described above. |
---|
1069 | do ilevel=max_levels(ifld),1,-1 |
---|
1070 | RX_8 = i8_arr_gsum_level(ioffset+ilevel) |
---|
1071 | IX_8 = int(scale(RX_8,-arr_max_shift),i8) |
---|
1072 | if (IX_8 .ne. 0_i8) then |
---|
1073 | i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & |
---|
1074 | + IX_8 |
---|
1075 | IX_8 = IX_8*(i8_radix**arr_max_shift) |
---|
1076 | i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & |
---|
1077 | - IX_8 |
---|
1078 | endif |
---|
1079 | enddo |
---|
1080 | ! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary |
---|
1081 | ! so that all vector components have the same sign (eliminating loss |
---|
1082 | ! of accuracy arising from difference of large values when |
---|
1083 | ! reconstructing r8 sum from integer vector) |
---|
1084 | ilevel = 0 |
---|
1085 | do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & |
---|
1086 | .and. (ilevel < max_levels(ifld))) |
---|
1087 | ilevel = ilevel + 1 |
---|
1088 | enddo |
---|
1089 | ! |
---|
1090 | if (ilevel < max_levels(ifld)) then |
---|
1091 | if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then |
---|
1092 | i8_sign = 1_i8 |
---|
1093 | else |
---|
1094 | i8_sign = -1_i8 |
---|
1095 | endif |
---|
1096 | do jlevel=ilevel,max_levels(ifld)-1 |
---|
1097 | if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & |
---|
1098 | .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then |
---|
1099 | i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & |
---|
1100 | - i8_sign |
---|
1101 | i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & |
---|
1102 | + i8_sign*(i8_radix**arr_max_shift) |
---|
1103 | endif |
---|
1104 | enddo |
---|
1105 | endif |
---|
1106 | |
---|
1107 | ! start with maximum shift, and work up to larger values |
---|
1108 | arr_shift = arr_gmax_exp(ifld) & |
---|
1109 | - max_levels(ifld)*arr_max_shift |
---|
1110 | curr_exp = 0 |
---|
1111 | first = .true. |
---|
1112 | do ilevel=max_levels(ifld),0,-1 |
---|
1113 | |
---|
1114 | if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then |
---|
1115 | jlevel = 1 |
---|
1116 | |
---|
1117 | ! r8 representation of higher order bits in integer |
---|
1118 | X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) |
---|
1119 | LX(jlevel) = exponent(X_8(jlevel)) |
---|
1120 | |
---|
1121 | ! calculate remainder |
---|
1122 | IX_8 = int(X_8(jlevel),i8) |
---|
1123 | RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) |
---|
1124 | |
---|
1125 | ! repeat using remainder |
---|
1126 | do while (RX_8 .ne. 0.0_r8) |
---|
1127 | jlevel = jlevel + 1 |
---|
1128 | X_8(jlevel) = RX_8 |
---|
1129 | LX(jlevel) = exponent(RX_8) |
---|
1130 | IX_8 = IX_8 + int(RX_8,i8) |
---|
1131 | RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) |
---|
1132 | enddo |
---|
1133 | |
---|
1134 | ! add in contributions, smaller to larger, rescaling for each |
---|
1135 | ! addition to guarantee that exponent of working summand is always |
---|
1136 | ! larger than minexponent |
---|
1137 | do while (jlevel > 0) |
---|
1138 | if (first) then |
---|
1139 | curr_exp = LX(jlevel) + arr_shift |
---|
1140 | arr_gsum(ifld) = fraction(X_8(jlevel)) |
---|
1141 | first = .false. |
---|
1142 | else |
---|
1143 | corr_exp = curr_exp - (LX(jlevel) + arr_shift) |
---|
1144 | arr_gsum(ifld) = fraction(X_8(jlevel)) & |
---|
1145 | + scale(arr_gsum(ifld),corr_exp) |
---|
1146 | curr_exp = LX(jlevel) + arr_shift |
---|
1147 | endif |
---|
1148 | jlevel = jlevel - 1 |
---|
1149 | enddo |
---|
1150 | |
---|
1151 | endif |
---|
1152 | |
---|
1153 | arr_shift = arr_shift + arr_max_shift |
---|
1154 | enddo |
---|
1155 | |
---|
1156 | ! apply final exponent correction, scaling first if exponent is too small |
---|
1157 | ! to apply directly |
---|
1158 | corr_exp = curr_exp + exponent(arr_gsum(ifld)) |
---|
1159 | if (corr_exp .ge. MINEXPONENT(1._r8)) then |
---|
1160 | arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) |
---|
1161 | else |
---|
1162 | RX_8 = set_exponent(arr_gsum(ifld), & |
---|
1163 | corr_exp-MINEXPONENT(1._r8)) |
---|
1164 | arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) |
---|
1165 | endif |
---|
1166 | |
---|
1167 | ! if validate is .true. and some precision lost, test whether 'too much' |
---|
1168 | ! was lost, due to too loose an upper bound, too stringent a limit on number |
---|
1169 | ! of levels of expansion, cancellation, .... Calculated by comparing lower |
---|
1170 | ! bound on number of sigificant digits with number of digits in 1.0_r8 . |
---|
1171 | if (validate) then |
---|
1172 | if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then |
---|
1173 | |
---|
1174 | ! find first nonzero level and use exponent for this level, then assume all |
---|
1175 | ! subsequent levels contribute arr_max_shift digits. |
---|
1176 | sum_digits = 0 |
---|
1177 | do ilevel=0,max_levels(ifld) |
---|
1178 | if (sum_digits .eq. 0) then |
---|
1179 | if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then |
---|
1180 | X_8(1) = i8_arr_gsum_level(ioffset+ilevel) |
---|
1181 | LX(1) = exponent(X_8(1)) |
---|
1182 | sum_digits = LX(1) |
---|
1183 | endif |
---|
1184 | else |
---|
1185 | sum_digits = sum_digits + arr_max_shift |
---|
1186 | endif |
---|
1187 | enddo |
---|
1188 | |
---|
1189 | if (sum_digits < digits(1.0_r8)) then |
---|
1190 | recompute = .true. |
---|
1191 | endif |
---|
1192 | endif |
---|
1193 | endif |
---|
1194 | |
---|
1195 | endif |
---|
1196 | |
---|
1197 | enddo |
---|
1198 | |
---|
1199 | |
---|
1200 | end subroutine oasis_reprosum_int |
---|
1201 | |
---|
1202 | !======================================================================== |
---|
1203 | ! |
---|
1204 | ! Purpose: |
---|
1205 | !> Test whether distributed sum exceeds tolerance and print out a |
---|
1206 | !> warning message. |
---|
1207 | ! |
---|
1208 | !---------------------------------------------------------------------- |
---|
1209 | |
---|
1210 | logical function oasis_reprosum_tolExceeded (name, nflds, master, & |
---|
1211 | logunit, rel_diff ) |
---|
1212 | !---------------------------------------------------------------------- |
---|
1213 | |
---|
1214 | ! Arguments |
---|
1215 | |
---|
1216 | character(len=*), intent(in) :: name !< distributed sum identifier |
---|
1217 | integer, intent(in) :: nflds !< number of fields |
---|
1218 | logical, intent(in) :: master !< process that will write |
---|
1219 | !< warning messages? |
---|
1220 | integer, optional, intent(in) :: logunit!< unit warning messages |
---|
1221 | !< written to |
---|
1222 | real(r8), intent(in) :: rel_diff(2,nflds) |
---|
1223 | !< relative and absolute |
---|
1224 | !< differences between fixed |
---|
1225 | !< and floating point sums |
---|
1226 | |
---|
1227 | ! Local workspace |
---|
1228 | |
---|
1229 | integer :: llogunit ! local log unit |
---|
1230 | integer :: ifld ! field index |
---|
1231 | integer :: exceeds_limit ! number of fields whose |
---|
1232 | ! sum exceeds tolerance |
---|
1233 | real(r8) :: max_rel_diff ! maximum relative difference |
---|
1234 | integer :: max_rel_diff_idx ! field index for max. rel. diff. |
---|
1235 | real(r8) :: max_abs_diff ! maximum absolute difference |
---|
1236 | integer :: max_abs_diff_idx ! field index for max. abs. diff. |
---|
1237 | character(len=*),parameter :: subname = '(oasis_reprosum_tolExceeded)' |
---|
1238 | |
---|
1239 | !----------------------------------------------------------------------- |
---|
1240 | |
---|
1241 | oasis_reprosum_tolExceeded = .false. |
---|
1242 | if (oasis_reprosum_reldiffmax < 0.0_r8) return |
---|
1243 | |
---|
1244 | if ( present(logunit) ) then |
---|
1245 | llogunit = logunit |
---|
1246 | else |
---|
1247 | llogunit = nulprt |
---|
1248 | endif |
---|
1249 | |
---|
1250 | ! check that "fast" reproducible sum is accurate enough. |
---|
1251 | exceeds_limit = 0 |
---|
1252 | max_rel_diff = 0.0_r8 |
---|
1253 | max_abs_diff = 0.0_r8 |
---|
1254 | do ifld=1,nflds |
---|
1255 | if (rel_diff(1,ifld) > oasis_reprosum_reldiffmax) then |
---|
1256 | exceeds_limit = exceeds_limit + 1 |
---|
1257 | if (rel_diff(1,ifld) > max_rel_diff) then |
---|
1258 | max_rel_diff = rel_diff(1,ifld) |
---|
1259 | max_rel_diff_idx = ifld |
---|
1260 | endif |
---|
1261 | if (rel_diff(2,ifld) > max_abs_diff) then |
---|
1262 | max_abs_diff = rel_diff(2,ifld) |
---|
1263 | max_abs_diff_idx = ifld |
---|
1264 | endif |
---|
1265 | endif |
---|
1266 | enddo |
---|
1267 | |
---|
1268 | if (exceeds_limit > 0) then |
---|
1269 | if (master) then |
---|
1270 | write(llogunit,*) subname,trim(name), & |
---|
1271 | ': difference in fixed and floating point sums ', & |
---|
1272 | ' exceeds tolerance in ', exceeds_limit, & |
---|
1273 | ' fields.' |
---|
1274 | write(llogunit,*) subname,' Maximum relative diff: (rel)', & |
---|
1275 | rel_diff(1,max_rel_diff_idx), ' (abs) ', & |
---|
1276 | rel_diff(2,max_rel_diff_idx) |
---|
1277 | write(llogunit,*) subname,' Maximum absolute diff: (rel)', & |
---|
1278 | rel_diff(1,max_abs_diff_idx), ' (abs) ', & |
---|
1279 | rel_diff(2,max_abs_diff_idx) |
---|
1280 | endif |
---|
1281 | oasis_reprosum_tolExceeded = .true. |
---|
1282 | endif |
---|
1283 | |
---|
1284 | |
---|
1285 | end function oasis_reprosum_tolExceeded |
---|
1286 | |
---|
1287 | !======================================================================== |
---|
1288 | ! |
---|
1289 | ! Purpose: |
---|
1290 | !> Compute the global sum of each field in "arr" using the indicated |
---|
1291 | !> communicator with a reproducible yet scalable implementation based |
---|
1292 | !> on He and Ding's implementation of the double-double algorithm. |
---|
1293 | ! |
---|
1294 | !---------------------------------------------------------------------- |
---|
1295 | |
---|
1296 | subroutine oasis_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & |
---|
1297 | nflds, mpi_comm ) |
---|
1298 | !---------------------------------------------------------------------- |
---|
1299 | |
---|
1300 | ! Arguments |
---|
1301 | |
---|
1302 | integer, intent(in) :: nsummands !< number of local summands |
---|
1303 | integer, intent(in) :: dsummands !< declared first dimension |
---|
1304 | integer, intent(in) :: nflds !< number of fields |
---|
1305 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
1306 | !< input array |
---|
1307 | integer, intent(in) :: mpi_comm !< MPI subcommunicator |
---|
1308 | |
---|
1309 | real(r8), intent(out):: arr_gsum(nflds) |
---|
1310 | !< global sums |
---|
1311 | |
---|
1312 | |
---|
1313 | ! Local workspace |
---|
1314 | |
---|
1315 | integer :: old_cw ! for x86 processors, save |
---|
1316 | ! current arithmetic mode |
---|
1317 | integer :: ifld, isum ! loop variables |
---|
1318 | integer :: ierr ! MPI error return |
---|
1319 | |
---|
1320 | real(r8) :: e, t1, t2 ! temporaries |
---|
1321 | complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double |
---|
1322 | ! format) |
---|
1323 | complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double |
---|
1324 | ! format) |
---|
1325 | |
---|
1326 | integer, save :: mpi_sumdd |
---|
1327 | logical, save :: first_time = .true. |
---|
1328 | character(len=*),parameter :: subname = '(oasis_reprosum_ddpdd)' |
---|
1329 | |
---|
1330 | !----------------------------------------------------------------------- |
---|
1331 | |
---|
1332 | call shr_reprosumx86_fix_start (old_cw) |
---|
1333 | |
---|
1334 | if (first_time) then |
---|
1335 | call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) |
---|
1336 | first_time = .false. |
---|
1337 | endif |
---|
1338 | |
---|
1339 | do ifld=1,nflds |
---|
1340 | arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) |
---|
1341 | |
---|
1342 | do isum=1,nsummands |
---|
1343 | |
---|
1344 | ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s |
---|
1345 | ! trick. |
---|
1346 | t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) |
---|
1347 | e = t1 - arr(isum,ifld) |
---|
1348 | t2 = ((real(arr_lsum_dd(ifld)) - e) & |
---|
1349 | + (arr(isum,ifld) - (t1 - e))) & |
---|
1350 | + aimag(arr_lsum_dd(ifld)) |
---|
1351 | |
---|
1352 | ! The result is t1 + t2, after normalization. |
---|
1353 | arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) |
---|
1354 | enddo |
---|
1355 | |
---|
1356 | enddo |
---|
1357 | |
---|
1358 | call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & |
---|
1359 | MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) |
---|
1360 | do ifld=1,nflds |
---|
1361 | arr_gsum(ifld) = real(arr_gsum_dd(ifld)) |
---|
1362 | enddo |
---|
1363 | |
---|
1364 | call shr_reprosumx86_fix_end (old_cw) |
---|
1365 | |
---|
1366 | end subroutine oasis_reprosum_ddpdd |
---|
1367 | |
---|
1368 | !----------------------------------------------------------------------- |
---|
1369 | |
---|
1370 | subroutine DDPDD (dda, ddb, len, itype) |
---|
1371 | !---------------------------------------------------------------------- |
---|
1372 | ! |
---|
1373 | ! Purpose: |
---|
1374 | ! Modification of original codes written by David H. Bailey |
---|
1375 | ! This subroutine computes ddb(i) = dda(i)+ddb(i) |
---|
1376 | ! |
---|
1377 | !---------------------------------------------------------------------- |
---|
1378 | |
---|
1379 | ! Arguments |
---|
1380 | |
---|
1381 | integer, intent(in) :: len ! array length |
---|
1382 | complex(r8), intent(in) :: dda(len) ! input |
---|
1383 | complex(r8), intent(inout) :: ddb(len) ! result |
---|
1384 | integer, intent(in) :: itype ! unused |
---|
1385 | |
---|
1386 | ! Local workspace |
---|
1387 | |
---|
1388 | real(r8) e, t1, t2 |
---|
1389 | integer i |
---|
1390 | character(len=*),parameter :: subname = '(oasis_reprosum_mod:DDPDD)' |
---|
1391 | |
---|
1392 | !----------------------------------------------------------------------- |
---|
1393 | |
---|
1394 | do i = 1, len |
---|
1395 | ! Compute dda + ddb using Knuth's trick. |
---|
1396 | t1 = real(dda(i)) + real(ddb(i)) |
---|
1397 | e = t1 - real(dda(i)) |
---|
1398 | t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & |
---|
1399 | + aimag(dda(i)) + aimag(ddb(i)) |
---|
1400 | |
---|
1401 | ! The result is t1 + t2, after normalization. |
---|
1402 | ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) |
---|
1403 | enddo |
---|
1404 | |
---|
1405 | |
---|
1406 | end subroutine DDPDD |
---|
1407 | |
---|
1408 | !----------------------------------------------------------------------- |
---|
1409 | |
---|
1410 | subroutine split_indices(total,num_pieces,ibeg,iend) |
---|
1411 | !---------------------------------------------------------------------- |
---|
1412 | ! |
---|
1413 | ! Purpose: |
---|
1414 | ! Split range into 'num_pieces' |
---|
1415 | ! |
---|
1416 | !---------------------------------------------------------------------- |
---|
1417 | |
---|
1418 | ! Arguments |
---|
1419 | |
---|
1420 | integer, intent(in) :: total |
---|
1421 | integer, intent(in) :: num_pieces |
---|
1422 | integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) |
---|
1423 | |
---|
1424 | ! Local workspace |
---|
1425 | |
---|
1426 | integer :: itmp1, itmp2, ioffset, i |
---|
1427 | character(len=*),parameter :: subname = '(oasis_reprosum_mod:split_indices)' |
---|
1428 | |
---|
1429 | !----------------------------------------------------------------------- |
---|
1430 | |
---|
1431 | itmp1 = total/num_pieces |
---|
1432 | itmp2 = mod(total,num_pieces) |
---|
1433 | ioffset = 0 |
---|
1434 | do i=1,itmp2 |
---|
1435 | ibeg(i) = ioffset + 1 |
---|
1436 | iend(i) = ioffset + (itmp1+1) |
---|
1437 | ioffset = iend(i) |
---|
1438 | enddo |
---|
1439 | do i=itmp2+1,num_pieces |
---|
1440 | ibeg(i) = ioffset + 1 |
---|
1441 | if (ibeg(i) > total) then |
---|
1442 | iend(i) = ibeg(i) - 1 |
---|
1443 | else |
---|
1444 | iend(i) = ioffset + itmp1 |
---|
1445 | ioffset = iend(i) |
---|
1446 | endif |
---|
1447 | enddo |
---|
1448 | |
---|
1449 | end subroutine split_indices |
---|
1450 | |
---|
1451 | !======================================================================== |
---|
1452 | |
---|
1453 | end module mod_oasis_reprosum |
---|