[6331] | 1 | |
---|
| 2 | !> OASIS reproducible sum method from P. Worley |
---|
| 3 | |
---|
| 4 | MODULE mod_oasis_reprosum |
---|
| 5 | |
---|
| 6 | !----------------------------------------------------------------------- |
---|
| 7 | ! |
---|
| 8 | ! Purpose: |
---|
| 9 | !> Compute reproducible global sums of a set of arrays across an MPI |
---|
| 10 | !> subcommunicator |
---|
| 11 | ! |
---|
| 12 | ! Methods: |
---|
| 13 | !> Compute using either or both a scalable, reproducible algorithm and a |
---|
| 14 | !> scalable, nonreproducible algorithm: |
---|
| 15 | !> * Reproducible (scalable): |
---|
| 16 | !> Convert to fixed point (integer vector representation) to enable |
---|
| 17 | !> reproducibility when using MPI_Allreduce |
---|
| 18 | !> * Alternative usually reproducible (scalable): |
---|
| 19 | !> Use parallel double-double algorithm due to Helen He and |
---|
| 20 | !> Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm |
---|
| 21 | !> * Nonreproducible (scalable): |
---|
| 22 | !> Floating point and MPI_Allreduce based. |
---|
| 23 | !> If computing both reproducible and nonreproducible sums, compare |
---|
| 24 | !> these and report relative difference (if absolute difference |
---|
| 25 | !> less than sum) or absolute difference back to calling routine. |
---|
| 26 | ! |
---|
| 27 | ! Author: P. Worley (based on suggestions from J. White for fixed |
---|
| 28 | ! point algorithm and on He/Ding paper for ddpdd |
---|
| 29 | ! algorithm) |
---|
| 30 | ! |
---|
| 31 | ! Modified by T.Craig for Oasis, May 2017 based on the public version in |
---|
| 32 | ! CESM1.2. |
---|
| 33 | ! |
---|
| 34 | !----------------------------------------------------------------------- |
---|
| 35 | |
---|
| 36 | !----------------------------------------------------------------------- |
---|
| 37 | !- use statements ------------------------------------------------------ |
---|
| 38 | !----------------------------------------------------------------------- |
---|
| 39 | #if ( defined noI8 ) |
---|
| 40 | ! Workaround for when shr_kind_i8 is not supported. |
---|
| 41 | use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i4_p |
---|
| 42 | #else |
---|
| 43 | use mod_oasis_kinds, only: r8 => ip_r8_p, i8 => ip_i8_p |
---|
| 44 | #endif |
---|
| 45 | use mod_oasis_data, only: nulprt |
---|
| 46 | use mod_oasis_sys, only: oasis_abort, estr |
---|
| 47 | use mod_oasis_mpi, only: oasis_mpi_barrier |
---|
| 48 | use mod_oasis_timer, only: oasis_timer_start, oasis_timer_stop |
---|
| 49 | use, intrinsic :: iso_c_binding |
---|
| 50 | |
---|
| 51 | !----------------------------------------------------------------------- |
---|
| 52 | !- module boilerplate -------------------------------------------------- |
---|
| 53 | !----------------------------------------------------------------------- |
---|
| 54 | implicit none |
---|
| 55 | private |
---|
| 56 | |
---|
| 57 | !----------------------------------------------------------------------- |
---|
| 58 | !- include statements -------------------------------------------------- |
---|
| 59 | !----------------------------------------------------------------------- |
---|
| 60 | #include <mpif.h> |
---|
| 61 | |
---|
| 62 | !----------------------------------------------------------------------- |
---|
| 63 | ! Public interfaces ---------------------------------------------------- |
---|
| 64 | !----------------------------------------------------------------------- |
---|
| 65 | public :: & |
---|
| 66 | oasis_reprosum_setopts, &! set runtime options |
---|
| 67 | oasis_reprosum_calc, &! calculate distributed sum |
---|
| 68 | oasis_reprosum_tolExceeded ! utility function to check relative |
---|
| 69 | ! differences against the tolerance |
---|
| 70 | |
---|
| 71 | !----------------------------------------------------------------------- |
---|
| 72 | ! Public data ---------------------------------------------------------- |
---|
| 73 | !----------------------------------------------------------------------- |
---|
| 74 | logical, public :: oasis_reprosum_recompute = .false. |
---|
| 75 | |
---|
| 76 | real(r8), public :: oasis_reprosum_reldiffmax = -1.0_r8 |
---|
| 77 | |
---|
| 78 | !----------------------------------------------------------------------- |
---|
| 79 | ! Private interfaces --------------------------------------------------- |
---|
| 80 | !----------------------------------------------------------------------- |
---|
| 81 | private :: & |
---|
| 82 | ddpdd, &! double-double sum routine |
---|
| 83 | split_indices ! split indices among OMP threads |
---|
| 84 | |
---|
| 85 | !----------------------------------------------------------------------- |
---|
| 86 | ! Private data --------------------------------------------------------- |
---|
| 87 | !----------------------------------------------------------------------- |
---|
| 88 | |
---|
| 89 | logical :: repro_sum_use_ddpdd = .false. |
---|
| 90 | logical :: detailed_timing = .false. |
---|
| 91 | |
---|
| 92 | !----------------------------------------------------------------------- |
---|
| 93 | ! External C bindings -------------------------------------------------- |
---|
| 94 | !----------------------------------------------------------------------- |
---|
| 95 | |
---|
| 96 | interface |
---|
| 97 | subroutine shr_reprosumx86_fix_start(old_cw) bind(C) |
---|
| 98 | use iso_c_binding, only: c_int |
---|
| 99 | integer(c_int), intent(out) :: old_cw |
---|
| 100 | end subroutine shr_reprosumx86_fix_start |
---|
| 101 | |
---|
| 102 | subroutine shr_reprosumx86_fix_end(old_cw) bind(C) |
---|
| 103 | use iso_c_binding, only: c_int |
---|
| 104 | integer(c_int), intent(in) :: old_cw |
---|
| 105 | end subroutine shr_reprosumx86_fix_end |
---|
| 106 | end interface |
---|
| 107 | |
---|
| 108 | CONTAINS |
---|
| 109 | |
---|
| 110 | !======================================================================== |
---|
| 111 | !----------------------------------------------------------------------- |
---|
| 112 | ! Purpose: |
---|
| 113 | !> Set runtime options |
---|
| 114 | ! Author: P. Worley |
---|
| 115 | !----------------------------------------------------------------------- |
---|
| 116 | |
---|
| 117 | subroutine oasis_reprosum_setopts(repro_sum_use_ddpdd_in, & |
---|
| 118 | repro_sum_rel_diff_max_in, & |
---|
| 119 | repro_sum_recompute_in, & |
---|
| 120 | repro_sum_master, & |
---|
| 121 | repro_sum_logunit ) |
---|
| 122 | |
---|
| 123 | !------------------------------Arguments-------------------------------- |
---|
| 124 | logical, intent(in), optional :: repro_sum_use_ddpdd_in |
---|
| 125 | !< Use DDPDD algorithm instead of fixed precision algorithm |
---|
| 126 | real(r8), intent(in), optional :: repro_sum_rel_diff_max_in |
---|
| 127 | !< maximum permissible difference between reproducible and |
---|
| 128 | !< nonreproducible sums |
---|
| 129 | logical, intent(in), optional :: repro_sum_recompute_in |
---|
| 130 | !< recompute using different algorithm when difference between |
---|
| 131 | !< reproducible and nonreproducible sums is too great |
---|
| 132 | logical, intent(in), optional :: repro_sum_master |
---|
| 133 | !< flag indicating whether this process should output |
---|
| 134 | !< log messages |
---|
| 135 | integer, intent(in), optional :: repro_sum_logunit |
---|
| 136 | !< unit number for log messages |
---|
| 137 | !---------------------------Local Workspace----------------------------- |
---|
| 138 | integer llogunit ! unit number for log messages |
---|
| 139 | logical master ! local master? |
---|
| 140 | logical,save :: firstcall = .true. ! first call |
---|
| 141 | character(len=*),parameter :: subname = '(oasis_reprosum_setopts)' |
---|
| 142 | !----------------------------------------------------------------------- |
---|
| 143 | |
---|
| 144 | if ( present(repro_sum_master) ) then |
---|
| 145 | master = repro_sum_master |
---|
| 146 | else |
---|
| 147 | master = .false. |
---|
| 148 | endif |
---|
| 149 | |
---|
| 150 | if ( present(repro_sum_logunit) ) then |
---|
| 151 | llogunit = repro_sum_logunit |
---|
| 152 | else |
---|
| 153 | llogunit = nulprt |
---|
| 154 | endif |
---|
| 155 | |
---|
| 156 | if (.not. firstcall) then |
---|
| 157 | write(llogunit,*) subname,estr,' can only be called once' |
---|
| 158 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
| 159 | endif |
---|
| 160 | firstcall = .false. |
---|
| 161 | |
---|
| 162 | if ( present(repro_sum_use_ddpdd_in) ) then |
---|
| 163 | repro_sum_use_ddpdd = repro_sum_use_ddpdd_in |
---|
| 164 | endif |
---|
| 165 | if ( present(repro_sum_rel_diff_max_in) ) then |
---|
| 166 | oasis_reprosum_reldiffmax = repro_sum_rel_diff_max_in |
---|
| 167 | endif |
---|
| 168 | if ( present(repro_sum_recompute_in) ) then |
---|
| 169 | oasis_reprosum_recompute = repro_sum_recompute_in |
---|
| 170 | endif |
---|
| 171 | if (master) then |
---|
| 172 | if ( repro_sum_use_ddpdd ) then |
---|
| 173 | write(llogunit,*) subname, & |
---|
| 174 | 'Using double-double-based (scalable) usually reproducible ', & |
---|
| 175 | 'distributed sum algorithm' |
---|
| 176 | else |
---|
| 177 | write(llogunit,*) subname, & |
---|
| 178 | 'Using fixed-point-based (scalable) reproducible ', & |
---|
| 179 | 'distributed sum algorithm' |
---|
| 180 | endif |
---|
| 181 | |
---|
| 182 | if (oasis_reprosum_reldiffmax >= 0._r8) then |
---|
| 183 | write(llogunit,*) subname, & |
---|
| 184 | ' with a maximum relative error tolerance of ', & |
---|
| 185 | oasis_reprosum_reldiffmax |
---|
| 186 | if (oasis_reprosum_recompute) then |
---|
| 187 | write(llogunit,*) subname, & |
---|
| 188 | 'If tolerance exceeded, sum is recomputed using ', & |
---|
| 189 | 'a serial algorithm.' |
---|
| 190 | else |
---|
| 191 | write(llogunit,*) subname, & |
---|
| 192 | 'If tolerance exceeded, fixed-precision is sum used ', & |
---|
| 193 | 'but a warning is output.' |
---|
| 194 | endif |
---|
| 195 | else |
---|
| 196 | write(llogunit,*) subname, & |
---|
| 197 | 'and not comparing with floating point algorithms.' |
---|
| 198 | endif |
---|
| 199 | |
---|
| 200 | endif |
---|
| 201 | end subroutine oasis_reprosum_setopts |
---|
| 202 | |
---|
| 203 | !======================================================================== |
---|
| 204 | |
---|
| 205 | ! |
---|
| 206 | ! Purpose: |
---|
| 207 | !> Compute the global sum of each field in "arr" using the indicated |
---|
| 208 | !> communicator with a reproducible yet scalable implementation based |
---|
| 209 | !> on a fixed point algorithm. An alternative is to use an "almost |
---|
| 210 | !> always reproducible" floating point algorithm. |
---|
| 211 | ! |
---|
| 212 | ! The accuracy of the fixed point algorithm is controlled by the |
---|
| 213 | ! number of "levels" of integer expansion. The algorithm will calculate |
---|
| 214 | ! the number of levels that is required for the sum to be essentially |
---|
| 215 | ! exact. The optional parameter arr_max_levels can be used to override |
---|
| 216 | ! the calculated value. The optional parameter arr_max_levels_out can be |
---|
| 217 | ! used to return the values used. |
---|
| 218 | ! |
---|
| 219 | ! The algorithm also requires an upper bound on |
---|
| 220 | ! the maximum summand (in absolute value) for each field, and will |
---|
| 221 | ! calculate this internally. However, if the optional parameters |
---|
| 222 | ! arr_max_levels and arr_gbl_max are both set, then the algorithm will |
---|
| 223 | ! use the values in arr_gbl_max for the upper bounds instead. If these |
---|
| 224 | ! are not upper bounds, or if the upper bounds are not tight enough |
---|
| 225 | ! to achieve the requisite accuracy, and if the optional parameter |
---|
| 226 | ! repro_sum_validate is NOT set to .false., the algorithm will repeat the |
---|
| 227 | ! computation with appropriate upper bounds. If only arr_gbl_max is present, |
---|
| 228 | ! then the maxima are computed internally (and the specified values are |
---|
| 229 | ! ignored). The optional parameter arr_gbl_max_out can be |
---|
| 230 | ! used to return the values used. |
---|
| 231 | ! |
---|
| 232 | ! Finally, the algorithm requires an upper bound on the number of |
---|
| 233 | ! local summands across all processes. This will be calculated internally, |
---|
| 234 | ! using an MPI collective, but the value in the optional argument |
---|
| 235 | ! gbl_max_nsummands will be used instead if (1) it is present, (2) |
---|
| 236 | ! it is > 0, and (3) the maximum value and required number of levels |
---|
| 237 | ! are also specified. (If the maximum value is calculated, the same |
---|
| 238 | ! MPI collective is used to determine the maximum number of local |
---|
| 239 | ! summands.) The accuracy of the user-specified value is not checked. |
---|
| 240 | ! However, if set to < 1, the value will instead be calculated. If the |
---|
| 241 | ! optional parameter gbl_max_nsummands_out is present, then the value |
---|
| 242 | ! used (gbl_max_nsummands if >= 1; calculated otherwise) will be |
---|
| 243 | ! returned. |
---|
| 244 | ! |
---|
| 245 | ! If requested (by setting oasis_reprosum_reldiffmax >= 0.0 and passing in |
---|
| 246 | ! the optional rel_diff parameter), results are compared with a |
---|
| 247 | ! nonreproducible floating point algorithm. |
---|
| 248 | ! |
---|
| 249 | ! Note that the cost of the algorithm is not strongly correlated with |
---|
| 250 | ! the number of levels, which primarily shows up as a (modest) increase |
---|
| 251 | ! in cost of the MPI_Allreduce as a function of vector length. Rather the |
---|
| 252 | ! cost is more a function of (a) the number of integers required to |
---|
| 253 | ! represent an individual summand and (b) the number of MPI_Allreduce |
---|
| 254 | ! calls. The number of integers required to represent an individual |
---|
| 255 | ! summand is 1 or 2 when using 8-byte integers for 8-byte real summands |
---|
| 256 | ! when the number of local summands is not too large. As the number of |
---|
| 257 | ! local summands increases, the number of integers required increases. |
---|
| 258 | ! The number of MPI_Allreduce calls is either 2 (specifying nothing) or |
---|
| 259 | ! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max |
---|
| 260 | ! correctly). When specifying arr_max_levels and arr_gbl_max |
---|
| 261 | ! incorrectly, 3 or 4 MPI_Allreduce calls will be required. |
---|
| 262 | ! |
---|
| 263 | ! The alternative algorithm is a minor modification of a parallel |
---|
| 264 | ! implementation of David Bailey's routine DDPDD by Helen He |
---|
| 265 | ! and Chris Ding. Bailey uses the Knuth trick to implement quadruple |
---|
| 266 | ! precision summation of double precision values with 10 double |
---|
| 267 | ! precision operations. The advantage of this algorithm is that |
---|
| 268 | ! it requires a single MPI_Allreduce and is less expensive per summand |
---|
| 269 | ! than is the fixed precision algorithm. The disadvantage is that it |
---|
| 270 | ! is not guaranteed to be reproducible (though it is reproducible |
---|
| 271 | ! much more often than is the standard algorithm). This alternative |
---|
| 272 | ! is used when the optional parameter ddpdd_sum is set to .true. It is |
---|
| 273 | ! also used if the fixed precision algorithm radix assumption does not |
---|
| 274 | ! hold. |
---|
| 275 | |
---|
| 276 | !---------------------------------------------------------------------- |
---|
| 277 | subroutine oasis_reprosum_calc (arr, arr_gsum, nsummands, dsummands, & |
---|
| 278 | nflds, ddpdd_sum, & |
---|
| 279 | arr_gbl_max, arr_gbl_max_out, & |
---|
| 280 | arr_max_levels, arr_max_levels_out, & |
---|
| 281 | gbl_max_nsummands, gbl_max_nsummands_out,& |
---|
| 282 | gbl_count, repro_sum_validate, & |
---|
| 283 | repro_sum_stats, rel_diff, commid ) |
---|
| 284 | !---------------------------------------------------------------------- |
---|
| 285 | |
---|
| 286 | ! Arguments |
---|
| 287 | |
---|
| 288 | integer, intent(in) :: nsummands !< number of local summands |
---|
| 289 | integer, intent(in) :: dsummands !< declared first dimension |
---|
| 290 | integer, intent(in) :: nflds !< number of fields |
---|
| 291 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
| 292 | !< input array |
---|
| 293 | |
---|
| 294 | real(r8), intent(out):: arr_gsum(nflds) |
---|
| 295 | !< global means |
---|
| 296 | |
---|
| 297 | logical, intent(in), optional :: ddpdd_sum |
---|
| 298 | !< use ddpdd algorithm instead |
---|
| 299 | !< of fixed precision algorithm |
---|
| 300 | |
---|
| 301 | real(r8), intent(in), optional :: arr_gbl_max(nflds) |
---|
| 302 | !< upper bound on max(abs(arr)) |
---|
| 303 | |
---|
| 304 | real(r8), intent(out), optional :: arr_gbl_max_out(nflds) |
---|
| 305 | !< calculated upper bound on |
---|
| 306 | !< max(abs(arr)) |
---|
| 307 | |
---|
| 308 | integer, intent(in), optional :: arr_max_levels(nflds) |
---|
| 309 | !< maximum number of levels of |
---|
| 310 | !< integer expansion to use |
---|
| 311 | |
---|
| 312 | integer, intent(out), optional :: arr_max_levels_out(nflds) |
---|
| 313 | !< output of number of levels of |
---|
| 314 | !< integer expansion to used |
---|
| 315 | |
---|
| 316 | integer, intent(in), optional :: gbl_max_nsummands |
---|
| 317 | !< maximum of nsummand over all |
---|
| 318 | !< processes |
---|
| 319 | |
---|
| 320 | integer, intent(out), optional :: gbl_max_nsummands_out |
---|
| 321 | !< calculated maximum nsummands |
---|
| 322 | !< over all processes |
---|
| 323 | |
---|
| 324 | integer, intent(in), optional :: gbl_count |
---|
| 325 | !< was total number of summands; |
---|
| 326 | !< now is ignored; use |
---|
| 327 | !< gbl_max_nsummands instead |
---|
| 328 | |
---|
| 329 | logical, intent(in), optional :: repro_sum_validate |
---|
| 330 | !< flag enabling/disabling testing that gmax and max_levels are |
---|
| 331 | !< accurate/sufficient. Default is enabled. |
---|
| 332 | |
---|
| 333 | integer, intent(inout), optional :: repro_sum_stats(5) |
---|
| 334 | !< increment running totals for |
---|
| 335 | !< (1) one-reduction repro_sum |
---|
| 336 | !< (2) two-reduction repro_sum |
---|
| 337 | !< (3) both types in one call |
---|
| 338 | !< (4) nonrepro_sum |
---|
| 339 | !< (5) global max nsummands reduction |
---|
| 340 | |
---|
| 341 | real(r8), intent(out), optional :: rel_diff(2,nflds) |
---|
| 342 | !< relative and absolute |
---|
| 343 | !< differences between fixed |
---|
| 344 | !< and floating point sums |
---|
| 345 | |
---|
| 346 | integer, intent(in), optional :: commid |
---|
| 347 | !< MPI communicator |
---|
| 348 | |
---|
| 349 | ! Local workspace |
---|
| 350 | |
---|
| 351 | logical :: use_ddpdd_sum ! flag indicating whether to |
---|
| 352 | ! use oasis_reprosum_ddpdd or not |
---|
| 353 | logical :: recompute ! flag indicating need to |
---|
| 354 | ! determine gmax/gmin before |
---|
| 355 | ! computing sum |
---|
| 356 | logical :: validate ! flag indicating need to |
---|
| 357 | ! verify gmax and max_levels |
---|
| 358 | ! are accurate/sufficient |
---|
| 359 | integer :: omp_nthreads ! number of OpenMP threads |
---|
| 360 | integer :: mpi_comm ! MPI subcommunicator |
---|
| 361 | integer :: tasks ! number of MPI processes |
---|
| 362 | integer :: ierr ! MPI error return |
---|
| 363 | integer :: ifld, isum, ithread ! loop variables |
---|
| 364 | integer :: max_nsummands ! max nsummands over all processes |
---|
| 365 | ! or threads (used in both ways) |
---|
| 366 | |
---|
| 367 | integer, allocatable :: isum_beg(:), isum_end(:) |
---|
| 368 | ! range of summand indices for each |
---|
| 369 | ! OpenMP thread |
---|
| 370 | integer, allocatable :: arr_tlmin_exp(:,:) |
---|
| 371 | ! per thread local exponent minima |
---|
| 372 | integer, allocatable :: arr_tlmax_exp(:,:) |
---|
| 373 | ! per thread local exponent maxima |
---|
| 374 | integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax |
---|
| 375 | ! summand exponent and working min/max |
---|
| 376 | integer :: arr_lmin_exp(nflds) ! local exponent minima |
---|
| 377 | integer :: arr_lmax_exp(nflds) ! local exponent maxima |
---|
| 378 | integer :: arr_lextremes(0:nflds,2)! local exponent extrema |
---|
| 379 | integer :: arr_gextremes(0:nflds,2)! global exponent extrema |
---|
| 380 | |
---|
| 381 | integer :: arr_gmax_exp(nflds) ! global exponents maxima |
---|
| 382 | integer :: arr_gmin_exp(nflds) ! global exponents minima |
---|
| 383 | integer :: arr_max_shift ! maximum safe exponent for |
---|
| 384 | ! value < 1 (so that sum does |
---|
| 385 | ! not overflow) |
---|
| 386 | integer :: max_levels(nflds) ! maximum number of levels of |
---|
| 387 | ! integer expansion to use |
---|
| 388 | integer :: max_level ! maximum value in max_levels |
---|
| 389 | integer :: gbl_max_red ! global max local sum reduction? (0/1) |
---|
| 390 | integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) |
---|
| 391 | integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) |
---|
| 392 | integer :: repro_sum_both ! both fast and slow? (0/1) |
---|
| 393 | integer :: nonrepro_sum ! nonrepro_sum? (0/1) |
---|
| 394 | |
---|
| 395 | real(r8) :: xmax_nsummands ! dble of max_nsummands |
---|
| 396 | real(r8) :: arr_lsum(nflds) ! local sums |
---|
| 397 | real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using |
---|
| 398 | ! fast, nonreproducible, |
---|
| 399 | ! floating point alg. |
---|
| 400 | real(r8) :: abs_diff ! absolute difference between |
---|
| 401 | ! fixed and floating point |
---|
| 402 | ! sums |
---|
| 403 | #ifdef _OPENMP |
---|
| 404 | integer omp_get_max_threads |
---|
| 405 | external omp_get_max_threads |
---|
| 406 | #endif |
---|
| 407 | character(len=*),parameter :: subname = '(oasis_reprosum_calc)' |
---|
| 408 | |
---|
| 409 | !----------------------------------------------------------------------- |
---|
| 410 | |
---|
| 411 | ! check whether should use oasis_reprosum_ddpdd algorithm |
---|
| 412 | use_ddpdd_sum = repro_sum_use_ddpdd |
---|
| 413 | if ( present(ddpdd_sum) ) then |
---|
| 414 | use_ddpdd_sum = ddpdd_sum |
---|
| 415 | endif |
---|
| 416 | |
---|
| 417 | ! check whether intrinsic-based algorithm will work on this system |
---|
| 418 | ! (requires floating point and integer bases to be the same) |
---|
| 419 | ! If not, always use ddpdd. |
---|
| 420 | use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) |
---|
| 421 | |
---|
| 422 | ! initialize local statistics variables |
---|
| 423 | gbl_max_red = 0 |
---|
| 424 | repro_sum_fast = 0 |
---|
| 425 | repro_sum_slow = 0 |
---|
| 426 | repro_sum_both = 0 |
---|
| 427 | nonrepro_sum = 0 |
---|
| 428 | |
---|
| 429 | ! set MPI communicator |
---|
| 430 | if ( present(commid) ) then |
---|
| 431 | mpi_comm = commid |
---|
| 432 | else |
---|
| 433 | mpi_comm = MPI_COMM_WORLD |
---|
| 434 | endif |
---|
| 435 | |
---|
| 436 | if (detailed_timing) then |
---|
| 437 | call oasis_timer_start('oasis_reprosum_prebarrier') |
---|
| 438 | call oasis_mpi_barrier(mpi_comm,subname) |
---|
| 439 | call oasis_timer_stop ('oasis_reprosum_prebarrier') |
---|
| 440 | endif |
---|
| 441 | |
---|
| 442 | if ( use_ddpdd_sum ) then |
---|
| 443 | |
---|
| 444 | if (detailed_timing) call oasis_timer_start('oasis_reprosum_ddpdd') |
---|
| 445 | |
---|
| 446 | call oasis_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & |
---|
| 447 | nflds, mpi_comm) |
---|
| 448 | repro_sum_fast = 1 |
---|
| 449 | |
---|
| 450 | if (detailed_timing) call oasis_timer_stop('oasis_reprosum_ddpdd') |
---|
| 451 | |
---|
| 452 | else |
---|
| 453 | |
---|
| 454 | if (detailed_timing) call oasis_timer_start('oasis_reprosum_int') |
---|
| 455 | |
---|
| 456 | ! get number of MPI tasks |
---|
| 457 | call mpi_comm_size(mpi_comm, tasks, ierr) |
---|
| 458 | |
---|
| 459 | ! get number of OpenMP threads |
---|
| 460 | #ifdef _OPENMP |
---|
| 461 | omp_nthreads = omp_get_max_threads() |
---|
| 462 | #else |
---|
| 463 | omp_nthreads = 1 |
---|
| 464 | #endif |
---|
| 465 | |
---|
| 466 | ! see if have sufficient information to not require max/min allreduce |
---|
| 467 | recompute = .true. |
---|
| 468 | validate = .false. |
---|
| 469 | if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then |
---|
| 470 | recompute = .false. |
---|
| 471 | |
---|
| 472 | ! setting lower bound on max_level*nflds to be 64 to improve OpenMP |
---|
| 473 | ! performance for loopb in oasis_reprosum_int |
---|
| 474 | max_level = (64/nflds) + 1 |
---|
| 475 | do ifld=1,nflds |
---|
| 476 | if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & |
---|
| 477 | (arr_max_levels(ifld) > 0)) then |
---|
| 478 | |
---|
| 479 | arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) |
---|
| 480 | if (max_level < arr_max_levels(ifld)) & |
---|
| 481 | max_level = arr_max_levels(ifld) |
---|
| 482 | |
---|
| 483 | else |
---|
| 484 | recompute = .true. |
---|
| 485 | endif |
---|
| 486 | enddo |
---|
| 487 | |
---|
| 488 | if (.not. recompute) then |
---|
| 489 | |
---|
| 490 | ! determine maximum number of summands in local phases of the |
---|
| 491 | ! algorithm |
---|
| 492 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_max") |
---|
| 493 | if ( present(gbl_max_nsummands) ) then |
---|
| 494 | if (gbl_max_nsummands < 1) then |
---|
| 495 | call mpi_allreduce (nsummands, max_nsummands, 1, & |
---|
| 496 | MPI_INTEGER, MPI_MAX, mpi_comm, ierr) |
---|
| 497 | gbl_max_red = 1 |
---|
| 498 | else |
---|
| 499 | max_nsummands = gbl_max_nsummands |
---|
| 500 | endif |
---|
| 501 | else |
---|
| 502 | call mpi_allreduce (nsummands, max_nsummands, 1, & |
---|
| 503 | MPI_INTEGER, MPI_MAX, mpi_comm, ierr) |
---|
| 504 | gbl_max_red = 1 |
---|
| 505 | endif |
---|
| 506 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_max") |
---|
| 507 | |
---|
| 508 | ! determine maximum shift. Shift needs to be small enough that summation |
---|
| 509 | ! does not exceed maximum number of digits in i8. |
---|
| 510 | |
---|
| 511 | ! if requested, return max_nsummands before it is redefined |
---|
| 512 | if ( present( gbl_max_nsummands_out) ) then |
---|
| 513 | gbl_max_nsummands_out = max_nsummands |
---|
| 514 | endif |
---|
| 515 | |
---|
| 516 | ! summing within each thread first |
---|
| 517 | max_nsummands = (max_nsummands/omp_nthreads) + 1 |
---|
| 518 | ! then over threads and tasks |
---|
| 519 | max_nsummands = max(max_nsummands, tasks*omp_nthreads) |
---|
| 520 | |
---|
| 521 | xmax_nsummands = dble(max_nsummands) |
---|
| 522 | arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) |
---|
| 523 | if (arr_max_shift < 2) then |
---|
| 524 | write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm' |
---|
| 525 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
| 526 | endif |
---|
| 527 | |
---|
| 528 | ! calculate sum |
---|
| 529 | if (present(repro_sum_validate)) then |
---|
| 530 | validate = repro_sum_validate |
---|
| 531 | else |
---|
| 532 | validate = .true. |
---|
| 533 | endif |
---|
| 534 | call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, & |
---|
| 535 | nflds, arr_max_shift, arr_gmax_exp, & |
---|
| 536 | arr_max_levels, max_level, validate, & |
---|
| 537 | recompute, omp_nthreads, mpi_comm) |
---|
| 538 | |
---|
| 539 | ! record statistics, etc. |
---|
| 540 | repro_sum_fast = 1 |
---|
| 541 | if (recompute) then |
---|
| 542 | repro_sum_both = 1 |
---|
| 543 | else |
---|
| 544 | ! if requested, return specified levels and upper bounds on maxima |
---|
| 545 | if ( present(arr_max_levels_out) ) then |
---|
| 546 | do ifld=1,nflds |
---|
| 547 | arr_max_levels_out(ifld) = arr_max_levels(ifld) |
---|
| 548 | enddo |
---|
| 549 | endif |
---|
| 550 | if ( present(arr_gbl_max_out) ) then |
---|
| 551 | do ifld=1,nflds |
---|
| 552 | arr_gbl_max_out(ifld) = arr_gbl_max(ifld) |
---|
| 553 | enddo |
---|
| 554 | endif |
---|
| 555 | endif |
---|
| 556 | endif |
---|
| 557 | endif |
---|
| 558 | |
---|
| 559 | ! do not have sufficient information; calculate global max/min and |
---|
| 560 | ! use to compute required number of levels |
---|
| 561 | if (recompute) then |
---|
| 562 | |
---|
| 563 | ! record statistic |
---|
| 564 | repro_sum_slow = 1 |
---|
| 565 | |
---|
| 566 | ! determine maximum and minimum (non-zero) summand values and |
---|
| 567 | ! maximum number of local summands |
---|
| 568 | |
---|
| 569 | ! allocate thread-specific work space |
---|
| 570 | allocate(arr_tlmax_exp(nflds,omp_nthreads)) |
---|
| 571 | allocate(arr_tlmin_exp(nflds,omp_nthreads)) |
---|
| 572 | allocate(isum_beg(omp_nthreads)) |
---|
| 573 | allocate(isum_end(omp_nthreads)) |
---|
| 574 | |
---|
| 575 | ! split summand index range over OpenMP threads |
---|
| 576 | call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) |
---|
| 577 | |
---|
| 578 | !$omp parallel do & |
---|
| 579 | !$omp default(shared) & |
---|
| 580 | !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) |
---|
| 581 | do ithread=1,omp_nthreads |
---|
| 582 | if (detailed_timing) call oasis_timer_start('repro_sum_loopa') |
---|
| 583 | do ifld=1,nflds |
---|
| 584 | arr_exp_tlmin = MAXEXPONENT(1._r8) |
---|
| 585 | arr_exp_tlmax = MINEXPONENT(1._r8) |
---|
| 586 | do isum=isum_beg(ithread),isum_end(ithread) |
---|
| 587 | if (arr(isum,ifld) .ne. 0.0_r8) then |
---|
| 588 | arr_exp = exponent(arr(isum,ifld)) |
---|
| 589 | arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) |
---|
| 590 | arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) |
---|
| 591 | endif |
---|
| 592 | end do |
---|
| 593 | arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin |
---|
| 594 | arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax |
---|
| 595 | end do |
---|
| 596 | if (detailed_timing) call oasis_timer_stop('repro_sum_loopa') |
---|
| 597 | end do |
---|
| 598 | |
---|
| 599 | do ifld=1,nflds |
---|
| 600 | arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) |
---|
| 601 | arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) |
---|
| 602 | end do |
---|
| 603 | deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) |
---|
| 604 | |
---|
| 605 | arr_lextremes(0,:) = -nsummands |
---|
| 606 | arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) |
---|
| 607 | arr_lextremes(1:nflds,2) = arr_lmin_exp(:) |
---|
| 608 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_minmax") |
---|
| 609 | call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & |
---|
| 610 | MPI_INTEGER, MPI_MIN, mpi_comm, ierr) |
---|
| 611 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_minmax") |
---|
| 612 | max_nsummands = -arr_gextremes(0,1) |
---|
| 613 | arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) |
---|
| 614 | arr_gmin_exp(:) = arr_gextremes(1:nflds,2) |
---|
| 615 | |
---|
| 616 | ! if a field is identically zero, arr_gmin_exp still equals MAXEXPONENT |
---|
| 617 | ! and arr_gmax_exp still equals MINEXPONENT. In this case, set |
---|
| 618 | ! arr_gmin_exp = arr_gmax_exp = MINEXPONENT |
---|
| 619 | do ifld=1,nflds |
---|
| 620 | arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) |
---|
| 621 | enddo |
---|
| 622 | |
---|
| 623 | ! if requested, return upper bounds on observed maxima |
---|
| 624 | if ( present(arr_gbl_max_out) ) then |
---|
| 625 | do ifld=1,nflds |
---|
| 626 | arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) |
---|
| 627 | enddo |
---|
| 628 | endif |
---|
| 629 | |
---|
| 630 | ! if requested, return max_nsummands before it is redefined |
---|
| 631 | if ( present( gbl_max_nsummands_out) ) then |
---|
| 632 | gbl_max_nsummands_out = max_nsummands |
---|
| 633 | endif |
---|
| 634 | |
---|
| 635 | ! determine maximum shift (same as in previous branch, but with calculated |
---|
| 636 | ! max_nsummands). Shift needs to be small enough that summation does not |
---|
| 637 | ! exceed maximum number of digits in i8. |
---|
| 638 | |
---|
| 639 | ! summing within each thread first |
---|
| 640 | max_nsummands = (max_nsummands/omp_nthreads) + 1 |
---|
| 641 | ! then over threads and tasks |
---|
| 642 | max_nsummands = max(max_nsummands, tasks*omp_nthreads) |
---|
| 643 | |
---|
| 644 | xmax_nsummands = dble(max_nsummands) |
---|
| 645 | arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) |
---|
| 646 | if (arr_max_shift < 2) then |
---|
| 647 | write(nulprt,*) subname,estr,' number of summands too large for fixed precision algorithm' |
---|
| 648 | call oasis_abort(file=__FILE__,line=__LINE__) |
---|
| 649 | endif |
---|
| 650 | |
---|
| 651 | ! determine maximum number of levels required for each field |
---|
| 652 | ! ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) |
---|
| 653 | ! + 1 because first truncation probably does not involve a maximal shift |
---|
| 654 | ! + 1 to guarantee that the integer division rounds up (not down) |
---|
| 655 | ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP |
---|
| 656 | ! performance for loopb in oasis_reprosum_int) |
---|
| 657 | max_level = (64/nflds) + 1 |
---|
| 658 | do ifld=1,nflds |
---|
| 659 | max_levels(ifld) = 2 + & |
---|
| 660 | ((digits(0_i8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & |
---|
| 661 | / arr_max_shift) |
---|
| 662 | if ( present(arr_max_levels) .and. (.not. validate) ) then |
---|
| 663 | ! if validate true, then computation with arr_max_levels failed |
---|
| 664 | ! previously |
---|
| 665 | if ( arr_max_levels(ifld) > 0 ) then |
---|
| 666 | max_levels(ifld) = & |
---|
| 667 | min(arr_max_levels(ifld),max_levels(ifld)) |
---|
| 668 | endif |
---|
| 669 | endif |
---|
| 670 | if (max_level < max_levels(ifld)) & |
---|
| 671 | max_level = max_levels(ifld) |
---|
| 672 | enddo |
---|
| 673 | |
---|
| 674 | ! if requested, return calculated levels |
---|
| 675 | if ( present(arr_max_levels_out) ) then |
---|
| 676 | do ifld=1,nflds |
---|
| 677 | arr_max_levels_out(ifld) = max_levels(ifld) |
---|
| 678 | enddo |
---|
| 679 | endif |
---|
| 680 | |
---|
| 681 | ! calculate sum |
---|
| 682 | validate = .false. |
---|
| 683 | call oasis_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & |
---|
| 684 | arr_max_shift, arr_gmax_exp, max_levels, & |
---|
| 685 | max_level, validate, recompute, & |
---|
| 686 | omp_nthreads, mpi_comm) |
---|
| 687 | |
---|
| 688 | endif |
---|
| 689 | |
---|
| 690 | if (detailed_timing) call oasis_timer_stop('oasis_reprosum_int') |
---|
| 691 | |
---|
| 692 | endif |
---|
| 693 | |
---|
| 694 | ! compare fixed and floating point results |
---|
| 695 | if ( present(rel_diff) ) then |
---|
| 696 | if (oasis_reprosum_reldiffmax >= 0.0_r8) then |
---|
| 697 | |
---|
| 698 | if (detailed_timing) then |
---|
| 699 | call oasis_timer_start('oasis_nonreprosum_prebarrier') |
---|
| 700 | call oasis_mpi_barrier(mpi_comm,subname) |
---|
| 701 | call oasis_timer_stop ('oasis_nonreprosum_prebarrier') |
---|
| 702 | endif |
---|
| 703 | |
---|
| 704 | if (detailed_timing) call oasis_timer_start('nonrepro_sum') |
---|
| 705 | ! record statistic |
---|
| 706 | nonrepro_sum = 1 |
---|
| 707 | ! compute nonreproducible sum |
---|
| 708 | arr_lsum(:) = 0._r8 |
---|
| 709 | !$omp parallel do & |
---|
| 710 | !$omp default(shared) & |
---|
| 711 | !$omp private(ifld, isum) |
---|
| 712 | do ifld=1,nflds |
---|
| 713 | do isum=1,nsummands |
---|
| 714 | arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) |
---|
| 715 | end do |
---|
| 716 | end do |
---|
| 717 | |
---|
| 718 | call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & |
---|
| 719 | MPI_REAL8, MPI_SUM, mpi_comm, ierr) |
---|
| 720 | |
---|
| 721 | if (detailed_timing) call oasis_timer_stop('nonrepro_sum') |
---|
| 722 | |
---|
| 723 | ! determine differences |
---|
| 724 | !$omp parallel do & |
---|
| 725 | !$omp default(shared) & |
---|
| 726 | !$omp private(ifld, abs_diff) |
---|
| 727 | do ifld=1,nflds |
---|
| 728 | abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) |
---|
| 729 | if (abs(arr_gsum(ifld)) > abs_diff) then |
---|
| 730 | rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) |
---|
| 731 | else |
---|
| 732 | rel_diff(1,ifld) = abs_diff |
---|
| 733 | endif |
---|
| 734 | rel_diff(2,ifld) = abs_diff |
---|
| 735 | enddo |
---|
| 736 | else |
---|
| 737 | rel_diff(:,:) = 0.0_r8 |
---|
| 738 | endif |
---|
| 739 | endif |
---|
| 740 | |
---|
| 741 | ! return statistics |
---|
| 742 | if ( present(repro_sum_stats) ) then |
---|
| 743 | repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast |
---|
| 744 | repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow |
---|
| 745 | repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both |
---|
| 746 | repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum |
---|
| 747 | repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red |
---|
| 748 | endif |
---|
| 749 | |
---|
| 750 | |
---|
| 751 | end subroutine oasis_reprosum_calc |
---|
| 752 | |
---|
| 753 | !======================================================================== |
---|
| 754 | ! |
---|
| 755 | ! Purpose: |
---|
| 756 | !> Compute the global sum of each field in "arr" using the indicated |
---|
| 757 | !> communicator with a reproducible yet scalable implementation based |
---|
| 758 | !> on a fixed point algorithm. The accuracy of the fixed point algorithm |
---|
| 759 | !> is controlled by the number of "levels" of integer expansion, the |
---|
| 760 | !> maximum value of which is specified by max_level. |
---|
| 761 | ! |
---|
| 762 | !---------------------------------------------------------------------- |
---|
| 763 | |
---|
| 764 | subroutine oasis_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & |
---|
| 765 | arr_max_shift, arr_gmax_exp, max_levels, & |
---|
| 766 | max_level, validate, recompute, & |
---|
| 767 | omp_nthreads, mpi_comm ) |
---|
| 768 | |
---|
| 769 | !---------------------------------------------------------------------- |
---|
| 770 | |
---|
| 771 | ! Arguments |
---|
| 772 | |
---|
| 773 | integer, intent(in) :: nsummands !< number of local summands |
---|
| 774 | integer, intent(in) :: dsummands !< declared first dimension |
---|
| 775 | integer, intent(in) :: nflds !< number of fields |
---|
| 776 | integer, intent(in) :: arr_max_shift !< maximum safe exponent for |
---|
| 777 | !< value < 1 (so that sum |
---|
| 778 | !< does not overflow) |
---|
| 779 | integer, intent(in) :: arr_gmax_exp(nflds) |
---|
| 780 | !< exponents of global maxima |
---|
| 781 | integer, intent(in) :: max_levels(nflds) |
---|
| 782 | !< maximum number of levels |
---|
| 783 | !< of integer expansion |
---|
| 784 | integer, intent(in) :: max_level !< maximum value in |
---|
| 785 | !< max_levels |
---|
| 786 | integer, intent(in) :: omp_nthreads !< number of OpenMP threads |
---|
| 787 | integer, intent(in) :: mpi_comm !< MPI subcommunicator |
---|
| 788 | |
---|
| 789 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
| 790 | !< input array |
---|
| 791 | |
---|
| 792 | logical, intent(in):: validate |
---|
| 793 | !< flag indicating that accuracy of solution generated from |
---|
| 794 | !< arr_gmax_exp and max_levels should be tested |
---|
| 795 | |
---|
| 796 | logical, intent(out):: recompute |
---|
| 797 | !< flag indicating that either the upper bounds are inaccurate, |
---|
| 798 | !< or max_levels and arr_gmax_exp do not generate accurate |
---|
| 799 | !< enough sums |
---|
| 800 | |
---|
| 801 | real(r8), intent(out):: arr_gsum(nflds) !< global means |
---|
| 802 | |
---|
| 803 | ! Local workspace |
---|
| 804 | |
---|
| 805 | integer, parameter :: max_jlevel = & |
---|
| 806 | 1 + (digits(0_i8)/digits(0.0_r8)) |
---|
| 807 | |
---|
| 808 | integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) |
---|
| 809 | ! integer vector representing local |
---|
| 810 | ! sum (per thread, per field) |
---|
| 811 | integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) |
---|
| 812 | ! integer vector representing local |
---|
| 813 | ! sum |
---|
| 814 | integer(i8) :: i8_arr_level ! integer part of summand for current |
---|
| 815 | ! expansion level |
---|
| 816 | integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) |
---|
| 817 | ! integer vector representing global |
---|
| 818 | ! sum |
---|
| 819 | integer(i8) :: IX_8 ! integer representation of current |
---|
| 820 | ! jlevels of X_8 ('part' of |
---|
| 821 | ! i8_arr_gsum_level) |
---|
| 822 | integer(i8) :: i8_sign ! sign global sum |
---|
| 823 | integer(i8) :: i8_radix ! radix for i8 variables |
---|
| 824 | |
---|
| 825 | integer :: max_error(nflds,omp_nthreads) |
---|
| 826 | ! accurate upper bound on data? |
---|
| 827 | integer :: not_exact(nflds,omp_nthreads) |
---|
| 828 | ! max_levels sufficient to |
---|
| 829 | ! capture all digits? |
---|
| 830 | integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) |
---|
| 831 | ! range of summand indices for each |
---|
| 832 | ! OpenMP thread |
---|
| 833 | integer :: ifld, isum, ithread |
---|
| 834 | ! loop variables |
---|
| 835 | integer :: arr_exp ! exponent of summand |
---|
| 836 | integer :: arr_shift ! exponent used to generate integer |
---|
| 837 | ! for current expansion level |
---|
| 838 | integer :: ilevel ! current integer expansion level |
---|
| 839 | integer :: offset(nflds) ! beginning location in |
---|
| 840 | ! i8_arr_{g,l}sum_level for integer |
---|
| 841 | ! expansion of current ifld |
---|
| 842 | integer :: voffset ! modification to offset used to |
---|
| 843 | ! include validation metrics |
---|
| 844 | integer :: ioffset ! offset(ifld) |
---|
| 845 | integer :: jlevel ! number of floating point 'pieces' |
---|
| 846 | ! extracted from a given i8 integer |
---|
| 847 | integer :: ierr ! MPI error return |
---|
| 848 | integer :: LX(max_jlevel) ! exponent of X_8 (see below) |
---|
| 849 | integer :: veclth ! total length of i8_arr_lsum_level |
---|
| 850 | integer :: sum_digits ! lower bound on number of significant |
---|
| 851 | ! in integer expansion of sum |
---|
| 852 | integer :: curr_exp ! exponent of partial sum during |
---|
| 853 | ! reconstruction from integer vector |
---|
| 854 | integer :: corr_exp ! exponent of current summand in |
---|
| 855 | ! reconstruction from integer vector |
---|
| 856 | |
---|
| 857 | real(r8) :: arr_frac ! fraction of summand |
---|
| 858 | real(r8) :: arr_remainder ! part of summand remaining after |
---|
| 859 | ! current level of integer expansion |
---|
| 860 | real(r8) :: X_8(max_jlevel) ! r8 vector representation of current |
---|
| 861 | ! i8_arr_gsum_level |
---|
| 862 | real(r8) :: RX_8 ! r8 representation of difference |
---|
| 863 | ! between current i8_arr_gsum_level |
---|
| 864 | ! and current jlevels of X_8 |
---|
| 865 | ! (== IX_8). Also used in final |
---|
| 866 | ! scaling step |
---|
| 867 | |
---|
| 868 | logical :: first ! flag used to indicate that just |
---|
| 869 | ! beginning reconstruction of sum |
---|
| 870 | ! from integer vector |
---|
| 871 | |
---|
| 872 | character(len=*),parameter :: subname = '(oasis_reprosum_int)' |
---|
| 873 | |
---|
| 874 | !----------------------------------------------------------------------- |
---|
| 875 | ! Save radix of i8 variables in an i8 variable |
---|
| 876 | i8_radix = radix(IX_8) |
---|
| 877 | |
---|
| 878 | ! If validating upper bounds, reserve space for validation metrics |
---|
| 879 | ! In both cases, reserve an extra level for overflow from the top level |
---|
| 880 | if (validate) then |
---|
| 881 | voffset = 3 |
---|
| 882 | else |
---|
| 883 | voffset = 1 |
---|
| 884 | endif |
---|
| 885 | |
---|
| 886 | ! compute offsets for each field |
---|
| 887 | offset(1) = voffset |
---|
| 888 | do ifld=2,nflds |
---|
| 889 | offset(ifld) = offset(ifld-1) & |
---|
| 890 | + (max_levels(ifld-1) + voffset) |
---|
| 891 | enddo |
---|
| 892 | veclth = offset(nflds) + max_levels(nflds) |
---|
| 893 | |
---|
| 894 | ! split summand index range over OpenMP threads |
---|
| 895 | call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) |
---|
| 896 | |
---|
| 897 | ! convert local summands to vector of integers and sum |
---|
| 898 | ! (Using scale instead of set_exponent because arr_remainder may not be |
---|
| 899 | ! "normal" after level 1 calculation) |
---|
| 900 | i8_arr_lsum_level(:) = 0_i8 |
---|
| 901 | |
---|
| 902 | !$omp parallel do & |
---|
| 903 | !$omp default(shared) & |
---|
| 904 | !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & |
---|
| 905 | !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) |
---|
| 906 | do ithread=1,omp_nthreads |
---|
| 907 | if (detailed_timing) call oasis_timer_start('repro_sum_loopb') |
---|
| 908 | do ifld=1,nflds |
---|
| 909 | ioffset = offset(ifld) |
---|
| 910 | |
---|
| 911 | max_error(ifld,ithread) = 0 |
---|
| 912 | not_exact(ifld,ithread) = 0 |
---|
| 913 | |
---|
| 914 | i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 |
---|
| 915 | do isum=isum_beg(ithread),isum_end(ithread) |
---|
| 916 | arr_remainder = 0.0_r8 |
---|
| 917 | |
---|
| 918 | if (arr(isum,ifld) .ne. 0.0_r8) then |
---|
| 919 | arr_exp = exponent(arr(isum,ifld)) |
---|
| 920 | arr_frac = fraction(arr(isum,ifld)) |
---|
| 921 | |
---|
| 922 | ! test that global maximum upper bound is an upper bound |
---|
| 923 | if (arr_exp > arr_gmax_exp(ifld)) then |
---|
| 924 | max_error(ifld,ithread) = 1 |
---|
| 925 | exit |
---|
| 926 | endif |
---|
| 927 | |
---|
| 928 | ! calculate first shift |
---|
| 929 | arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) |
---|
| 930 | |
---|
| 931 | ! determine first (probably) nonzero level (assuming initial fraction is |
---|
| 932 | ! 'normal' - algorithm still works if this is not true) |
---|
| 933 | ! NOTE: this is critical; scale will set to zero if min exponent is too small. |
---|
| 934 | if (arr_shift < 1) then |
---|
| 935 | ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift |
---|
| 936 | arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) |
---|
| 937 | |
---|
| 938 | do while (arr_shift < 1) |
---|
| 939 | arr_shift = arr_shift + arr_max_shift |
---|
| 940 | ilevel = ilevel + 1 |
---|
| 941 | enddo |
---|
| 942 | else |
---|
| 943 | ilevel = 1 |
---|
| 944 | endif |
---|
| 945 | |
---|
| 946 | if (ilevel .le. max_levels(ifld)) then |
---|
| 947 | ! apply first shift/truncate, add it to the relevant running |
---|
| 948 | ! sum, and calculate the remainder. |
---|
| 949 | arr_remainder = scale(arr_frac,arr_shift) |
---|
| 950 | i8_arr_level = int(arr_remainder,i8) |
---|
| 951 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
| 952 | i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level |
---|
| 953 | arr_remainder = arr_remainder - i8_arr_level |
---|
| 954 | |
---|
| 955 | ! while the remainder is non-zero, continue to shift, truncate, |
---|
| 956 | ! sum, and calculate new remainder |
---|
| 957 | do while ((arr_remainder .ne. 0.0_r8) & |
---|
| 958 | .and. (ilevel < max_levels(ifld))) |
---|
| 959 | ilevel = ilevel + 1 |
---|
| 960 | arr_remainder = scale(arr_remainder,arr_max_shift) |
---|
| 961 | i8_arr_level = int(arr_remainder,i8) |
---|
| 962 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
| 963 | i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level |
---|
| 964 | arr_remainder = arr_remainder - i8_arr_level |
---|
| 965 | enddo |
---|
| 966 | |
---|
| 967 | endif |
---|
| 968 | endif |
---|
| 969 | |
---|
| 970 | if (arr_remainder .ne. 0.0_r8) then |
---|
| 971 | not_exact(ifld,ithread) = 1 |
---|
| 972 | endif |
---|
| 973 | |
---|
| 974 | enddo |
---|
| 975 | |
---|
| 976 | ! postprocess integer vector to eliminate potential for overlap in the following |
---|
| 977 | ! sums over threads and processes: if value larger than or equal to |
---|
| 978 | ! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in |
---|
| 979 | ! vector, resulting in nonoverlapping ranges for each component. Note that |
---|
| 980 | ! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums |
---|
| 981 | ! over threads and processes do not overflow for ilevel==1. |
---|
| 982 | do ilevel=max_levels(ifld),1,-1 |
---|
| 983 | RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) |
---|
| 984 | IX_8 = int(scale(RX_8,-arr_max_shift),i8) |
---|
| 985 | if (IX_8 .ne. 0_i8) then |
---|
| 986 | i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & |
---|
| 987 | i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 |
---|
| 988 | IX_8 = IX_8*(i8_radix**arr_max_shift) |
---|
| 989 | i8_arr_tlsum_level(ilevel,ifld,ithread) = & |
---|
| 990 | i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 |
---|
| 991 | endif |
---|
| 992 | enddo |
---|
| 993 | enddo |
---|
| 994 | if (detailed_timing) call oasis_timer_stop('repro_sum_loopb') |
---|
| 995 | enddo |
---|
| 996 | |
---|
| 997 | ! sum contributions from different threads |
---|
| 998 | do ifld=1,nflds |
---|
| 999 | ioffset = offset(ifld) |
---|
| 1000 | do ithread = 1,omp_nthreads |
---|
| 1001 | do ilevel = 0,max_levels(ifld) |
---|
| 1002 | i8_arr_lsum_level(ioffset+ilevel) = & |
---|
| 1003 | i8_arr_lsum_level(ioffset+ilevel) & |
---|
| 1004 | + i8_arr_tlsum_level(ilevel,ifld,ithread) |
---|
| 1005 | enddo |
---|
| 1006 | enddo |
---|
| 1007 | enddo |
---|
| 1008 | |
---|
| 1009 | ! record if upper bound was inaccurate or if level expansion stopped |
---|
| 1010 | ! before full accuracy was achieved |
---|
| 1011 | if (validate) then |
---|
| 1012 | do ifld=1,nflds |
---|
| 1013 | ioffset = offset(ifld) |
---|
| 1014 | i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) |
---|
| 1015 | i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) |
---|
| 1016 | enddo |
---|
| 1017 | endif |
---|
| 1018 | |
---|
| 1019 | ! sum integer vector element-wise |
---|
| 1020 | #if ( defined noI8 ) |
---|
| 1021 | ! Workaround for when i8 is not supported. |
---|
| 1022 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_i4") |
---|
| 1023 | call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & |
---|
| 1024 | veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) |
---|
| 1025 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i4") |
---|
| 1026 | #else |
---|
| 1027 | if (detailed_timing) call oasis_timer_start("repro_sum_allr_i8") |
---|
| 1028 | call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & |
---|
| 1029 | veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) |
---|
| 1030 | if (detailed_timing) call oasis_timer_stop("repro_sum_allr_i8") |
---|
| 1031 | #endif |
---|
| 1032 | |
---|
| 1033 | ! Construct global sum from integer vector representation: |
---|
| 1034 | ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . |
---|
| 1035 | ! When shifting back, need to "add back in" true arr_gmax exponent. This was |
---|
| 1036 | ! removed implicitly by working only with the fraction . |
---|
| 1037 | ! 2) want to add levels into sum in reverse order (smallest to largest). However, |
---|
| 1038 | ! even this can generate floating point rounding errors if signs of integers |
---|
| 1039 | ! alternate. To avoid this, do some arithmetic with integer vectors so that all |
---|
| 1040 | ! components have the same sign. This should keep relative difference between |
---|
| 1041 | ! using different integer sizes (e.g. i8 and i4) to machine epsilon |
---|
| 1042 | ! 3) assignment to X_8 will usually lose accuracy since maximum integer |
---|
| 1043 | ! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands |
---|
| 1044 | ! correction not very large). Calculate remainder and add in first (since |
---|
| 1045 | ! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). |
---|
| 1046 | ! For r4 (24 digits) may need to correct twice. Code is written in a general |
---|
| 1047 | ! fashion, to work no matter how many corrections are necessary (assuming |
---|
| 1048 | ! max_jlevel parameter calculation is correct). |
---|
| 1049 | |
---|
| 1050 | recompute = .false. |
---|
| 1051 | do ifld=1,nflds |
---|
| 1052 | arr_gsum(ifld) = 0.0_r8 |
---|
| 1053 | ioffset = offset(ifld) |
---|
| 1054 | |
---|
| 1055 | ! if validate is .true., test whether the summand upper bound |
---|
| 1056 | ! was exceeded on any of the processes |
---|
| 1057 | if (validate) then |
---|
| 1058 | if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then |
---|
| 1059 | recompute = .true. |
---|
| 1060 | endif |
---|
| 1061 | endif |
---|
| 1062 | |
---|
| 1063 | if (.not. recompute) then |
---|
| 1064 | |
---|
| 1065 | ! preprocess integer vector: |
---|
| 1066 | ! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' |
---|
| 1067 | ! to next larger integer in vector, resulting in nonoverlapping ranges for each |
---|
| 1068 | ! component. Note that have "ilevel-1=0" level here as described above. |
---|
| 1069 | do ilevel=max_levels(ifld),1,-1 |
---|
| 1070 | RX_8 = i8_arr_gsum_level(ioffset+ilevel) |
---|
| 1071 | IX_8 = int(scale(RX_8,-arr_max_shift),i8) |
---|
| 1072 | if (IX_8 .ne. 0_i8) then |
---|
| 1073 | i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & |
---|
| 1074 | + IX_8 |
---|
| 1075 | IX_8 = IX_8*(i8_radix**arr_max_shift) |
---|
| 1076 | i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & |
---|
| 1077 | - IX_8 |
---|
| 1078 | endif |
---|
| 1079 | enddo |
---|
| 1080 | ! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary |
---|
| 1081 | ! so that all vector components have the same sign (eliminating loss |
---|
| 1082 | ! of accuracy arising from difference of large values when |
---|
| 1083 | ! reconstructing r8 sum from integer vector) |
---|
| 1084 | ilevel = 0 |
---|
| 1085 | do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & |
---|
| 1086 | .and. (ilevel < max_levels(ifld))) |
---|
| 1087 | ilevel = ilevel + 1 |
---|
| 1088 | enddo |
---|
| 1089 | ! |
---|
| 1090 | if (ilevel < max_levels(ifld)) then |
---|
| 1091 | if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then |
---|
| 1092 | i8_sign = 1_i8 |
---|
| 1093 | else |
---|
| 1094 | i8_sign = -1_i8 |
---|
| 1095 | endif |
---|
| 1096 | do jlevel=ilevel,max_levels(ifld)-1 |
---|
| 1097 | if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & |
---|
| 1098 | .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then |
---|
| 1099 | i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & |
---|
| 1100 | - i8_sign |
---|
| 1101 | i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & |
---|
| 1102 | + i8_sign*(i8_radix**arr_max_shift) |
---|
| 1103 | endif |
---|
| 1104 | enddo |
---|
| 1105 | endif |
---|
| 1106 | |
---|
| 1107 | ! start with maximum shift, and work up to larger values |
---|
| 1108 | arr_shift = arr_gmax_exp(ifld) & |
---|
| 1109 | - max_levels(ifld)*arr_max_shift |
---|
| 1110 | curr_exp = 0 |
---|
| 1111 | first = .true. |
---|
| 1112 | do ilevel=max_levels(ifld),0,-1 |
---|
| 1113 | |
---|
| 1114 | if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then |
---|
| 1115 | jlevel = 1 |
---|
| 1116 | |
---|
| 1117 | ! r8 representation of higher order bits in integer |
---|
| 1118 | X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) |
---|
| 1119 | LX(jlevel) = exponent(X_8(jlevel)) |
---|
| 1120 | |
---|
| 1121 | ! calculate remainder |
---|
| 1122 | IX_8 = int(X_8(jlevel),i8) |
---|
| 1123 | RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) |
---|
| 1124 | |
---|
| 1125 | ! repeat using remainder |
---|
| 1126 | do while (RX_8 .ne. 0.0_r8) |
---|
| 1127 | jlevel = jlevel + 1 |
---|
| 1128 | X_8(jlevel) = RX_8 |
---|
| 1129 | LX(jlevel) = exponent(RX_8) |
---|
| 1130 | IX_8 = IX_8 + int(RX_8,i8) |
---|
| 1131 | RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) |
---|
| 1132 | enddo |
---|
| 1133 | |
---|
| 1134 | ! add in contributions, smaller to larger, rescaling for each |
---|
| 1135 | ! addition to guarantee that exponent of working summand is always |
---|
| 1136 | ! larger than minexponent |
---|
| 1137 | do while (jlevel > 0) |
---|
| 1138 | if (first) then |
---|
| 1139 | curr_exp = LX(jlevel) + arr_shift |
---|
| 1140 | arr_gsum(ifld) = fraction(X_8(jlevel)) |
---|
| 1141 | first = .false. |
---|
| 1142 | else |
---|
| 1143 | corr_exp = curr_exp - (LX(jlevel) + arr_shift) |
---|
| 1144 | arr_gsum(ifld) = fraction(X_8(jlevel)) & |
---|
| 1145 | + scale(arr_gsum(ifld),corr_exp) |
---|
| 1146 | curr_exp = LX(jlevel) + arr_shift |
---|
| 1147 | endif |
---|
| 1148 | jlevel = jlevel - 1 |
---|
| 1149 | enddo |
---|
| 1150 | |
---|
| 1151 | endif |
---|
| 1152 | |
---|
| 1153 | arr_shift = arr_shift + arr_max_shift |
---|
| 1154 | enddo |
---|
| 1155 | |
---|
| 1156 | ! apply final exponent correction, scaling first if exponent is too small |
---|
| 1157 | ! to apply directly |
---|
| 1158 | corr_exp = curr_exp + exponent(arr_gsum(ifld)) |
---|
| 1159 | if (corr_exp .ge. MINEXPONENT(1._r8)) then |
---|
| 1160 | arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) |
---|
| 1161 | else |
---|
| 1162 | RX_8 = set_exponent(arr_gsum(ifld), & |
---|
| 1163 | corr_exp-MINEXPONENT(1._r8)) |
---|
| 1164 | arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) |
---|
| 1165 | endif |
---|
| 1166 | |
---|
| 1167 | ! if validate is .true. and some precision lost, test whether 'too much' |
---|
| 1168 | ! was lost, due to too loose an upper bound, too stringent a limit on number |
---|
| 1169 | ! of levels of expansion, cancellation, .... Calculated by comparing lower |
---|
| 1170 | ! bound on number of sigificant digits with number of digits in 1.0_r8 . |
---|
| 1171 | if (validate) then |
---|
| 1172 | if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then |
---|
| 1173 | |
---|
| 1174 | ! find first nonzero level and use exponent for this level, then assume all |
---|
| 1175 | ! subsequent levels contribute arr_max_shift digits. |
---|
| 1176 | sum_digits = 0 |
---|
| 1177 | do ilevel=0,max_levels(ifld) |
---|
| 1178 | if (sum_digits .eq. 0) then |
---|
| 1179 | if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then |
---|
| 1180 | X_8(1) = i8_arr_gsum_level(ioffset+ilevel) |
---|
| 1181 | LX(1) = exponent(X_8(1)) |
---|
| 1182 | sum_digits = LX(1) |
---|
| 1183 | endif |
---|
| 1184 | else |
---|
| 1185 | sum_digits = sum_digits + arr_max_shift |
---|
| 1186 | endif |
---|
| 1187 | enddo |
---|
| 1188 | |
---|
| 1189 | if (sum_digits < digits(1.0_r8)) then |
---|
| 1190 | recompute = .true. |
---|
| 1191 | endif |
---|
| 1192 | endif |
---|
| 1193 | endif |
---|
| 1194 | |
---|
| 1195 | endif |
---|
| 1196 | |
---|
| 1197 | enddo |
---|
| 1198 | |
---|
| 1199 | |
---|
| 1200 | end subroutine oasis_reprosum_int |
---|
| 1201 | |
---|
| 1202 | !======================================================================== |
---|
| 1203 | ! |
---|
| 1204 | ! Purpose: |
---|
| 1205 | !> Test whether distributed sum exceeds tolerance and print out a |
---|
| 1206 | !> warning message. |
---|
| 1207 | ! |
---|
| 1208 | !---------------------------------------------------------------------- |
---|
| 1209 | |
---|
| 1210 | logical function oasis_reprosum_tolExceeded (name, nflds, master, & |
---|
| 1211 | logunit, rel_diff ) |
---|
| 1212 | !---------------------------------------------------------------------- |
---|
| 1213 | |
---|
| 1214 | ! Arguments |
---|
| 1215 | |
---|
| 1216 | character(len=*), intent(in) :: name !< distributed sum identifier |
---|
| 1217 | integer, intent(in) :: nflds !< number of fields |
---|
| 1218 | logical, intent(in) :: master !< process that will write |
---|
| 1219 | !< warning messages? |
---|
| 1220 | integer, optional, intent(in) :: logunit!< unit warning messages |
---|
| 1221 | !< written to |
---|
| 1222 | real(r8), intent(in) :: rel_diff(2,nflds) |
---|
| 1223 | !< relative and absolute |
---|
| 1224 | !< differences between fixed |
---|
| 1225 | !< and floating point sums |
---|
| 1226 | |
---|
| 1227 | ! Local workspace |
---|
| 1228 | |
---|
| 1229 | integer :: llogunit ! local log unit |
---|
| 1230 | integer :: ifld ! field index |
---|
| 1231 | integer :: exceeds_limit ! number of fields whose |
---|
| 1232 | ! sum exceeds tolerance |
---|
| 1233 | real(r8) :: max_rel_diff ! maximum relative difference |
---|
| 1234 | integer :: max_rel_diff_idx ! field index for max. rel. diff. |
---|
| 1235 | real(r8) :: max_abs_diff ! maximum absolute difference |
---|
| 1236 | integer :: max_abs_diff_idx ! field index for max. abs. diff. |
---|
| 1237 | character(len=*),parameter :: subname = '(oasis_reprosum_tolExceeded)' |
---|
| 1238 | |
---|
| 1239 | !----------------------------------------------------------------------- |
---|
| 1240 | |
---|
| 1241 | oasis_reprosum_tolExceeded = .false. |
---|
| 1242 | if (oasis_reprosum_reldiffmax < 0.0_r8) return |
---|
| 1243 | |
---|
| 1244 | if ( present(logunit) ) then |
---|
| 1245 | llogunit = logunit |
---|
| 1246 | else |
---|
| 1247 | llogunit = nulprt |
---|
| 1248 | endif |
---|
| 1249 | |
---|
| 1250 | ! check that "fast" reproducible sum is accurate enough. |
---|
| 1251 | exceeds_limit = 0 |
---|
| 1252 | max_rel_diff = 0.0_r8 |
---|
| 1253 | max_abs_diff = 0.0_r8 |
---|
| 1254 | do ifld=1,nflds |
---|
| 1255 | if (rel_diff(1,ifld) > oasis_reprosum_reldiffmax) then |
---|
| 1256 | exceeds_limit = exceeds_limit + 1 |
---|
| 1257 | if (rel_diff(1,ifld) > max_rel_diff) then |
---|
| 1258 | max_rel_diff = rel_diff(1,ifld) |
---|
| 1259 | max_rel_diff_idx = ifld |
---|
| 1260 | endif |
---|
| 1261 | if (rel_diff(2,ifld) > max_abs_diff) then |
---|
| 1262 | max_abs_diff = rel_diff(2,ifld) |
---|
| 1263 | max_abs_diff_idx = ifld |
---|
| 1264 | endif |
---|
| 1265 | endif |
---|
| 1266 | enddo |
---|
| 1267 | |
---|
| 1268 | if (exceeds_limit > 0) then |
---|
| 1269 | if (master) then |
---|
| 1270 | write(llogunit,*) subname,trim(name), & |
---|
| 1271 | ': difference in fixed and floating point sums ', & |
---|
| 1272 | ' exceeds tolerance in ', exceeds_limit, & |
---|
| 1273 | ' fields.' |
---|
| 1274 | write(llogunit,*) subname,' Maximum relative diff: (rel)', & |
---|
| 1275 | rel_diff(1,max_rel_diff_idx), ' (abs) ', & |
---|
| 1276 | rel_diff(2,max_rel_diff_idx) |
---|
| 1277 | write(llogunit,*) subname,' Maximum absolute diff: (rel)', & |
---|
| 1278 | rel_diff(1,max_abs_diff_idx), ' (abs) ', & |
---|
| 1279 | rel_diff(2,max_abs_diff_idx) |
---|
| 1280 | endif |
---|
| 1281 | oasis_reprosum_tolExceeded = .true. |
---|
| 1282 | endif |
---|
| 1283 | |
---|
| 1284 | |
---|
| 1285 | end function oasis_reprosum_tolExceeded |
---|
| 1286 | |
---|
| 1287 | !======================================================================== |
---|
| 1288 | ! |
---|
| 1289 | ! Purpose: |
---|
| 1290 | !> Compute the global sum of each field in "arr" using the indicated |
---|
| 1291 | !> communicator with a reproducible yet scalable implementation based |
---|
| 1292 | !> on He and Ding's implementation of the double-double algorithm. |
---|
| 1293 | ! |
---|
| 1294 | !---------------------------------------------------------------------- |
---|
| 1295 | |
---|
| 1296 | subroutine oasis_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & |
---|
| 1297 | nflds, mpi_comm ) |
---|
| 1298 | !---------------------------------------------------------------------- |
---|
| 1299 | |
---|
| 1300 | ! Arguments |
---|
| 1301 | |
---|
| 1302 | integer, intent(in) :: nsummands !< number of local summands |
---|
| 1303 | integer, intent(in) :: dsummands !< declared first dimension |
---|
| 1304 | integer, intent(in) :: nflds !< number of fields |
---|
| 1305 | real(r8), intent(in) :: arr(dsummands,nflds) |
---|
| 1306 | !< input array |
---|
| 1307 | integer, intent(in) :: mpi_comm !< MPI subcommunicator |
---|
| 1308 | |
---|
| 1309 | real(r8), intent(out):: arr_gsum(nflds) |
---|
| 1310 | !< global sums |
---|
| 1311 | |
---|
| 1312 | |
---|
| 1313 | ! Local workspace |
---|
| 1314 | |
---|
| 1315 | integer :: old_cw ! for x86 processors, save |
---|
| 1316 | ! current arithmetic mode |
---|
| 1317 | integer :: ifld, isum ! loop variables |
---|
| 1318 | integer :: ierr ! MPI error return |
---|
| 1319 | |
---|
| 1320 | real(r8) :: e, t1, t2 ! temporaries |
---|
| 1321 | complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double |
---|
| 1322 | ! format) |
---|
| 1323 | complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double |
---|
| 1324 | ! format) |
---|
| 1325 | |
---|
| 1326 | integer, save :: mpi_sumdd |
---|
| 1327 | logical, save :: first_time = .true. |
---|
| 1328 | character(len=*),parameter :: subname = '(oasis_reprosum_ddpdd)' |
---|
| 1329 | |
---|
| 1330 | !----------------------------------------------------------------------- |
---|
| 1331 | |
---|
| 1332 | call shr_reprosumx86_fix_start (old_cw) |
---|
| 1333 | |
---|
| 1334 | if (first_time) then |
---|
| 1335 | call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) |
---|
| 1336 | first_time = .false. |
---|
| 1337 | endif |
---|
| 1338 | |
---|
| 1339 | do ifld=1,nflds |
---|
| 1340 | arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) |
---|
| 1341 | |
---|
| 1342 | do isum=1,nsummands |
---|
| 1343 | |
---|
| 1344 | ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s |
---|
| 1345 | ! trick. |
---|
| 1346 | t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) |
---|
| 1347 | e = t1 - arr(isum,ifld) |
---|
| 1348 | t2 = ((real(arr_lsum_dd(ifld)) - e) & |
---|
| 1349 | + (arr(isum,ifld) - (t1 - e))) & |
---|
| 1350 | + aimag(arr_lsum_dd(ifld)) |
---|
| 1351 | |
---|
| 1352 | ! The result is t1 + t2, after normalization. |
---|
| 1353 | arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) |
---|
| 1354 | enddo |
---|
| 1355 | |
---|
| 1356 | enddo |
---|
| 1357 | |
---|
| 1358 | call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & |
---|
| 1359 | MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) |
---|
| 1360 | do ifld=1,nflds |
---|
| 1361 | arr_gsum(ifld) = real(arr_gsum_dd(ifld)) |
---|
| 1362 | enddo |
---|
| 1363 | |
---|
| 1364 | call shr_reprosumx86_fix_end (old_cw) |
---|
| 1365 | |
---|
| 1366 | end subroutine oasis_reprosum_ddpdd |
---|
| 1367 | |
---|
| 1368 | !----------------------------------------------------------------------- |
---|
| 1369 | |
---|
| 1370 | subroutine DDPDD (dda, ddb, len, itype) |
---|
| 1371 | !---------------------------------------------------------------------- |
---|
| 1372 | ! |
---|
| 1373 | ! Purpose: |
---|
| 1374 | ! Modification of original codes written by David H. Bailey |
---|
| 1375 | ! This subroutine computes ddb(i) = dda(i)+ddb(i) |
---|
| 1376 | ! |
---|
| 1377 | !---------------------------------------------------------------------- |
---|
| 1378 | |
---|
| 1379 | ! Arguments |
---|
| 1380 | |
---|
| 1381 | integer, intent(in) :: len ! array length |
---|
| 1382 | complex(r8), intent(in) :: dda(len) ! input |
---|
| 1383 | complex(r8), intent(inout) :: ddb(len) ! result |
---|
| 1384 | integer, intent(in) :: itype ! unused |
---|
| 1385 | |
---|
| 1386 | ! Local workspace |
---|
| 1387 | |
---|
| 1388 | real(r8) e, t1, t2 |
---|
| 1389 | integer i |
---|
| 1390 | character(len=*),parameter :: subname = '(oasis_reprosum_mod:DDPDD)' |
---|
| 1391 | |
---|
| 1392 | !----------------------------------------------------------------------- |
---|
| 1393 | |
---|
| 1394 | do i = 1, len |
---|
| 1395 | ! Compute dda + ddb using Knuth's trick. |
---|
| 1396 | t1 = real(dda(i)) + real(ddb(i)) |
---|
| 1397 | e = t1 - real(dda(i)) |
---|
| 1398 | t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & |
---|
| 1399 | + aimag(dda(i)) + aimag(ddb(i)) |
---|
| 1400 | |
---|
| 1401 | ! The result is t1 + t2, after normalization. |
---|
| 1402 | ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) |
---|
| 1403 | enddo |
---|
| 1404 | |
---|
| 1405 | |
---|
| 1406 | end subroutine DDPDD |
---|
| 1407 | |
---|
| 1408 | !----------------------------------------------------------------------- |
---|
| 1409 | |
---|
| 1410 | subroutine split_indices(total,num_pieces,ibeg,iend) |
---|
| 1411 | !---------------------------------------------------------------------- |
---|
| 1412 | ! |
---|
| 1413 | ! Purpose: |
---|
| 1414 | ! Split range into 'num_pieces' |
---|
| 1415 | ! |
---|
| 1416 | !---------------------------------------------------------------------- |
---|
| 1417 | |
---|
| 1418 | ! Arguments |
---|
| 1419 | |
---|
| 1420 | integer, intent(in) :: total |
---|
| 1421 | integer, intent(in) :: num_pieces |
---|
| 1422 | integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) |
---|
| 1423 | |
---|
| 1424 | ! Local workspace |
---|
| 1425 | |
---|
| 1426 | integer :: itmp1, itmp2, ioffset, i |
---|
| 1427 | character(len=*),parameter :: subname = '(oasis_reprosum_mod:split_indices)' |
---|
| 1428 | |
---|
| 1429 | !----------------------------------------------------------------------- |
---|
| 1430 | |
---|
| 1431 | itmp1 = total/num_pieces |
---|
| 1432 | itmp2 = mod(total,num_pieces) |
---|
| 1433 | ioffset = 0 |
---|
| 1434 | do i=1,itmp2 |
---|
| 1435 | ibeg(i) = ioffset + 1 |
---|
| 1436 | iend(i) = ioffset + (itmp1+1) |
---|
| 1437 | ioffset = iend(i) |
---|
| 1438 | enddo |
---|
| 1439 | do i=itmp2+1,num_pieces |
---|
| 1440 | ibeg(i) = ioffset + 1 |
---|
| 1441 | if (ibeg(i) > total) then |
---|
| 1442 | iend(i) = ibeg(i) - 1 |
---|
| 1443 | else |
---|
| 1444 | iend(i) = ioffset + itmp1 |
---|
| 1445 | ioffset = iend(i) |
---|
| 1446 | endif |
---|
| 1447 | enddo |
---|
| 1448 | |
---|
| 1449 | end subroutine split_indices |
---|
| 1450 | |
---|
| 1451 | !======================================================================== |
---|
| 1452 | |
---|
| 1453 | end module mod_oasis_reprosum |
---|