- Timestamp: 02/27/24 15:50:14 (4 months ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/libIGCM_concurrent/libIGCM_sys/libIGCM_sys_irene-amd.ksh
r1624 r1629 1172 1172 typeset file 1173 1173 file=$1 1174 1175 # In case of use of DEDICATED option XIOS servers, do not share xios nodes with xios clients 1176 # Compute the number of core to add to force the use of a new node for xios servers 1177 1178 if [ "X${IOS_OK_DEDICATED}" == "X" ] ; then 1179 (( IOS_OK_DEDICATED = 0 )) 1180 fi 1181 1182 1183 if [ ${IOS_OK_DEDICATED} -eq 1 ] ; then 1184 (( current_core_noxios = coreNumber - IOS_PROC_MPI * IOS_PROC_OMP * IOS_PROC_DEP )) 1185 (( first_comp_proc_dep_loc = 1 + NB_CORE_PER_NODE - current_core_noxios % NB_CORE_PER_NODE )) 1186 (( second_comp_proc_mpi_loc = IOS_PROC_MPI - 1 )) 1187 (( coreNumber_final = current_core_noxios - 1 + first_comp_proc_dep_loc * 2 + second_comp_proc_mpi_loc * IOS_PROC_OMP * IOS_PROC_DEP )) 1188 else 1189 (( coreNumber_final = coreNumber )) 1190 fi 1191 1192 if [ ${executionType} -eq 1 ] ; then 1193 # MPMD + MPI 1174 # MPMD + MPI + OMP : mpirun/ccc_mprun/error 1175 (( nodeNumber = coreNumber / NB_CORE_PER_NODE )) 1176 [ $(( ${coreNumber} % ${NB_CORE_PER_NODE} )) -ne 0 ] && (( nodeNumber = nodeNumber + 1 )) 1194 1177 sed -e "/::openMPthreads::/d" \ 1195 -e "s/::JobNumProcTot::/${coreNumber_final}/" \ 1178 -e "s/::JobNumProcTot::/${mpiTasks}/" \ 1179 -e "s/::NodeNumber::/${nodeNumber}/" \ 1180 -e "/--cpu_bind=none/d" \ 1196 1181 ${file} > ${file}.tmp 1197 1198 elif [ ${executionType} -eq 2 ] ; then1199 # MPMD + MPI + OMP :ccc_mprun1200 sed -e "/::openMPthreads::/d" \1201 -e "s/::JobNumProcTot::/${coreNumber_final}/" \1202 ${file} > ${file}.tmp1203 1204 elif [ ${executionType} -eq 3 ] ; then1205 # SPMD + MPI/OMP1206 sed -e "s/::openMPthreads::/${openMPthreads}/" \1207 -e "s/::JobNumProcTot::/${mpiTasks}/" \1208 -e "/#MSUB -x/d" \1209 ${file} > ${file}.tmp1210 1211 elif [ ${executionType} -eq 4 ] ; then1212 # SPMD + MPI only1213 sed -e "s/::JobNumProcTot::/${mpiTasks}/" \1214 -e "/::openMPthreads::/d" \1215 -e "/#MSUB -x/d" \1216 ${file} > ${file}.tmp1217 1218 elif [ ${executionType} -eq 5 ] 
; then1219 # SPMD + OMP only1220 sed -e "s/::openMPthreads::/${openMPthreads}/" \1221 -e "/::JobNumProcTot::/d" \1222 -e "/#MSUB -x/d" \1223 ${file} > ${file}.tmp1224 1225 elif [ ${executionType} -eq 6 ] ; then1226 # SEQUENTIAL THEN1227 sed -e "s/::JobNumProcTot::/1/" \1228 -e "/::openMPthreads::/d" \1229 -e "/#MSUB -x/d" \1230 ${file} > ${file}.tmp1231 1232 fi1233 1234 1182 IGCM_sys_Mv ${file}.tmp ${file} 1235 1183 … … 1263 1211 echo "IGCM_sys_build_execution_scripts " $@ 1264 1212 fi 1213 if ( [ "X${config_UserChoices_ExecutionMode}" = "Xslurm" ] ) ; then 1214 1215 EXECUTION="/usr/bin/time srun " 1216 1217 if ( ${OK_PARA_MPMD} ) ; then 1218 1219 # MPMD mode 1220 # 1 MPI only : executionType=1 1221 # 2 MPI/OpenMP : executionType=2 1222 1223 if [ -f run_file ] ; then 1224 IGCM_sys_Rm -f run_file 1225 fi 1226 if [ -f RUNDIR_2/run_file ] ; then 1227 IGCM_sys_Rm -f RUNDIR_2/run_file 1228 fi 1229 touch run_file 1230 1231 # case 1 : Only MPI (MPMD) 1232 # if ( ! ${OK_PARA_OMP} ) ; then 1233 # first_slurm_comp=0 1234 # # Build run_file 1235 # current_core=0 1236 # # First loop on the components for the coupler ie oasis (only if oasis3) 1237 # # the coupler ie oasis3 must be the first one 1238 # for comp in ${config_ListOfComponents[*]} ; do 1239 # number_rundir=$(echo ${comp} | sed 's/[^0-9]*//g') 1240 # if [ X${number_rundir} != X ] ; then 1241 # [ ! 
-d RUNDIR_${number_rundir} ] && mkdir RUNDIR_${number_rundir} 1242 # cd RUNDIR_${number_rundir} 1243 # if [ ${first_slurm_comp} = "0" ] ; then 1244 # current_core=0 ; first_slurm_comp=1 ; 1245 # fi 1246 # fi 1247 # eval ExeNameIn=\${config_Executable_${comp}[0]} 1248 # eval ExeNameOut=\${config_Executable_${comp}[1]} 1249 1250 # # Only if the component has an executable 1251 # if ( [ "X${ExeNameOut}" != X\"\" ] ) ; then 1252 1253 # eval comp_proc_mpi_loc=\${${comp}_PROC_MPI} 1254 # (( end_core = ${current_core} + ${comp_proc_mpi_loc} - 1 )) 1255 # echo "${current_core}-${end_core} ./${ExeNameOut}" >> run_file 1256 # (( current_core = ${end_core} + 1 )) 1257 # fi 1258 # if [ X${number_rundir} != X ] ; then 1259 # cd $RUN_DIR ; 1260 # fi 1261 # done 1262 1263 # if [ X${number_rundir} != X ] ; then 1264 # echo "cd $RUN_DIR ; /usr/bin/time srun --ntasks=${config_UserChoices_NbMPItasks_run1} --multi-prog ./run_file > out_execution 2>&1 &" > EXECUTION.exe 1265 # echo "cd RUNDIR_2 ; /usr/bin/time srun --ntasks=${config_UserChoices_NbMPItasks_run2} --multi-prog ./run_file > out_execution 2>&1 &" >> EXECUTION.exe 1266 # echo "wait" >> EXECUTION.exe 1267 # chmod u+x EXECUTION.exe 1268 # else 1269 # EXECUTION="/usr/bin/time srun --multi-prog ./run_file" 1270 # fi 1271 1272 # IGCM_sys_Chmod u+x run_file 1273 # if ( $DEBUG_sys ) ; then 1274 # echo "run_file contains : " 1275 # cat run_file 1276 # fi 1277 1278 # else 1279 1280 # 2 MPI/OpenMP : executionType=2 1281 1282 # MPI-OpenMP (MPMD) 1283 # export SLURM_HOSTFILE=./hostlist 1284 # srun --cpu-bind=none --distribution=arbitrary --multi-prog ./run_file 1285 # example of hostlist file : 1286 # r3i3n33 1287 # r3i3n33 1288 # ... 
1289 # example of run_file : 1290 # 0-70 ./prog_lmdz.x.sh %o %t 1291 # 71-430 ./prog_opa.xx.sh %o %t 1292 # 431-431 ./prog_xios.x.sh %o %t 1293 # examples of prog_file : 1294 # prog_lmdz.x.sh : 1295 # (( init = 0 + $1 )) 1296 # (( index = init * 10 )) 1297 # (( slot = index % 40 )) 1298 # taskset -c $slot-$((slot + 10 - 1)) ./script_lmdz.x.ksh 1299 # that will become 1300 # taskset -c 0-9 ./script_lmdz.x.ksh 1301 # ... 1302 # with script_lmdz.x.ksh 1303 # export OMP_STACKSIZE=3g 1304 # export OMP_PLACES=cores 1305 # export OMP_NUM_THREADS=10 1306 # ./lmdz.x > out_lmdz.x.out.${SLURM_PROCID} 2>out_lmdz.x.err.${SLURM_PROCID} 1307 1308 # Hosts treatment 1309 _bkIFS=$IFS; 1310 IFS=$'\n'; set -f 1311 listnodes=($(< <( scontrol show hostnames $SLURM_JOB_NODELIST ))) 1312 IFS=$_bkIFS; set +f 1313 rm -f hostlist 1314 1315 # Loop on the components to build run_file and script_exec files 1316 rank=0 1317 current_core=0 1318 current_core_mpi=0 1319 current_core_tmp=0 1320 current_core_mpi_tmp=0 1321 first_slurm_comp=0 1322 1323 for comp in ${config_ListOfComponents[*]} ; do 1324 1325 number_rundir=$(echo ${comp} | sed 's/[^0-9]*//g') 1326 if [ X${number_rundir} != X ] ; then 1327 [ ! 
-d RUNDIR_${number_rundir} ] && mkdir RUNDIR_${number_rundir} 1328 cd RUNDIR_${number_rundir} 1329 if [ ${first_slurm_comp} = "0" ] ; then 1330 (( NbMPItasks_run1 = current_core_mpi_tmp )) 1331 ### On change de noeud pour le prochain srun 1332 if [ $(( $current_core % $NB_CORE_PER_NODE )) -ne 0 ] ; then 1333 (( current_core = current_core + NB_CORE_PER_NODE - current_core % NB_CORE_PER_NODE )) 1334 fi 1335 current_core_tmp=0 ; current_core_mpi_tmp=0 ; first_slurm_comp=1 ; 1336 fi 1337 fi 1338 1339 eval ExeNameIn=\${config_Executable_${comp}[0]} 1340 eval ExeNameOut=\${config_Executable_${comp}[1]} 1341 1342 # Not possible if oasis has an executable (i.e old version of oasis3) 1343 if ( [ "X${ExeNameOut}" != X\"\" ] && [ "X${comp}" = "XCPL" ] ) ; then 1344 IGCM_debug_Exit "ERROR MPMD with hybrid MPI-OpenMP is not available with oasis3 version" 1345 IGCM_debug_Print 2 "Only available with oasis3-MCT version coupler" 1346 IGCM_debug_Verif_Exit 1347 fi 1348 1349 # Only if we really have an executable for the component : 1350 if [ "X${ExeNameOut}" != X\"\" ] ; then 1351 1352 eval comp_proc_mpi_loc=\${${comp}_PROC_MPI} 1353 eval comp_proc_omp_loc=\${${comp}_PROC_OMP} 1354 eval comp_proc_nod_loc=\${${comp}_PROC_NOD} 1355 1356 1357 # Build script files 1358 1359 echo "#!/bin/ksh" > script_${ExeNameOut}.ksh 1360 echo "" >> script_${ExeNameOut}.ksh 1361 if [ ${comp_proc_omp_loc} -gt 1 ] ; then 1362 echo "export OMP_STACKSIZE=3g" >> script_${ExeNameOut}.ksh 1363 echo "export OMP_PLACES=cores" >> script_${ExeNameOut}.ksh 1364 echo "export OMP_NUM_THREADS=${comp_proc_omp_loc}" >> script_${ExeNameOut}.ksh 1365 fi 1366 1367 # to have out/err per process on different files 1368 echo "./${ExeNameOut} > out_${ExeNameOut}.out.\${SLURM_PROCID} 2>out_${ExeNameOut}.err.\${SLURM_PROCID}" >> script_${ExeNameOut}.ksh 1369 1370 IGCM_sys_Chmod u+x script_${ExeNameOut}.ksh 1371 1372 # Build run_file 1373 # Only if the component has an executable 1374 if ( [ "X${ExeNameOut}" != X\"\" ] ) ; 
then 1375 1376 eval comp_proc_mpi_loc=\${${comp}_PROC_MPI} 1377 (( end_core = ${current_core_mpi_tmp} + ${comp_proc_mpi_loc} - 1 )) 1378 echo "${current_core_mpi_tmp}-${end_core} ./prog_${ExeNameOut}.sh %o %t" >> run_file 1379 (( current_core_mpi_tmp = ${end_core} + 1 )) 1380 fi 1381 1382 if [ ${comp_proc_nod_loc} -gt 1 ] ; then 1383 (( offset_comp_proc_loc = NB_CORE_PER_NODE / (comp_proc_mpi_loc / comp_proc_nod_loc) )) 1384 else 1385 (( offset_comp_proc_loc = comp_proc_omp_loc )) 1386 fi 1387 1388 # Build configuration file 1389 1390 echo "#!/bin/sh" > prog_${ExeNameOut}.sh 1391 echo "(( init = $current_core_tmp + \$1 ))" >> prog_${ExeNameOut}.sh 1392 echo "(( index = init * $comp_proc_omp_loc ))" >> prog_${ExeNameOut}.sh 1393 echo "(( slot = index % 40 ))" >> prog_${ExeNameOut}.sh 1394 echo "echo ${ExeNameOut} taskset -c \$slot"-"\$((slot + $comp_proc_omp_loc - 1))" >> prog_${ExeNameOut}.sh 1395 echo "taskset -c \$slot"-"\$((slot + $comp_proc_omp_loc - 1)) ./script_${ExeNameOut}.ksh" >> prog_${ExeNameOut}.sh 1396 1397 IGCM_sys_Chmod u+x prog_${ExeNameOut}.sh 1398 1399 # Build hostlist file 1400 1401 for nb_proc_mpi in `seq 0 $(($comp_proc_mpi_loc-1))`; do 1402 (( index_host = current_core / NB_CORE_PER_NODE )) 1403 host_value=${listnodes[${index_host}]} 1404 echo "$host_value" >> hostlist 1405 if [ ${DRYRUN_DEBUG} = 4 ] ; then 1406 echo "node_${index_host}_X" >> hostlist_template 1407 fi 1408 (( current_core = current_core + offset_comp_proc_loc )) 1409 (( current_core_tmp = current_core_tmp + offset_comp_proc_loc )) 1410 done 1411 fi 1412 if [ X${number_rundir} != X ] ; then 1413 cd $RUN_DIR ; 1414 fi 1415 done 1416 1417 ## variable added to stop after 60s instead of 600s by default. 1418 ## This is used when no error comes from executables and when something stopped an executable without notice. 
1419 export SLURM_WAIT=60 1420 1421 if [ X${number_rundir} != X ] ; then 1422 echo "cd $RUN_DIR ; export SLURM_HOSTFILE=./hostlist ; /usr/bin/time srun --ntasks=${NbMPItasks_run1} --cpu-bind=none --distribution=arbitrary --multi-prog ./run_file > out_execution 2>&1 &" > EXECUTION.exe 1423 echo "cd RUNDIR_2 ; export SLURM_HOSTFILE=./hostlist ; /usr/bin/time srun --ntasks=${current_core_mpi_tmp} --cpu-bind=none --distribution=arbitrary --multi-prog ./run_file > out_execution 2>&1 &" >> EXECUTION.exe 1424 echo "wait" >> EXECUTION.exe 1425 chmod u+x EXECUTION.exe 1426 else 1427 EXECUTION="/usr/bin/time srun --cpu-bind=none --distribution=arbitrary --multi-prog ./run_file" 1428 fi 1429 IGCM_sys_Chmod u+x run_file 1430 if ( $DEBUG_sys ) ; then 1431 echo "run_file contains : " 1432 cat run_file 1433 fi 1434 1435 # fi # if ${OK_PARA_MPMD} 1436 1437 else 1438 # Only one executable (SPMD mode): executionType=3, 4, 5 and 6 1439 1440 for comp in ${config_ListOfComponents[*]} ; do 1441 1442 # Only if we really have an executable for the component : 1443 eval ExeNameOut=\${config_Executable_${comp}[1]} 1444 if ( [ "X${ExeNameOut}" != X\"\" ] && [ "X${ExeNameOut}" != "Xinca.dat" ] ) ; then 1445 1446 # Build script files 1447 1448 echo "#!/bin/ksh" > script_${ExeNameOut}.ksh 1449 echo "" >> script_${ExeNameOut}.ksh 1450 IGCM_sys_Chmod u+x script_${ExeNameOut}.ksh 1451 1452 if ( ${OK_PARA_OMP} ) ; then 1453 eval comp_proc_omp_loc=\${${comp}_PROC_OMP} 1454 # Check if the number of threads is correct 1455 case ${comp_proc_omp_loc} in 1456 2|4|5|10|20) 1457 IGCM_debug_Print 1 "You run ${ExeNameOut} on ${comp_proc_omp_loc} OMP threads" 1458 ;; 1459 *) 1460 IGCM_debug_Exit "ERROR with OMP parameters !" 
1461 IGCM_debug_Print 2 "${comp_proc_omp_loc} is not possible as number of OMP threads" 1462 IGCM_debug_Print 2 "Only 2,4,5,10,20 as number of OMP threads are possible " 1463 IGCM_debug_Verif_Exit 1464 ;; 1465 esac 1466 echo "" >> script_${ExeNameOut}.ksh 1467 echo "export OMP_STACKSIZE=3g" >> script_${ExeNameOut}.ksh 1468 echo "export OMP_PLACES=cores" >> script_${ExeNameOut}.ksh 1469 echo "OMP_NUM_THREADS=${comp_proc_omp_loc}" >> script_${ExeNameOut}.ksh 1470 fi 1471 1472 eval comp_proc_mpi_loc=\${${comp}_PROC_MPI} 1473 1474 # To have out/err per process on different files 1475 echo "./${ExeNameOut} > out_${ExeNameOut}.out.\${SLURM_PROCID} 2>out_${ExeNameOut}.err.\${SLURM_PROCID}" >> script_${ExeNameOut}.ksh 1476 EXECUTION="/usr/bin/time srun ./script_${ExeNameOut}.ksh" 1477 1478 IGCM_debug_Print 1 "sys Jean-Zay : script_${ExeNameOut}.ksh contains" 1479 cat script_${ExeNameOut}.ksh 1480 1481 fi 1482 1483 done 1484 1485 fi # ${OK_PARA_MPMD} 1486 1487 else 1265 1488 1266 1489 EXECUTION=${HOST_MPIRUN_COMMAND} … … 1634 1857 fi # ${OK_PARA_MPMD} 1635 1858 1859 fi 1636 1860 IGCM_debug_Print 1 "sys Irene-amd : execution command is " 1637 1861 IGCM_debug_Print 1 "$EXECUTION"
Note: See TracChangeset for help on using the changeset viewer.