Context Navigation

source: TOOLS/PACK_IPSL/launch_and_measureTime.sh @ 1879

Last change on this file since 1879 was 1869, checked in by gpincka, 12 years ago
meilleure gestion de la verification de la progression avant et apres l'exec d'une instance
Property svn:executable set to *
File size: 15.7 KB

Line
1	#!/bin/bash
2
#######################################
# Build the list of commands that still have to be run after a pack execution.
#
# The new list is made of:
#   - every command reported in the log (lines starting with
#     '#executed by process') whose return code (9th field) is not 0;
#   - every command of the previous list whose list file (last field) is not
#     mentioned anywhere in the log, i.e. commands that were never processed
#     (for instance because the machine was interrupted).
#
# Handling of reported commands, by return code (last field = list file):
#   5  : re-queued with the first 'output_ncrcat' replaced by 'output_tar'
#   10 : re-queued as is AND with 'output_ncrcat' replaced by 'output_tar'
#   *  : re-queued as is
#
# Arguments:
#   $1 - file receiving the new list (created if missing, truncated first)
#   $2 - log file of the previous execution
#   $3 - previous list of commands
# Outputs:
#   $1 is rewritten, sorted, without duplicate lines.
#######################################
function update_tasks_list
{
  local tasksListFile=$1
  local logFileOfPrevPack=$2
  local tasksListFileOfPrevPack=$3
  local cmdReport resCmd cmdToPrint cmd list

  : > "$tasksListFile"

  # 1) Commands that were executed but did not return 0.
  if [ -r "$logFileOfPrevPack" ]
  then
    while IFS= read -r cmdReport || [ -n "$cmdReport" ]
    do
      # Only the execution reports are of interest.
      case "$cmdReport" in
        '#executed by process'*) ;;
        *) continue ;;
      esac

      resCmd=$( printf '%s\n' "$cmdReport" | awk '{print $9}' )
      if [ "x$resCmd" == "x0" ]
      then
        continue
      fi

      cmdToPrint=$( printf '%s\n' "$cmdReport" | awk '{print $NF}' )
      case "x$resCmd" in
        x5)
          printf '%s\n' "./process_list.sh ${cmdToPrint/output_ncrcat/output_tar}" >> "$tasksListFile"
          ;;
        x10)
          printf '%s\n' "./process_list.sh $cmdToPrint" >> "$tasksListFile"
          printf '%s\n' "./process_list.sh ${cmdToPrint/output_ncrcat/output_tar}" >> "$tasksListFile"
          ;;
        *)
          printf '%s\n' "./process_list.sh $cmdToPrint" >> "$tasksListFile"
          ;;
      esac
    done < "$logFileOfPrevPack"
  fi

  # 2) Commands of the previous list that were not processed at all: their
  #    list file does not appear in the log, so they go back into the new list.
  if [ -r "$tasksListFileOfPrevPack" ]
  then
    while IFS= read -r cmd || [ -n "$cmd" ]
    do
      list=$( printf '%s\n' "$cmd" | awk '{print $NF}' )
      # Blank line: nothing to look for (an empty pattern would match anything).
      if [ -z "$list" ]
      then
        continue
      fi
      # -F: the list path is a literal string, not a regular expression.
      if [ ! -r "$logFileOfPrevPack" ] || ! grep -F -q -- "$list" "$logFileOfPrevPack"
      then
        printf '%s\n' "$cmd" >> "$tasksListFile"
      fi
    done < "$tasksListFileOfPrevPack"
  fi

  # 3) Two entries may be identical: keep only one of each. Sorting in place
  #    avoids a scratch file with a fixed name in the current directory.
  sort -u -o "$tasksListFile" "$tasksListFile"
}
70
#######################################
# Print the number of the current try.
#
# The number is the first line of
# ${USER_OUTPUT_PROGRESS}/numero_current_try.txt (surrounding blanks removed).
#
# Globals:
#   USER_OUTPUT_PROGRESS (read), badFailureFile (read, file appended on error)
# Outputs:
#   the try number on stdout
# Exits:
#   with status 1 after logging to $badFailureFile when the file is missing or
#   its first line is not an unsigned integer. The function is meant to be
#   called through $( ... ): the caller must test the status of the
#   substitution, since 'exit' only ends the substitution's subshell.
#######################################
function getNumeroOfCurrentTry
{
  local tryNumFile="${USER_OUTPUT_PROGRESS}/numero_current_try.txt"
  local num_try=""

  if [ ! -e "$tryNumFile" ]
  then
    {
      echo "Le fichier :"
      echo "$tryNumFile"
      echo "doit etre present dans le repertoire :"
      echo "${USER_OUTPUT_PROGRESS}"
      echo "et il doit contenir un numero d'essai"
    } >> "$badFailureFile"
    exit 1
  fi

  # Default IFS makes 'read' drop leading/trailing blanks; '|| true' accepts a
  # first line without a trailing newline.
  IFS=$' \t\n' read -r num_try < "$tryNumFile" || true

  # An empty or garbled value would only fail later, in arithmetic tests.
  if [[ ! $num_try =~ ^[0-9]+$ ]]
  then
    {
      echo "Le fichier :"
      echo "$tryNumFile"
      echo "ne contient pas un numero d'essai valide"
    } >> "$badFailureFile"
    exit 1
  fi

  echo "$num_try"
}
88
#######################################
# Print the highest instance number found for a given try.
#
# Looks in ${USER_OUTPUT_PROGRESS}/TRY__<try> for entries whose name contains
# 'inputCmd__try__<try>__instance__<1 or 2 digits>.list' and prints the
# largest instance number, or 0 when there is none.
#
# Globals:
#   USER_OUTPUT_PROGRESS (read), badFailureFile (read, file appended on error)
# Arguments:
#   $1 - try number
# Outputs:
#   the instance number on stdout
# Exits:
#   with status 1 after logging to $badFailureFile when the directory does not
#   exist. Meant to be called through $( ... ): the caller must test the
#   status of the substitution.
#######################################
function getNumeroOfLastInstance
{
  local numTry=$1
  local num_instance=0
  local progressDirectory="${USER_OUTPUT_PROGRESS}/TRY__${numTry}"
  # The dot is escaped so that only a real '.list' suffix is accepted.
  local namePattern="inputCmd__try__${numTry}__instance__([[:digit:]]{1,2})\.list"
  local entry num

  if [ ! -d "$progressDirectory" ]
  then
    {
      echo "fonction getNumeroOfLastInstance :"
      echo "Le repertoire :"
      echo "$progressDirectory"
      echo "devrait exister. Il n'existe pas."
    } >> "$badFailureFile"
    exit 1
  fi

  # Globbing instead of parsing 'ls', and no 'cd': the working directory of the
  # caller is never touched. With no entry at all, the literal pattern is
  # iterated once and simply does not match.
  for entry in "$progressDirectory"/*
  do
    if [[ ${entry##*/} =~ $namePattern ]]
    then
      # 10# forces base 10, so '08' is not read as an invalid octal number.
      num=$(( 10#${BASH_REMATCH[1]} ))
      if [ "$num" -gt "$num_instance" ]
      then
        num_instance=$num
      fi
    fi
  done

  echo "$num_instance"
}
119
120
#######################################
# Tell whether something changed between two lists of commands.
#
# Prints 1 when:
#   - the two lists do not have the same number of lines, or
#   - a non-empty line of the first list is absent from the second, or
#   - the log holds at least one '#executed by' line whose 9th field
#     (return code) is not 0.
# Prints 0 otherwise.
#
# Globals:
#   badFailureFile (read, file appended on error)
# Arguments:
#   $1 - first list of commands
#   $2 - second list of commands
#   $3 - log file from which the return codes are read
# Outputs:
#   0 or 1 on stdout
# Exits:
#   with status 1 after logging to $badFailureFile when an argument is empty or
#   a file is missing. Meant to be called through $( ... ): the caller must
#   test the status of the substitution.
#######################################
function check_progress
{
  local file1=$1
  local file2=$2
  local file_to_get_retCode=$3
  local nbLineFile1 nbLineFile2 line nbNonZeroCodes

  if [ -z "${file1}" ] || [ -z "${file2}" ] || [ -z "${file_to_get_retCode}" ]
  then
    echo "check_progress : Le nom d'au moins 1 des 3 fichiers d'entree est vide" >> "$badFailureFile"
    exit 1
  fi

  if [ ! -e "$file1" ] || [ ! -e "$file2" ] || [ ! -e "${file_to_get_retCode}" ]
  then
    {
      echo "check_progress : au moins un des 3 fichiers suivants n'existe pas :"
      echo "$file1"
      echo "$file2"
      echo "${file_to_get_retCode}"
    } >> "$badFailureFile"
    exit 1
  fi

  nbLineFile1=$( wc -l < "$file1" )
  nbLineFile2=$( wc -l < "$file2" )
  if [ "$nbLineFile1" -ne "$nbLineFile2" ]
  then
    echo 1
    return 0
  fi

  # From here on, both files have the same number of lines.
  # Reading with 'read' leaves IFS untouched, whichever way the loop ends.
  while IFS= read -r line || [ -n "$line" ]
  do
    if [ -z "$line" ]
    then
      continue
    fi
    # -F -x: the line must be present verbatim and as a whole line; a regex or
    # substring match would report 'a.c' as present when only 'abc' is.
    if ! grep -F -x -q -- "$line" "$file2"
    then
      echo 1
      return 0
    fi
  done < "$file1"

  # Is there any return code different from 0?
  nbNonZeroCodes=$( awk '/^#executed by/ && $9 != 0 { n++ } END { print n + 0 }' "${file_to_get_retCode}" )
  if [ "$nbNonZeroCodes" -gt 0 ]
  then
    echo 1
    return 0
  fi

  echo 0
}
174
175
#######################################
# Rewrite the report file from scratch with the list of submitted commands
# followed by the results of their execution.
#
# Globals:
#   reportFile (read) - file to (over)write
#   inputCmd   (read) - file holding the commands that were submitted
#   output     (read) - file holding the results of the commands
#######################################
function update_report
{
  # A single quoted redirection truncates the report once and keeps working
  # when the path contains blanks.
  {
    echo "Execution of tasks :"
    echo "------------------"
    cat "$inputCmd"
    echo
    echo "Results of tasks :"
    echo "----------------"
    cat "$output"
    echo
    echo
  } > "$reportFile"
}
190
export RANDOM=$$ # random seed

#######################################
# Print a random integer between 1 and the given limit (inclusive).
#
# Arguments:
#   $1 - upper limit; expected to be a positive integer
# Outputs:
#   the number on stdout; 0 when the limit is 0 or not an unsigned integer
#   (there is then no valid number to draw).
#######################################
function gives_random_number
{
  local lim=$1
  local bit=0

  # Validate first: a modulo by 0 or by a non-numeric word is an arithmetic
  # error. 10# forces base 10 for limits written with leading zeros.
  if [[ $lim =~ ^[0-9]+$ ]] && (( 10#$lim > 0 ))
  then
    bit=$(( RANDOM % 10#$lim + 1 ))
  fi

  echo "$bit"
}
200
201	########## batch directives : begin ##########
202	#MSUB -r pack_ipsl # Nom du job
203	### mutable directives ###
204	#MSUB -o /ccc/dmfbuf/import_data.2/ccrt/dmnfs12/cont003/bacasable/GUILLAUME/PSEUDO_DMNFS_PROGRESS/zIGCM_OUT/detailed_pack_output/pack_ipsl_%I.o
205	#MSUB -e /ccc/dmfbuf/import_data.2/ccrt/dmnfs12/cont003/bacasable/GUILLAUME/PSEUDO_DMNFS_PROGRESS/zIGCM_OUT/detailed_pack_output/pack_ipsl_%I.e
206	#MSUB -n 7
207	#MSUB -T 900
208	#MSUB -A tgcc0013
209	#MSUB -q standard
210	#MSUB -Qos test
211	########## batch directives : end ##########
212
# ---------------------------------------------------------------------------
# Environment of the run: working directories, shared utilities and the names
# of every file attached to this try / instance.
# ---------------------------------------------------------------------------
export JOB_DIR=${LS_SUBCWD:-${PWD}}
export EXE_DIR=${JOB_DIR}
source "${EXE_DIR}/DEM_utilities.sh"

# File in which every fatal problem is reported.
export badFailureFile="${USER_OUTPUT_PROGRESS}/badFailure.txt"

# The helpers below run inside $( ... ): their 'exit 1' only ends the
# substitution. The assignment is kept apart from 'export' so that its status
# can be tested and the script really stops when a helper has failed (the
# helper has already written the reason to $badFailureFile).
numCurrentTry=$( getNumeroOfCurrentTry ) || exit 1
export numCurrentTry
export progressDir="${USER_OUTPUT_PROGRESS}/TRY__${numCurrentTry}"
if [ ! -e "$progressDir" ]
then
  {
    echo "Le repertoire de suivi :"
    echo "$progressDir"
    echo "n'existe pas. STOP."
  } >> "$badFailureFile"
  exit 1
fi
numPrevInstance=$( getNumeroOfLastInstance "$numCurrentTry" ) || exit 1
export numPrevInstance
export numNewInstance=$(( numPrevInstance + 1 ))

# Files of the instance that is about to run.
export inputCmd="${progressDir}/inputCmd__try__${numCurrentTry}__instance__${numNewInstance}.list"
export nextInputCmd="${progressDir}/nextInputCmd__try__${numCurrentTry}__instance__${numNewInstance}.list"
export output="${progressDir}/packOutput__try__${numCurrentTry}__instance__${numNewInstance}.log"
export reportFile="${progressDir}/report__try__${numCurrentTry}__instance__${numNewInstance}.log"
export checkFile="${progressDir}/check__try__${numCurrentTry}__instance__${numNewInstance}.log"
# Scratch file of the optional check step; relative to the working directory.
export checkFileTmp="checkTmp__try__${numCurrentTry}__instance__${numNewInstance}.txt"

# Marker written at the end of the script when the run was not interrupted.
export noInterruptFile="${progressDir}/noInterrupt__try__${numCurrentTry}__instance__${numNewInstance}.txt"
239
240
241	# a virer
242	#echo "inputCmd=$inputCmd" >> $badFailureFile
243	#echo "nextInputCmd=$nextInputCmd" >> $badFailureFile
244	#echo "output=$nextInputCmd" >> $badFailureFile
245	#echo "reportFile=$nextInputCmd" >> $badFailureFile
246	#echo "noInterruptFile=$noInterruptFile" >> $badFailureFile
247
248	# exit 0 # a virer
249
250	# a virer
251	#if [ ${numNewInstance} -ge 4 ]
252	#then
253	# echo "inputCmd=$inputCmd" >> $badFailureFile
254	# echo "nextInputCmd=$nextInputCmd" >> $badFailureFile
255	# echo "output=$nextInputCmd" >> $badFailureFile
256	# echo "reportFile=$nextInputCmd" >> $badFailureFile
257	# echo "noInterruptFile=$noInterruptFile" >> $badFailureFile
258	# echo >> $badFailureFile
259	#fi
260
# ---------------------------------------------------------------------------
# Build ${inputCmd}, the list of commands of this instance.
#   try <= 1 and instance <= 1 : inventory of the *.list files
#   instance >= 2              : list left by the previous instance of this try
#   try > 1 and instance == 1  : list left by (or rebuilt from) the last
#                                instance of the previous try
# ---------------------------------------------------------------------------
if [ "${numCurrentTry}" -le 1 ] && [ "${numNewInstance}" -le 1 ]
then
  # Very first attempt: the task list is built from every *.list file found
  # under the directories named in the first column of "config_card.liste".
  : > "${inputCmd}"
  for CONFIG in $( awk '{print $1}' "${IGCM_DEM}/config_card.liste" )
  do
    PATH_SIMU=$( dirname "$CONFIG" )

    find "$PATH_SIMU" -type f -name "*.list" |
      while IFS= read -r file
      do
        echo "./process_list.sh $file" >> "${inputCmd}"
      done
  done

else
  if [ "${numNewInstance}" -ge 2 ]
  then
    # Same try, next instance: reuse the list prepared by the previous one.
    nextInputCmd_of_PrevInst="${progressDir}/nextInputCmd__try__${numCurrentTry}__instance__${numPrevInstance}.list"
    if [ ! -e "$nextInputCmd_of_PrevInst" ]
    then
      {
        echo "Le fichier suivant :"
        echo "$nextInputCmd_of_PrevInst"
        echo "n'existe pas. Il devrait exister. STOP."
      } >> "$badFailureFile"
      exit 1
    fi
    cat "$nextInputCmd_of_PrevInst" > "${inputCmd}"

  else # numNewInstance == 1
    # First instance of a new try: start from the last instance of the
    # previous try, whose files live in that try's own directory.
    numPrevTry=$(( numCurrentTry - 1 ))
    # The helper runs in a subshell: stop here if it reported a failure.
    numLastInstInstanceInPrevTry=$( getNumeroOfLastInstance "$numPrevTry" ) || exit 1
    prevProgressDir="${USER_OUTPUT_PROGRESS}/TRY__${numPrevTry}"
    nextInputCmd_of_LastInst="${prevProgressDir}/nextInputCmd__try__${numPrevTry}__instance__${numLastInstInstanceInPrevTry}.list"
    # A dedicated variable is used on purpose: ${noInterruptFile} names the
    # marker of the CURRENT instance and must not be overwritten here.
    noInterruptFile_of_LastInst="${prevProgressDir}/noInterrupt__try__${numPrevTry}__instance__${numLastInstInstanceInPrevTry}.txt"

    if [ ! -e "$noInterruptFile_of_LastInst" ]
    then
      # The last try was interrupted unexpectedly: the list of commands has to
      # be rebuilt from the results of its last instance.
      output_of_LastInst="${prevProgressDir}/packOutput__try__${numPrevTry}__instance__${numLastInstInstanceInPrevTry}.log"
      inputCmd_of_LastInst="${prevProgressDir}/inputCmd__try__${numPrevTry}__instance__${numLastInstInstanceInPrevTry}.list"
      if [ ! -e "$output_of_LastInst" ] || [ ! -e "$inputCmd_of_LastInst" ]
      then
        {
          echo "Les fichiers suivants :"
          echo "$output_of_LastInst"
          echo "$inputCmd_of_LastInst"
          echo "n'existent pas. Il devrait exister. STOP."
        } >> "$badFailureFile"
        exit 1
      fi

      update_tasks_list "${nextInputCmd_of_LastInst}" "${output_of_LastInst}" "${inputCmd_of_LastInst}"
    else
      # The last try ended normally: its next list is already available.
      if [ ! -e "$nextInputCmd_of_LastInst" ]
      then
        {
          echo "Le fichier suivant :"
          echo "$nextInputCmd_of_LastInst"
          echo "n'existe pas. Il devrait exister. STOP."
        } >> "$badFailureFile"
        exit 1
      fi
    fi
    cat "$nextInputCmd_of_LastInst" > "${inputCmd}"

  fi
fi
335
# ---------------------------------------------------------------------------
# Run the commands of ${inputCmd}, then produce the report and the list of
# commands for the next instance.
# ---------------------------------------------------------------------------

# Initialisation of the report: by default, the computation is assumed to have
# been interrupted (the real report only replaces this text after the run).
echo "No report. Le computation must have interrupted." > "$reportFile"

# To be removed #######################################
# Safety stop: never go beyond the 10th instance.
if [ "${numNewInstance}" -ge 10 ]
then
  echo >> "$badFailureFile"
  echo "10eme instance. STOP." >> "$badFailureFile"
  exit 1
fi
#######################################################

: > "$timeEndFile"

# The standard error of the launcher is what ends up in ${output}.
ccc_mprun ./glost_launch -R "$timeLimitBeforeEnd" "${inputCmd}" 2>"${output}"

# Other ways of launching, kept for reference:
### ccc_mprun ./cmd_launch.exe ${inputCmd} 2>${output}
### ./cmd_launch.exe ${inputCmd} 2>${output}
### ccc_mprun -p standard -n ${BRIDGE_MSUB_NPROC} ./cmd_launch.exe ${inputCmd} 2>myIO/output.log
### mpirun -n 4 ./cmd_launch.exe myIO/inputCmd10.list 2>myIO/output.log

endExecutionTime=$( getDateMilliSeconds )
echo "end time:$endExecutionTime" >> "$timeEndFile"

# Restart management:
# -------------------

update_report

update_tasks_list "$nextInputCmd" "$output" "$inputCmd"
378
# ---------------------------------------------------------------------------
# Optional step: re-check a random sample of the lists whose processing
# returned 0.
# ---------------------------------------------------------------------------
if [ "x${doYouWantCheck}" == "xyes" ]
then
  if [ "x${nbListsToCheck}" == "x" ]
  then
    echo "nbre de listes a checker absent" >> "$checkFile"
    echo "nbre de listes a checker absent" >> "$badFailureFile"
    exit 1
  fi

  # One check command per list reported with return code 0 (9th field) and
  # whose list file (12th field) contains "output_ncrcat".
  : > "$checkFileTmp"
  awk '/^#executed by process/ && $9 == 0 && $12 ~ /output_ncrcat/ { print "./check_ncrcat_list.sh " $12 }' "$output" >> "$checkFileTmp"

  # Number of candidate lists.
  nb_of_good_lists=$( wc -l < "$checkFileTmp" )

  # The number of lists to check cannot exceed the number of candidates.
  if [ "$nbListsToCheck" -ge "$nb_of_good_lists" ]
  then
    nbListsToCheck=$nb_of_good_lists
  fi
  nbLstToCheck_tmp=$nbListsToCheck
  nbLstRemaining=$nb_of_good_lists

  while [ "$nbLstToCheck_tmp" -gt 0 ]
  do
    # Draw among ALL the commands still present in the scratch file; drawing
    # only in 1..nbLstToCheck_tmp would always select the first lines.
    random_number=$( gives_random_number "$nbLstRemaining" )
    checkCmd=$( sed -n "${random_number}p" "$checkFileTmp" )
    # Intentionally unquoted: the line is split into a command and its argument.
    $checkCmd
    resCmd=$?
    if [ "x${resCmd}" != "x0" ]
    then
      echo "$checkCmd ==> not OK ==> stop everything." >> "$checkFile"
      echo "$checkCmd ==> not OK ==> stop everything." >> "$badFailureFile"
      # exit 1 # to be restored
    else
      echo "$checkCmd ==> OK" >> "$checkFile"
    fi
    # The command that has just been run is removed from the scratch file.
    sed -i "${random_number}d" "$checkFileTmp"
    nbLstToCheck_tmp=$(( nbLstToCheck_tmp - 1 ))
    nbLstRemaining=$(( nbLstRemaining - 1 ))
  done

  rm -f -- "$checkFileTmp"
fi
431
# ---------------------------------------------------------------------------
# End of the instance: record that it was not interrupted, then stop or
# resubmit this script.
# ---------------------------------------------------------------------------
echo "no interruption has occured" > "${noInterruptFile}"

# Everything went fine: there is nothing left to run.
# ----------------------------------------------------
everythingOK=$( wc -l < "$nextInputCmd" )
if [ "$everythingOK" -eq 0 ]
then
  echo "Tout s'est fini correctement" >> "$badFailureFile"
  exit 0
fi

# Is there any progress between before and after the processing?
# ---------------------------------------------------------------
# 'check_progress' compares the two lists and also looks at the return codes
# found in the log of the run.
# It runs in a subshell: when it fails (empty name, missing file) nothing is
# printed, so its status is tested to avoid resubmitting on an empty answer.
file_to_get_lists_retCode=$output
resDiff=$( check_progress "$inputCmd" "$nextInputCmd" "$file_to_get_lists_retCode" ) || exit 1

# resDiff == 1 : something changed
# resDiff == 0 : nothing changed
if [ "x${resDiff}" == "x0" ]
then
  echo "Il n'y pas plus de progression" >> "$badFailureFile"
  exit 1
fi

# Chain with the same script.
ccc_msub launch_and_measureTime.sh

Note: See TracBrowser for help on using the repository browser.

Download in other formats: