import matplotlib.pyplot as plt
# Figure sizing and default styling for plots embedded in LaTeX documents.
# An alternative width that was tried: textwidth = 13
textwidth = 15
figheight = textwidth * 0.25

# Default figure spans two thirds of the text width.
plt.rc('figure', figsize=(textwidth * 0.66, figheight))
# LaTeX text rendering is left disabled: plt.rc('text', usetex=True)
plt.rc('font', family='serif')
# Dashed, half-transparent grid, enabled on every axes.
plt.rc('grid', linestyle="--", alpha="0.5")
plt.rc('axes', grid=True)
def latex_float(f):
    """Format a number as a LaTeX math fragment with two significant digits.

    The value is rendered with the ``.2g`` format. When that yields plain
    decimal notation the text is returned unchanged; when it yields scientific
    notation the result is ``m \\times 10^{e}``, or just ``10^{e}`` when the
    mantissa is exactly ``1``.
    """
    text = "{0:.2g}".format(f)
    if "e" not in text:
        return text
    mantissa, _, power = text.partition("e")
    # int() drops the sign padding and leading zeros ("+05" -> 5).
    power = int(power)
    if mantissa == "1":
        return r"10^{{{0}}}".format(power)
    return r"{0} \times 10^{{{1}}}".format(mantissa, power)
import numpy as np
import re
from matplotlib.ticker import FormatStrFormatter
!mkdir fig
!cat scripts/run_tmpl.def
Plusieurs exécutions indépendantes sont lancées en même temps sur un noeud Joliot-Curie (2x24 proc) :
<nbp> = 41
<nsplit> = 2
<thread_v> = 1
times_mpmd={}
# Parse the MPMD timing log into times_mpmd[dataset, nb_proc] = time (float).
# Each line must match ".../<dataset>_<nb_proc>.0.1.0:...: <time>"; a line
# that does not match makes re.match return None and stops the parsing.
# The file is opened in a `with` block so it is closed even when parsing
# fails, and the pattern is a raw string so that `\d` and `\s` are not
# treated as (invalid) Python string escapes.
with open("data/out_mpmd") as mpmd_log:
    for line in mpmd_log:
        ds,nb_proc,time= re.match(r".*/(.*)_(\d*).0.1.0:.*:\s*(\d*.\d*)", line).groups()
        times_mpmd[ds,int(nb_proc)]=float(time)
#Script d'execution :
!cat scripts/batch_mpmd.sh
#Résultats
#!grep -r "Time elapsed" out/*
times_mpmd
def plot_times_mpmd() :
    """Plot the MPMD run times against the number of processes.

    Reads the module-level ``times_mpmd`` dict, keyed by (dataset, nb_proc),
    and opens a new figure one third of ``textwidth`` wide. The "single"
    dataset is drawn at nb_proc and the "double" dataset at 2*nb_proc.
    """
    plt.figure(figsize=(0.33*textwidth, figheight))
    nb_procs = np.array(sorted({p for (_, p) in times_mpmd}))
    # (dataset key, x-axis multiplier, legend label)
    curves = (("single", 1, "1 NUMA"), ("double", 2, "2 NUMA"))
    for dataset, factor, label in curves:
        elapsed = np.array([times_mpmd[dataset, p] for p in nb_procs])
        plt.plot(factor*nb_procs, elapsed, 'd-', label=label)
    plt.xlim(1)
    plt.ylim(0)
    plt.xticks(2*nb_procs)
    plt.title("Temps d'execution MPMD")
    plt.xlabel("Nombre de processus")
    plt.ylabel("Temps (s)")
    plt.legend()
# Draw the MPMD timing figure, write it to fig/ as a PDF, then display it.
plot_times_mpmd()
plt.savefig("fig/times_mpmd.pdf", bbox_inches='tight', transparent=True)
plt.show()
def plot_eff_mpmd() :
    """Plot the MPMD efficiency, save it to fig/eff_mpmd.pdf and show it.

    Reads the module-level ``times_mpmd`` dict, keyed by (dataset, nb_proc).
    For each dataset the efficiency is the time at the smallest process count
    divided by the time at each process count, in percent. The "single"
    dataset is drawn at nb_proc and the "double" dataset at 2*nb_proc.
    """
    plt.figure(figsize=(0.33*textwidth, figheight))
    # Only process counts that exist for the "double" dataset are used.
    procs = np.array(sorted(set([p for (s,p) in times_mpmd.keys() if s=="double"])))
    times = np.array([times_mpmd['single',p] for p in procs])
    plt.plot(procs, times[0]/times*100, '-d', label='1 NUMA')
    times = np.array([times_mpmd['double',p] for p in procs])
    plt.plot(2*procs, times[0]/times*100, '-d', label='2 NUMA')
    plt.ylim([0,100])
    plt.xlim(1)
    # `base` replaces the `basex` keyword, which Matplotlib deprecated in 3.3
    # and later removed.
    plt.xscale('log',base=2)
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.xticks(procs)
    plt.title("Efficacité MPMD")
    plt.xlabel("Nombre de processus")
    # Raw string: same text as before, without the invalid "\%" escape.
    plt.ylabel(r"Efficacité (\%)")
    plt.legend()
    plt.savefig("fig/eff_mpmd.pdf", transparent=True, bbox_inches='tight')
    plt.show()
plot_eff_mpmd()
Dynamico est lancé en modifiant le nombre de processus et de threads (vertical) sur un noeud d'Irene (2*24 coeurs)
run.def :
<nbp> = 41
<nsplit> = 2
<thread_v> = (paramètre)
#Script d'execution :
!cat scripts/batch_scal.sh
# Parse the single-node scaling log into two dicts keyed by
# (nb_proc, nb_thread), one per thread distribution ("gather" / "scatter").
# Each line must match ".../p<nb_proc>_th<nb_thread>_<distrib>:...: <time>".
times_scal_gather={}
times_scal_scatter={}
# `with` closes the file even when parsing fails; the pattern is a raw string
# so that `\d` and `\s` are not treated as (invalid) Python string escapes.
with open("data/out_scal") as scal_log:
    for line in scal_log:
        nb_proc,nb_thread,distrib,time= re.match(r".*/p(\d*)_th(\d*)_([^:]*):.*:\s*(\d*.\d*)", line).groups()
        if(distrib=="scatter"):
            times_scal_scatter[int(nb_proc),int(nb_thread)]=float(time)
        elif(distrib=="gather"):
            times_scal_gather[int(nb_proc),int(nb_thread)]=float(time)
        else:
            # Unknown distribution: report it and keep going.
            print("error : d="+distrib)
# The scatter measurements are the ones used by the plots below.
times_scal=times_scal_scatter
times_scal
def plot_times_parallel(times_in) :
    """Plot run time against core count, one curve per thread count.

    ``times_in`` maps (nb_proc, nb_thread) to a time. Each curve is drawn at
    x = nb_proc*nb_thread on the current figure, and the x ticks are placed
    on every core count present in ``times_in``.
    """
    thread_counts = np.array(sorted({t for (_, t) in times_in}))
    for nb_thread in thread_counts:
        nb_procs = np.array(sorted(p for (p, t) in times_in if t == nb_thread))
        elapsed = np.array([times_in[p, nb_thread] for p in nb_procs])
        plt.plot(nb_procs*nb_thread, elapsed, 'd-', label=str(nb_thread))
    plt.ylim(0)
    plt.xlim(1)
    core_counts = sorted({p*t for (p, t) in times_in})
    plt.xticks(np.array(core_counts))
    plt.title("Temps d'execution")
    plt.xlabel("Nombre de coeurs")
    plt.ylabel("Temps (s)")
    plt.legend(title="Threads (vertical)")
# Single-node Skylake run times: one-third-width figure, saved as a PDF.
plt.figure(figsize=(textwidth*0.33, figheight))
plot_times_parallel(times_scal)
# Override the x range and the title set by plot_times_parallel.
plt.xlim([1,60])
plt.title("Temps d'exécution (1 noeud Skylake)")
plt.savefig("fig/times_1node_skylake.pdf", bbox_inches='tight', transparent=True)
plt.show()
def plot_speedup_parallel(times_in, ref_time, max_cpus) :
    """Plot speedup against core count on log2 axes, one curve per thread count.

    ``times_in`` maps (nb_proc, nb_thread) to a time. The speedup of a run is
    ``ref_time`` divided by its time, drawn at x = nb_proc*nb_thread on the
    current axes. A dashed line from (1, 1) to (max_cpus, max_cpus) is drawn
    first as the ideal-speedup guide.
    """
    plt.plot([1,max_cpus],[1,max_cpus],'--')
    threads = np.array(sorted([a for a in set([t for (p,t) in times_in.keys()])]))
    for thread in threads:
        procs = np.array(sorted([p for (p,t) in times_in.keys() if t==thread]))
        times = np.array([ref_time/times_in[p,thread] for p in procs])
        plt.plot(procs*thread, times, 'd-', label=str(thread))
    plt.xlim([1,max_cpus])
    # `base` replaces the `basex` / `basey` keywords, which Matplotlib
    # deprecated in 3.3 and later removed.
    plt.xscale('log',base=2)
    plt.yscale('log',base=2)
    # Show tick values as plain integers.
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.title("Speedup")
    plt.xlabel("Nombre de coeurs")
    plt.ylabel("Speedup")
    plt.legend(title="Threads (vertical)")
def plot_mpmd_double():
    """Overlay the MPMD "double" efficiency curve on the current axes.

    Reads the module-level ``times_mpmd`` dict. The efficiency is the time at
    the smallest process count divided by each time, in percent, drawn at
    x = 2*nb_proc.
    """
    nb_procs = np.array(sorted(p for (s, p) in times_mpmd if s == "double"))
    elapsed = np.array([times_mpmd["double", p] for p in nb_procs])
    efficiency = elapsed[0]/elapsed*100
    plt.plot(2*nb_procs, efficiency, '-d', label="mpmd")
def plot_eff_parallel(times_in, time_ref, max_cpus) :
    """Plot parallel efficiency against core count, one curve per thread count.

    ``times_in`` maps (nb_proc, nb_thread) to a time. The efficiency of a run
    is ``time_ref / time / (nb_proc*nb_thread)`` in percent, drawn at
    x = nb_proc*nb_thread on the current axes (log2 x scale). A dashed line at
    100 % from x = 1 to x = ``max_cpus`` is drawn first as a guide.
    """
    plt.plot([1,max_cpus],[100,100],'--')
    threads = np.array(sorted([a for a in set([t for (p,t) in times_in.keys()])]))
    for thread in threads:
        procs = np.array(sorted([p for (p,t) in times_in.keys() if t==thread]))
        times = np.array([time_ref/times_in[p,thread]/(p*thread)*100 for p in procs])
        plt.plot(procs*thread, times, 'd-', label=str(thread))
    plt.ylim(0)
    plt.xlim(1)
    # `base` replaces the `basex` keyword, which Matplotlib deprecated in 3.3
    # and later removed.
    plt.xscale('log',base=2)
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.title("Efficacité")
    plt.xlabel("Nombre de coeurs")
    # Raw string: same text as before, without the invalid "\%" escape.
    plt.ylabel(r"Efficacité (\%)")
    plt.legend(title="Threads (vertical)")
# Single-node Skylake scalability: speedup (left) and efficiency (right).
# The reference time is the 1 process x 1 thread run.
plt.figure(figsize=(textwidth*0.66, figheight))
plt.suptitle("Scalabilité (1 noeud Skylake, nbp=40)")

# Left panel: speedup. For linear axes, add plt.xscale('linear') and
# plt.yscale('linear') after the call.
plt.subplot(1, 2, 1)
plot_speedup_parallel(times_scal, times_scal[1,1], 48)

# Right panel: efficiency, with the MPMD "double" curve overlaid.
plt.subplot(1, 2, 2)
plot_eff_parallel(times_scal, times_scal[1,1],48)
plot_mpmd_double()
plt.xlim([1,48])

plt.tight_layout()
plt.savefig("fig/scal_1node_skylake.pdf", bbox_inches='tight', transparent=True)
plt.show()
Dynamico est lancé en changeant le découpage split_i et split_j en plus du découpage sur les threads verticaux. run.def :
<nbp> = 41
<nsplit_i>, <nsplit_j>, <thread_v> = (paramètre)
!cat scripts/batch_param.sh
!cat scripts/batch_multinode_strong_skylake.sh
# Parse the Skylake multi-node log into a dict keyed by
# (nbp, split, nb_proc, threads_v) with the time as a float.
# Each line must match ".../nbp<nbp>_s<split>_p<nb_proc>_t<threads_v>/out:...: <time>".
times_skylake_multinode={}
# `with` closes the file even when parsing fails; the pattern is a raw string
# so that `\d` and `\s` are not treated as (invalid) Python string escapes.
with open("data/out_skylake_multinode") as skylake_log:
    for line in skylake_log:
        nbp,split,nb_proc,threads_v,time= re.match(r".*/nbp(\d*)_s(\d*)_p(\d*)_t(\d*)/out:.*:\s*(\d*.\d*)", line).groups()
        times_skylake_multinode[int(nbp),int(split),int(nb_proc),int(threads_v)]=float(time)
times_skylake_multinode
Les valeurs de référence pour le speedup et la scalabilité sont les cas avec le plus petit nombre de CPUs qui remplissent complètement les noeuds (ex : 40 proc x 6 threads = 240 CPUs = 5 noeuds x 2 sockets x 24 CPUs)
def multinode_scal(times_multinodes, arch_name, max_cpus, ref):
    """Plot, save and show multi-node speedup and efficiency for each nbp.

    ``times_multinodes`` maps (nbp, split, nb_proc, nb_thread) to a time.
    ``ref`` maps each nbp to the (nb_proc, nb_thread) pair used as reference.
    For nbp in 40, 80 and 160, a two-panel figure (speedup, efficiency) is
    written to ``fig/scal_multinode_<arch_name>_nbp<nbp>.pdf``. The reference
    run is marked with a red cross; both x axes (and the speedup y axis) start
    at 32 and end at ``max_cpus``.
    """
    for nbp in (40, 80, 160):
        # Keep only the runs at this nbp, re-keyed by (nb_proc, nb_thread).
        subset = {}
        for (n, _split, p, t), elapsed in times_multinodes.items():
            if n == nbp:
                subset[p, t] = elapsed

        ref_procs, ref_threads = ref[nbp]
        ref_cpus = ref_procs*ref_threads
        # Scale the reference time by its CPU count so that the reference
        # run sits at speedup = ref_cpus and efficiency = 100.
        time_ref = subset[ref_procs, ref_threads]*ref_procs*ref_threads

        plt.figure(figsize=(0.66*textwidth, figheight))
        plt.suptitle("Scalabilité ("+arch_name+", nbp="+str(nbp)+")")

        # Left panel: speedup.
        plt.subplot(121)
        plt.scatter([ref_cpus], [ref_cpus], s=100, marker='x', label="Référence", c='r')
        plot_speedup_parallel(subset, time_ref, max_cpus)
        plt.xlim([32,max_cpus])
        plt.ylim([32,max_cpus])

        # Right panel: efficiency.
        plt.subplot(122)
        plt.scatter([ref_cpus], [100], s=100, marker='x', label="Référence", c='r')
        plot_eff_parallel(subset, time_ref, max_cpus)
        plt.xlim([32,max_cpus])

        plt.tight_layout()
        pdf_path = "fig/scal_multinode_"+arch_name+"_nbp"+str(nbp)+".pdf"
        plt.savefig(pdf_path, transparent=True, bbox_inches='tight')
        plt.show()
# 40 procs x 6 threads is the first configuration to fill the nodes completely.
# Alternative reference that was tried: {40:(10,4), 80:(10,4), 160:(40,4)}
ref = {
    40: (40, 6),
    80: (40, 6),
    160: (40, 6),
}
multinode_scal(times_skylake_multinode, "Skylake", 120*48, ref)
!cat scripts/batch_knl.sh
# Parse the single-node KNL scaling log into two dicts keyed by
# (nb_proc, nb_thread), one per thread distribution ("gather" / "scatter").
# Each line must match ".../p<nb_proc>_th<nb_thread>_<distrib>:...: <time>".
times_knl_gather={}
times_knl_scatter={}
# `with` closes the file even when parsing fails; the pattern is a raw string
# so that `\d` and `\s` are not treated as (invalid) Python string escapes.
with open("data/out_scal_knl") as knl_log:
    for line in knl_log:
        nb_proc,nb_thread,distrib,time= re.match(r".*/p(\d*)_th(\d*)_([^:]*):.*:\s*(\d*.\d*)", line).groups()
        if(distrib=="scatter"):
            times_knl_scatter[int(nb_proc),int(nb_thread)]=float(time)
        elif(distrib=="gather"):
            times_knl_gather[int(nb_proc),int(nb_thread)]=float(time)
        else:
            # Unknown distribution: report it and keep going.
            print("error : d="+distrib)
# The gather measurements are the ones used by the KNL plots below.
times_knl=times_knl_gather
times_knl_gather,times_knl_scatter
%matplotlib inline
# Single-node KNL figures. The reference time is the 1 process x 1 thread run.
time_ref=times_knl[1,1]
max_cpus=64
# Run times (displayed only, not saved).
plt.figure(figsize=(0.33*textwidth, figheight))
plot_times_parallel(times_knl)
plt.show()
# Scalability: speedup (left) and efficiency (right).
plt.figure(figsize=(0.66*textwidth, figheight))
plt.suptitle("Scalabilité (1 noeud KNL, nbp=40)")
#Speedup
plt.subplot(121)
plot_speedup_parallel(times_knl, time_ref, max_cpus)
plt.xlim([1,max_cpus])
plt.ylim([1,max_cpus])
#Efficiency
plt.subplot(122)
plot_eff_parallel(times_knl,time_ref, max_cpus)
plt.xlim([1,max_cpus])
plt.tight_layout()
# The file name used to be built from `nbp`, which at this point is only a
# string left over from an earlier log-parsing loop, and it was labelled
# "multinode" although this is the single-node plot. It now follows the
# naming of the single-node Skylake figure.
plt.savefig("fig/scal_1node_knl.pdf", transparent=True, bbox_inches='tight')
plt.show()
!cat scripts/batch_multinode_strong_knl.sh
# Parse the KNL multi-node log into a dict keyed by
# (nbp, split, nb_proc, threads_v) with the time as a float.
# Each line must match ".../nbp<nbp>_s<split>_p<nb_proc>_t<threads_v>/out:...: <time>".
times_knl_multinode={}
# `with` closes the file even when parsing fails; the pattern is a raw string
# so that `\d` and `\s` are not treated as (invalid) Python string escapes.
with open("data/out_knl_multinode") as knl_multinode_log:
    for line in knl_multinode_log:
        nbp,split,nb_proc,threads_v,time= re.match(r".*/nbp(\d*)_s(\d*)_p(\d*)_t(\d*)/out:.*:\s*(\d*.\d*)", line).groups()
        times_knl_multinode[int(nbp),int(split),int(nb_proc),int(threads_v)]=float(time)
times_knl_multinode
Les valeurs de référence pour le speedup et la scalabilité sont les cas avec le plus petit nombre de CPUs qui remplissent complètement les noeuds (ex : 40 proc x 8 threads = 320 CPUs = 5 noeuds x 64 CPUs)
# Average node occupancy: for each thread count, the total CPU count
# (proc_count*t) divided by the number of 64-CPU nodes needed to hold it.
thread_count=np.array([4,8,16])
split=np.array([1,2,4,8])
proc_count=split*split*10
for t in thread_count:
    cpus=proc_count*t
    nodes=np.ceil(cpus/64)
    util=cpus/nodes
    plt.plot(cpus, util, '-d', label=str(t))
plt.xlim([1,1000])
plt.ylim(0)
plt.title("Occupation moyenne des noeuds")
plt.xlabel("Nombre de processus")
plt.ylabel("Nombre de CPUs par noeud")
plt.legend(title="Threads")
plt.show()
# 10 procs x 8 threads or 40 procs x 4 threads are the first to fill the
# nodes (64 threads per KNL).
# NOTE(review): the reference actually used below is 40 procs x 8 threads
# for every nbp -- confirm which statement is current.
# Alternative reference that was tried: {40:(10,4), 80:(10,4), 160:(40,4)}
ref = {
    40: (40, 8),
    80: (40, 8),
    160: (40, 8),
}
multinode_scal(times_knl_multinode, "KNL", 100*64, ref)
def get_best(times):
    """Print the fastest run for each nbp in 40, 80 and 160.

    ``times`` maps (nbp, split, nb_proc, nb_thread) to a time. For each nbp
    one line is printed with the core count (nb_proc*nb_thread) of the
    fastest run, the largest core count present, and the fastest time.
    Ties on time are broken by the smaller core count.
    """
    for nbp in (40, 80, 160):
        # (time, core count) for every run at this nbp.
        runs = [(times[key], key[2]*key[3]) for key in times if key[0] == nbp]
        max_proc = max(cores for (_, cores) in runs)
        time_min, nb_proc_time_min = min(runs)
        print("nbp:", nbp, " proc:", nb_proc_time_min, "/", max_proc, " time:", time_min)
# Report the fastest configuration per nbp, first for KNL, then for Skylake.
for arch_label, arch_times in (
    ("knl", times_knl_multinode),
    ("skylake", times_skylake_multinode),
):
    print(arch_label)
    get_best(arch_times)