import matplotlib.pyplot as plt
# Figure geometry, sized for inclusion in LaTeX documents.
# (A previous layout used textwidth = 13.)
textwidth = 15
figheight = textwidth / 4

# Global matplotlib defaults shared by every figure of the notebook.
# LaTeX text rendering is currently left disabled:
#   plt.rc('text', usetex=True)
plt.rcParams.update({
    'figure.figsize': (0.66*textwidth, figheight),
    'font.family': 'serif',
    'grid.linestyle': "--",
    'grid.alpha': "0.5",
    'axes.grid': True,
})

def latex_float(f):
    """Format a number as a LaTeX math fragment with two significant digits.

    The value is first rendered with the ``.2g`` format. If that rendering
    uses scientific notation, the exponent is rewritten as a power of ten
    (``10^{n}`` when the mantissa is exactly ``1``, ``m \\times 10^{n}``
    otherwise); plain renderings are returned unchanged.
    """
    text = format(f, ".2g")
    if "e" not in text:
        return text
    mantissa, power = text.split("e")
    # int() drops the sign padding and leading zeros of the exponent ("+03" -> 3).
    power_of_ten = "10^{%d}" % int(power)
    if mantissa == "1":
        return power_of_ten
    return mantissa + r" \times " + power_of_ten

import numpy as np
import re
from matplotlib.ticker import FormatStrFormatter

!mkdir fig

mkdir: cannot create directory ‘fig’: File exists

Cas test¶

Paramètres:

<nbp>
<nsplit>
<thread_v>

!cat scripts/run_tmpl.def

INCLUDEDEF = const.def

#---------------- Mesh ----------------

# Number of subdivisions on a main triangle : integer (default=40)
nbp = <nbp>

# Number of vertical layers : integer (default=19)
llm = 79

# Vertical grid : [std|ncar|ncarl30] (default=std)
disvert = std

# Mesh optimisation : number of iterations : integer (default=0)
optim_it = 100

# Sub splitting of main rhombus : integer (default=1)
nsplit_i = <nsplit>
nsplit_j = <nsplit>
omp_level_size=<thread_v>

#---------------- Numerics ----------------

# Time step in s : real (default=480)
dt = 120

# Dissipation time for grad(div) : real (default=5000)
tau_graddiv = 9000

# Exponent of grad(div) disspation : integer (default=1)
nitergdiv = 2

# Dissipation time for curl(curl) : real (default=5000)
tau_gradrot = 9000

# Exponent of curl(curl) disspation : integer (default=1)
nitergrot = 2

# Dissipation time for div(grad) : real (default=5000)
tau_divgrad = 9000

# Exponent of div(grad) disspation : integer (default=1)
niterdivgrad = 2


#---------------- Time ----------------

# Run length in s : real (default=??)
run_length = 864000

# Interval in s between two outputs : integer (default=??)
write_period = 8640


#---------------- Physical parameters ----------------

# Number of tracers : integer (default=1)
nqtot = 5

# Initial state : 
#   [jablonowsky06|academic|dcmip[1-4]|heldsz|dcmip2_schaer_noshear] (default=jablonowsky06)
etat0 = dcmip2016_baroclinic_wave


itau_physics=1
#physics=dcmip2016
#physics_dcmip2016=moist_baroclinic
physics=none


itau_check_conserv=1000
time_scheme=ARK2.3

Extraire les données¶

Efficacité MPMD¶

Plusieurs exécutions indépendantes sont lancées en même temps sur un noeud Joliot-Curie (2x24 proc) :

single : en remplissant un seul noeud NUMA
double : en remplissant les 2 noeuds NUMA

run.def :
<nbp> = 41
<nsplit> = 2
<thread_v>= 1
dt = 720
run_length=86400

# Parse the MPMD benchmark timings into times_mpmd, keyed by
# (dataset name, number of processes) with the elapsed time as value.
# The pattern is a raw string so that \d and \s reach the regex engine
# verbatim instead of being (invalid) string escapes, and the file is opened
# in a `with` block so that it is closed once parsed.
# The loop variable names are unchanged: they stay visible at module level.
times_mpmd={}
mpmd_line_re = re.compile(r".*/(.*)_(\d*).0.1.0:.*:\s*(\d*.\d*)")
with open("data/out_mpmd") as mpmd_file:
    for line in mpmd_file:
        ds,nb_proc,time = mpmd_line_re.match(line).groups()
        times_mpmd[ds,int(nb_proc)] = float(time)

#Script d'execution :
!cat scripts/batch_mpmd.sh
#Résultats 
#!grep -r "Time elapsed" out/*
times_mpmd

#!/usr/bin/bash
#SBATCH -J Dynamico_scal_mpmd
#SBATCH -N 1 
#SBATCH --exclusive
#SBATCH -A gen0826@skylake
#SBATCH -p skylake
#SBATCH -o bench_mpmd.out
#SBATCH -e bench_mpmd.err

set -x

mkdir out

rm run.def
cp run_mpmd.def run.def

for nb_proc in 1 2 4 8 12 16 24 48
do 
    for (( i=0; i<$nb_proc; i++ ))
    do
	OMP_NUM_THREADS=1 srun -n 1 -m cyclic:cyclic -o out/${nb_proc}.$i -slot-list $i ./icosa_gcm &
	pids[${i}]=$!
    done

    # wait for all pids
    for pid in ${pids[*]}; do
	wait $pid
    done
done

{('double', 1): 135.107,
 ('double', 12): 204.0884,
 ('double', 16): 254.2374,
 ('double', 2): 138.1456,
 ('double', 24): 400.6108,
 ('double', 4): 144.1014,
 ('double', 8): 162.3409,
 ('single', 1): 134.921,
 ('single', 12): 203.7471,
 ('single', 16): 254.3122,
 ('single', 2): 138.167,
 ('single', 24): 403.0065,
 ('single', 4): 143.8789,
 ('single', 8): 162.4821}

def plot_times_mpmd():
    """Plot the MPMD execution time against the number of processes.

    Reads the module-level ``times_mpmd`` mapping, keyed by
    ``(placement, process count)``, and draws one curve per placement on a
    new figure. The "double" curve is drawn at twice its process count.
    The figure is neither saved nor shown here.
    """
    plt.figure(figsize=(0.33*textwidth, figheight))
    proc_counts = np.array(sorted({count for (_, count) in times_mpmd}))
    curves = (("single", 1, "1 NUMA"), ("double", 2, "2 NUMA"))
    for placement, scale, legend_label in curves:
        elapsed = np.array([times_mpmd[placement, count] for count in proc_counts])
        plt.plot(scale*proc_counts, elapsed, 'd-', label=legend_label)
    # Only the lower bounds are fixed; the upper bounds stay automatic.
    plt.xlim(1)
    plt.ylim(0)
    plt.xticks(2*proc_counts)
    plt.title("Temps d'execution MPMD")
    plt.xlabel("Nombre de processus")
    plt.ylabel("Temps (s)")
    plt.legend()
    
# Draw the MPMD timing figure, export it as a PDF with a transparent
# background and a tight bounding box, then display it.
plot_times_mpmd()
plt.savefig("fig/times_mpmd.pdf", transparent=True, bbox_inches='tight')
plt.show()

def plot_eff_mpmd():
    """Plot, save and show the MPMD efficiency for both placements.

    Efficiency is the time of the smallest process count divided by the time
    at each process count, in percent. It is read from the module-level
    ``times_mpmd`` mapping keyed by ``(placement, process count)``. The
    "double" curve is drawn at twice its process count. The figure is
    written to ``fig/eff_mpmd.pdf`` and then displayed.
    """
    plt.figure(figsize=(0.33*textwidth, figheight))
    procs = np.array(sorted({p for (s, p) in times_mpmd if s == "double"}))
    times = np.array([times_mpmd['single', p] for p in procs])
    plt.plot(procs, times[0]/times*100, '-d', label='1 NUMA')
    times = np.array([times_mpmd['double', p] for p in procs])
    plt.plot(2*procs, times[0]/times*100, '-d', label='2 NUMA')
    plt.ylim([0, 100])
    plt.xlim(1)
    # The log base keyword is `base`; the per-axis `basex` spelling was
    # deprecated in matplotlib 3.3 and later removed.
    plt.xscale('log', base=2)
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.xticks(procs)
    plt.title("Efficacité MPMD")
    plt.xlabel("Nombre de processus")
    # Raw string: same text as before, without the invalid "\%" string escape.
    plt.ylabel(r"Efficacité (\%)")
    plt.legend()
    plt.savefig("fig/eff_mpmd.pdf", transparent=True, bbox_inches='tight')
    plt.show()
    
# Build the MPMD efficiency figure (the function also saves and shows it).
plot_eff_mpmd()

Observations¶

La performance est moins bonne avec un socket chargé (probablement à cause de la contention mémoire)
Même performance sur 1 ou 2 sockets (24 ou 48 procs) : confirme l'hypothèse de la contention mémoire

MPI + OpenMP¶

Dynamico est lancé en faisant varier le nombre de processus et de threads (verticaux) sur un noeud d'Irene (2*24 coeurs)

run.def :

<nbp> = 41
<nsplit> = 2
<thread_v>= (paramètre)
dt = 720
run_length = 86400

#Script d'execution :
!cat scripts/batch_scal.sh

#!/usr/bin/bash
#SBATCH -J Dynamico_scal
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH -A gen0826@skylake 
#SBATCH -p skylake
#SBATCH -o out_scal.o
#SBATCH -e out_scal.e

set -x

export OMP_STACKSIZE=500M

mkdir out

for proc in 1 2 4 8 12 16 24 48
do
    for thread_h in 1 2 4 8
    do
        dir=p${proc}_th${thread_h}
	sed 's/<thread_h>/'$thread_h'/g' run_scal.def > run.def
	let "c=48 / $proc"
	OMP_NUM_THREADS=$thread_h srun -c $thread_h -n $proc -m block:block ./icosa_gcm > out/${dir}_gather
	OMP_NUM_THREADS=$thread_h srun -c $c -n $proc -m block:block ./icosa_gcm > out/${dir}_scatter

    done
done

# Parse the single-node scaling timings. Each line is matched for a process
# count, a thread count, a distribution name and an elapsed time; the time is
# stored under (processes, threads) in the dict of its distribution.
# The pattern is a raw string so that \d and \s are regex escapes rather than
# (invalid) string escapes, and the file is closed by the `with` block.
# The loop variable names are unchanged: they stay visible at module level.
times_scal_gather={}
times_scal_scatter={}
scal_line_re = re.compile(r".*/p(\d*)_th(\d*)_([^:]*):.*:\s*(\d*.\d*)")
with open("data/out_scal") as scal_file:
    for line in scal_file:
        nb_proc,nb_thread,distrib,time = scal_line_re.match(line).groups()
        if distrib == "scatter":
            times_scal_scatter[int(nb_proc),int(nb_thread)] = float(time)
        elif distrib == "gather":
            times_scal_gather[int(nb_proc),int(nb_thread)] = float(time)
        else:
            print("error : d="+distrib)

# The scatter measurements are the ones used by the plots below.
times_scal=times_scal_scatter
times_scal

{(1, 1): 133.1368,
 (1, 2): 71.6293,
 (1, 4): 44.6252,
 (1, 8): 32.7814,
 (2, 1): 66.6458,
 (2, 2): 34.5381,
 (2, 4): 18.7718,
 (2, 8): 16.5009,
 (4, 1): 34.0398,
 (4, 2): 17.9557,
 (4, 4): 10.5636,
 (4, 8): 10.15,
 (8, 1): 18.0365,
 (8, 2): 10.3324,
 (8, 4): 7.2838,
 (8, 8): 51.2491,
 (12, 1): 14.863,
 (12, 2): 9.5074,
 (12, 4): 7.4496,
 (12, 8): 53.3071,
 (16, 1): 12.888,
 (16, 2): 9.3343,
 (16, 4): 74.7979,
 (16, 8): 111.0557,
 (24, 1): 10.9958,
 (24, 2): 8.9941,
 (24, 4): 59.852,
 (24, 8): 154.5399}

def plot_times_parallel(times_in):
    """Plot execution time against core count on the current axes.

    ``times_in`` maps ``(process count, thread count)`` to a time. One curve
    is drawn per thread count, in increasing order, with the product
    processes x threads on the x axis and the thread count as legend label.
    """
    procs_by_thread = {}
    for (nb_proc, nb_thread) in times_in:
        procs_by_thread.setdefault(nb_thread, []).append(nb_proc)

    for nb_thread in sorted(procs_by_thread):
        nb_procs = np.array(sorted(procs_by_thread[nb_thread]))
        elapsed = np.array([times_in[nb_proc, nb_thread] for nb_proc in nb_procs])
        plt.plot(nb_procs*nb_thread, elapsed, 'd-', label=str(nb_thread))

    plt.ylim(0)
    plt.xlim(1)
    # One tick per distinct core count present in the data.
    core_counts = {nb_proc*nb_thread for (nb_proc, nb_thread) in times_in}
    plt.xticks(np.array(sorted(core_counts)))
    plt.title("Temps d'execution")
    plt.xlabel("Nombre de coeurs")
    plt.ylabel("Temps (s)")
    plt.legend(title="Threads (vertical)")
    
# Execution-time figure for the single-node runs stored in times_scal.
plt.figure(figsize=(0.33*textwidth, figheight))   
plot_times_parallel(times_scal)
# Override the x limits and the title set inside plot_times_parallel.
plt.xlim([1,60])
plt.title("Temps d'exécution (1 noeud Skylake)")
plt.savefig("fig/times_1node_skylake.pdf", transparent=True, bbox_inches='tight')
plt.show()

def plot_speedup_parallel(times_in, ref_time, max_cpus):
    """Plot the speedup ``ref_time / time`` against core count on the current axes.

    Parameters:
        times_in: dict mapping ``(process count, thread count)`` to a time.
        ref_time: numerator of the speedup ratio.
        max_cpus: upper x limit, and end point of the dashed ``y = x`` guide.

    One curve is drawn per thread count, with processes x threads on the
    x axis. Both axes use a base-2 logarithmic scale with integer tick labels.
    """
    # Dashed diagonal: the ideal (linear) speedup.
    plt.plot([1, max_cpus], [1, max_cpus], '--')
    threads = sorted({t for (p, t) in times_in})
    for thread in threads:
        procs = np.array(sorted(p for (p, t) in times_in if t == thread))
        speedups = np.array([ref_time/times_in[p, thread] for p in procs])
        plt.plot(procs*thread, speedups, 'd-', label=str(thread))
    plt.xlim([1, max_cpus])
    # The log base keyword is `base`; the per-axis `basex`/`basey` spellings
    # were deprecated in matplotlib 3.3 and later removed.
    plt.xscale('log', base=2)
    plt.yscale('log', base=2)
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.gca().yaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.title("Speedup")
    plt.xlabel("Nombre de coeurs")
    plt.ylabel("Speedup")
    plt.legend(title="Threads (vertical)")

def plot_mpmd_double():
    """Overlay the MPMD "double" efficiency curve on the current axes.

    Uses the module-level ``times_mpmd`` mapping. Efficiency is the time of
    the smallest process count divided by each time, in percent, and is
    drawn at twice the process count.
    """
    double_times = {count: elapsed
                    for (placement, count), elapsed in times_mpmd.items()
                    if placement == "double"}
    counts = np.array(sorted(double_times))
    elapsed = np.array([double_times[count] for count in counts])
    efficiency = elapsed[0]/elapsed*100
    plt.plot(2*counts, efficiency, '-d', label="mpmd")

def plot_eff_parallel(times_in, time_ref, max_cpus):
    """Plot the parallel efficiency (in percent) against core count on the current axes.

    Parameters:
        times_in: dict mapping ``(process count, thread count)`` to a time.
        time_ref: reference time; the efficiency of a run is
            ``time_ref / time / (processes * threads) * 100``.
        max_cpus: end point of the dashed 100 % guide line.

    One curve is drawn per thread count, with processes x threads on the
    x axis, which uses a base-2 logarithmic scale with integer tick labels.
    """
    # Dashed horizontal line: ideal efficiency.
    plt.plot([1, max_cpus], [100, 100], '--')
    threads = sorted({t for (p, t) in times_in})
    for thread in threads:
        procs = np.array(sorted(p for (p, t) in times_in if t == thread))
        efficiencies = np.array([time_ref/times_in[p, thread]/(p*thread)*100 for p in procs])
        plt.plot(procs*thread, efficiencies, 'd-', label=str(thread))
    plt.ylim(0)
    plt.xlim(1)
    # The log base keyword is `base`; the per-axis `basex` spelling was
    # deprecated in matplotlib 3.3 and later removed.
    plt.xscale('log', base=2)
    plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%i'))
    plt.title("Efficacité")
    plt.xlabel("Nombre de coeurs")
    # Raw string: same text as before, without the invalid "\%" string escape.
    plt.ylabel(r"Efficacité (\%)")
    plt.legend(title="Threads (vertical)")

# Two-panel scalability figure for the single-node runs in times_scal:
# speedup on the left, efficiency on the right, both relative to the
# 1 process x 1 thread time, with 48 as the maximum core count.
plt.figure(figsize=(0.66*textwidth, figheight))
plt.suptitle("Scalabilité (1 noeud Skylake, nbp=40)")
# Speedup
plt.subplot(121)
plot_speedup_parallel(times_scal, times_scal[1,1], 48)
#plt.xscale('linear')
#plt.yscale('linear')
# Efficiency
plt.subplot(122)
plot_eff_parallel(times_scal, times_scal[1,1],48)
# Overlay the MPMD "double" efficiency curve on the same axes for comparison.
plot_mpmd_double()
plt.xlim([1,48])
plt.tight_layout()
plt.savefig("fig/scal_1node_skylake.pdf", transparent=True, bbox_inches='tight')
plt.show()

Observations¶

On observe une efficacité en MPI+OpenMP comparable à celle observable en MPMD, ce qui indique que la performance sur un noeud est limitée par la bande passante mémoire
8 threads sur la verticale -> performance problématique (Piste : les 8 threads sont regroupés sur des coeurs du même socket (= même banc mémoire ?))

Impact du découpage¶

Dynamico est lancé en changeant le découpage split_i et split_j en plus du découpage sur les threads verticaux.

run.def :

<nbp> = 41
<nsplit_i>, <nsplit_j>, <thread_v>= (paramètre)
dt = 720
run_length = 86400

!cat scripts/batch_param.sh

#!/usr/bin/bash
#SBATCH -J Dynamico_scal
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH -A gen0826@skylake 
#SBATCH -p skylake
#SBATCH -o out_scal.o
#SBATCH -e out_scal.e

set -x

export OMP_STACKSIZE=500M

mkdir out
cd out

function start_bench()
{
    split_i=$1
    split_j=$2
    proc=$3
    thread_h=$4
    dir=si${split_i}_sj${split_j}_p${proc}_th${thread_h}
    mkdir $dir
    cd $dir
    sed 's/<thread_h>/'$thread_h'/g' ../../run_scal.def | sed 's/<split_i>/'$split_i'/g' | sed 's/<split_j>/'$split_j'/g' > run.def
    let "c=48 / $proc"
    OMP_NUM_THREADS=$thread_h srun -c $c -n $proc -m block:block ../../icosa_gcm > out_scatter
    OMP_NUM_THREADS=$thread_h srun -c $thread_h -n $proc -m block:block ../../icosa_gcm > out_gather
    cd ..
}


for threads in 1 2 3 4 6 8 12
do
    for split_i in 1 2 3 4 6 8  
    do
	for split_j in 1 2 3 4 6 8
	do
	    let "proc=$split_i * $split_j"
	    start_bench $split_i $split_j $proc $threads
	done
    done
done

Multi-noeuds¶

run.def: paramètres

<nbp>
<nsplit>
<thread_v>
dt = 120
run_length = 864000

!cat scripts/batch_multinode_strong_skylake.sh

set -x

mkdir out

max_nodes=80
let "max_cores=48*$max_nodes"
partition=skylake
export OMP_STACKSIZE=100M

time_limit=2:00:00

function run(){
    nbp=$1
    nsplit=$2
    proc=$3
    thread_v=$4

    dir=out/nbp${nbp}_s${nsplit}_p${proc}_t${thread_v}
    mkdir $dir
    cd $dir

    sed "s/<nsplit>/"$nsplit"/" ../../run_tmpl.def | sed "s/<thread_v>/"$thread_v"/" | sed "s/<nbp>/"$nbp"/" > run.def
    echo -e "#!/usr/bin/bash \n OMP_NUM_THREADS="$thread_v" srun ../../icosa_gcm" | sbatch -o out -e err -J Dyn_scal_$proc --exclusive -t $time_limit -A gen0826@$partition -p $partition -n $proc -c $thread_v --
    
    cd ../..
}

for nbp in 40 80 160 
do
    for thread in 4 6 8 
    do
	for split in 1 2 #4 8 16
	do
	    let "tiles=$split*$split*10"
	    let "requested_cpus=$tiles*$thread"
	    if [ "$requested_cpus" -le "$max_cores" ]; then 
		run $nbp $split $tiles $thread
	    else
		echo "too many tiles : "$tiles
	    fi
	done
done
done

run 80 8 160 6
run 80 8 160 8

run 160 8 160 6
run 160 8 160 8

run 160 16 640 4
run 160 16 640 6
run 160 16 640 8

# Parse the multi-node Skylake timings into times_skylake_multinode, keyed by
# (nbp, split, number of processes, vertical threads) with the elapsed time
# as value.
# The pattern is a raw string so that \d and \s are regex escapes rather than
# (invalid) string escapes, and the file is closed by the `with` block.
# The loop variable names are unchanged: they stay visible at module level
# after the loop.
times_skylake_multinode={}
skylake_multinode_line_re = re.compile(r".*/nbp(\d*)_s(\d*)_p(\d*)_t(\d*)/out:.*:\s*(\d*.\d*)")
with open("data/out_skylake_multinode") as skylake_multinode_file:
    for line in skylake_multinode_file:
        nbp,split,nb_proc,threads_v,time = skylake_multinode_line_re.match(line).groups()
        times_skylake_multinode[int(nbp),int(split),int(nb_proc),int(threads_v)] = float(time)

times_skylake_multinode

{(40, 8, 640, 6): 18.3139,
 (160, 2, 40, 8): 1236.8419,
 (160, 16, 640, 6): 200.8642,
 (160, 8, 640, 4): 215.6057,
 (80, 1, 10, 6): 1559.2913,
 (160, 2, 40, 4): 2310.6745,
 (80, 8, 160, 6): 116.9297,
 (80, 1, 10, 4): 2211.3472,
 (40, 8, 640, 4): 21.344,
 (160, 4, 160, 4): 573.7003,
 (40, 1, 10, 8): 232.6274,
 (40, 2, 40, 4): 93.3042,
 (80, 1, 10, 8): 1154.5988,
 (80, 4, 160, 8): 80.9629,
 (160, 4, 160, 6): 436.6177,
 (160, 8, 640, 6): 183.0899,
 (80, 4, 160, 4): 101.7601,
 (40, 1, 10, 6): 307.2394,
 (80, 8, 160, 8): 100.2801,
 (160, 4, 160, 8): 350.2307,
 (40, 2, 40, 6): 81.7571,
 (40, 2, 40, 8): 70.9212,
 (40, 4, 160, 4): 35.3892,
 (80, 8, 640, 6): 51.0804,
 (80, 2, 40, 6): 331.5392,
 (160, 8, 160, 8): 329.95,
 (40, 4, 160, 6): 33.6143,
 (40, 4, 160, 8): 28.332,
 (160, 16, 640, 8): 109.3143,
 (80, 4, 160, 6): 88.869,
 (80, 8, 640, 4): 42.5942,
 (160, 8, 160, 6): 366.6184,
 (80, 2, 40, 4): 490.0975,
 (80, 2, 40, 8): 282.0337,
 (160, 2, 40, 6): 1612.4732,
 (40, 1, 10, 4): 454.1991,
 (160, 16, 640, 4): 151.0682}

Les valeurs de référence pour le speedup et la scalabilité sont les cas avec le plus petit nombre de CPUs qui remplissent complètement les noeuds (ex : 40 proc x 6 threads = 240 CPUs = 5 noeuds x 2 sockets x 24 CPUs)

def multinode_scal(times_multinodes, arch_name, max_cpus, ref):
    """Plot, save and show one speedup/efficiency figure per resolution.

    Parameters:
        times_multinodes: dict mapping ``(nbp, nsplit, processes, threads)``
            to an elapsed time.
        arch_name: architecture label used in the figure title and in the
            output file name.
        max_cpus: upper bound of the core-count axes.
        ref: dict mapping each ``nbp`` to the ``(processes, threads)`` run
            used as reference. One figure is produced per key, in
            increasing order.

    Each figure is written to ``fig/scal_multinode_<arch_name>_nbp<nbp>.pdf``.

    Raises:
        KeyError: if the reference run of a resolution is not in
            ``times_multinodes``.
    """
    for nbp in sorted(ref):
        # Several runs can share (processes, threads) and differ only by
        # nsplit. Keep the fastest one, so the plotted value does not depend
        # on the iteration order of the input mapping.
        times_nbp = {}
        for (n, s, p, t), v in times_multinodes.items():
            if n != nbp:
                continue
            if (p, t) not in times_nbp or v < times_nbp[p, t]:
                times_nbp[p, t] = v

        (p_first, t_first) = ref[nbp]
        # Reference time scaled by the reference core count: the reference
        # run then sits on the ideal line (speedup equal to its core count,
        # efficiency equal to 100 %).
        time_ref = times_nbp[p_first, t_first]*p_first*t_first

        plt.figure(figsize=(0.66*textwidth, figheight))
        plt.suptitle("Scalabilité ("+arch_name+", nbp="+str(nbp)+")")
        # Speedup
        plt.subplot(121)
        plt.scatter([p_first*t_first], [p_first*t_first], s=100, marker='x', label="Référence", c='r')
        plot_speedup_parallel(times_nbp, time_ref, max_cpus)
        plt.xlim([32, max_cpus])
        plt.ylim([32, max_cpus])
        # Efficiency
        plt.subplot(122)
        plt.scatter([p_first*t_first], [100], s=100, marker='x', label="Référence", c='r')
        plot_eff_parallel(times_nbp, time_ref, max_cpus)
        plt.xlim([32, max_cpus])
        plt.tight_layout()
        plt.savefig("fig/scal_multinode_"+arch_name+"_nbp"+str(nbp)+".pdf", transparent=True, bbox_inches='tight')
        plt.show()

# 40 processes x 6 threads is the first configuration that completely fills
# the nodes, so it is used as the reference run for every resolution.
ref={40:(40,6),80:(40,6),160:(40,6)}
# Alternative reference runs (disabled):
#ref={40:(10,4),80:(10,4),160:(40,4)}
# 120*48 is passed as the maximum core count of the plots.
multinode_scal(times_skylake_multinode, "Skylake", 120*48, ref)

Xeon phi¶

Intra noeud¶

run.def :

<nbp> = 41
<nsplit> = 2
<thread_v>= (paramètre)
dt = 720
run_length = 86400

!cat scripts/batch_knl.sh

#!/usr/bin/bash
#SBATCH -J Dynamico_scal
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH -A gen0826@knl 
#SBATCH -p knl
#SBATCH -o out_scal.o
#SBATCH -e out_scal.e

set -x

export OMP_STACKSIZE=500M

mkdir out

for proc in 1 2 4 8 16 32 64
do
    for thread_h in 1 2 4 8
    do
        dir=p${proc}_th${thread_h}
	sed 's/<thread_h>/'$thread_h'/g' run_scal.def > run.def
	let "c=64 / $proc"
	OMP_NUM_THREADS=$thread_h srun -c $thread_h -n $proc -m block:block ./icosa_gcm > out/${dir}_gather
	OMP_NUM_THREADS=$thread_h srun -c $c -n $proc -m block:block ./icosa_gcm > out/${dir}_scatter

    done
done

# Parse the single-node KNL timings. Each line is matched for a process
# count, a thread count, a distribution name and an elapsed time; the time is
# stored under (processes, threads) in the dict of its distribution.
# The pattern is a raw string so that \d and \s are regex escapes rather than
# (invalid) string escapes, and the file is closed by the `with` block.
# The loop variable names are unchanged: they stay visible at module level.
times_knl_gather={}
times_knl_scatter={}
knl_line_re = re.compile(r".*/p(\d*)_th(\d*)_([^:]*):.*:\s*(\d*.\d*)")
with open("data/out_scal_knl") as knl_file:
    for line in knl_file:
        nb_proc,nb_thread,distrib,time = knl_line_re.match(line).groups()
        if distrib == "scatter":
            times_knl_scatter[int(nb_proc),int(nb_thread)] = float(time)
        elif distrib == "gather":
            times_knl_gather[int(nb_proc),int(nb_thread)] = float(time)
        else:
            print("error : d="+distrib)

# The gather measurements are the ones used by the plots below.
times_knl=times_knl_gather
times_knl_gather,times_knl_scatter

({(1, 1): 329.177,
  (1, 2): 188.1756,
  (1, 4): 104.1642,
  (1, 8): 65.1805,
  (2, 1): 185.6126,
  (2, 2): 96.7113,
  (2, 4): 53.6763,
  (2, 8): 34.4709,
  (4, 1): 94.4728,
  (4, 2): 49.8149,
  (4, 4): 28.5949,
  (4, 8): 16.526,
  (8, 1): 48.5931,
  (8, 2): 29.6327,
  (8, 4): 14.1617,
  (8, 8): 9.301,
  (16, 1): 29.6035,
  (16, 2): 15.3404,
  (16, 4): 8.8587,
  (32, 1): 19.032,
  (32, 2): 10.5154},
 {(1, 1): 328.2685,
  (1, 2): 188.73,
  (1, 4): 104.3708,
  (1, 8): 65.4525,
  (2, 1): 168.4326,
  (2, 2): 96.6988,
  (2, 4): 54.7444,
  (2, 8): 34.2946,
  (4, 1): 86.332,
  (4, 2): 49.9112,
  (4, 4): 28.6898,
  (4, 8): 16.9315,
  (8, 1): 44.3428,
  (8, 2): 25.9841,
  (8, 4): 14.1137,
  (8, 8): 8.7025,
  (16, 1): 30.8142,
  (16, 2): 15.3586,
  (16, 4): 8.8967,
  (16, 8): 8.5454,
  (32, 1): 17.3953,
  (32, 2): 10.4688,
  (32, 4): 9.6146,
  (32, 8): 10.9052})

%matplotlib inline

# Reference for speedup and efficiency: the 1 process x 1 thread run.
time_ref=times_knl[1,1]
max_cpus=64

# Execution times
plt.figure(figsize=(0.33*textwidth, figheight))
plot_times_parallel(times_knl)
plt.show()

# Two-panel scalability figure: speedup on the left, efficiency on the right.
plt.figure(figsize=(0.66*textwidth, figheight))
plt.suptitle("Scalabilité (1 noeud KNL, nbp=40)")
# Speedup
plt.subplot(121)
plot_speedup_parallel(times_knl, time_ref, max_cpus)
plt.xlim([1,max_cpus])
plt.ylim([1,max_cpus])
# Efficiency
plt.subplot(122)
plot_eff_parallel(times_knl,time_ref, max_cpus)
plt.xlim([1,max_cpus])
plt.tight_layout()
# Fixed file name: this is a single-node figure. The previous name was built
# from a module-level `nbp` that this cell never sets, so it depended on
# whatever an earlier cell happened to leave behind.
plt.savefig("fig/scal_1node_knl.pdf", transparent=True, bbox_inches='tight')
plt.show()

Multi-noeuds¶

run.def: paramètres

<nbp>
<nsplit>
<thread_v>
dt = 120
run_length = 864000

!cat scripts/batch_multinode_strong_knl.sh

set -x

mkdir out

max_nodes=80
let "max_cores=64*$max_nodes"
partition=knl
export OMP_STACKSIZE=100M

function run(){
    nbp=$1
    nsplit=$2
    proc=$3
    thread_v=$4
    time_limit=$5

    dir=out/nbp${nbp}_s${nsplit}_p${proc}_t${thread_v}
    mkdir $dir
    cd $dir

    sed "s/<nsplit>/"$nsplit"/" ../../run_tmpl.def | sed "s/<thread_v>/"$thread_v"/" | sed "s/<nbp>/"$nbp"/" > run.def
    echo -e "#!/usr/bin/bash \n OMP_NUM_THREADS="$thread_v" srun ../../icosa_gcm" | sbatch -o out -e err -J ${nbp}_${nsplit}_${thread_v} --exclusive -t $time_limit -A gen0826@$partition -p $partition -n $proc -c $thread_v --
    
    cd ../..
}

run 160 4 160 5 2:00:00

exit 0

time_limit=2:00:00

for nbp in 40 80 160 
do
    for thread in 4 8 
    do
	for split in 1 2 4
	do
	    let "tiles=$split*$split*10"
	    run $nbp $split $tiles $thread $time_limit
	done
done
done

time_limit=10:00

for nbp in 40 80 160
do
    for thread in 4 8
    do
        for split in 8 16
        do
            let "tiles=$split*$split*10"
            let "requested_cpus=$tiles*$thread"
            if [ "$requested_cpus" -le "$max_cores" ]; then
                run $nbp $split $tiles $thread $time_limit
            else
                echo "too many tiles : "$tiles
            fi
        done
done
done

time_limit=10:00

run 80 8 160 4 $time_limit
run 80 8 160 8 $time_limit

run 160 8 160 4 20:00
run 160 8 160 8 $time_limit

run 160 16 640 4 $time_limit
run 160 16 640 8 $time_limit

# Parse the multi-node KNL timings into times_knl_multinode, keyed by
# (nbp, split, number of processes, vertical threads) with the elapsed time
# as value.
# The pattern is a raw string so that \d and \s are regex escapes rather than
# (invalid) string escapes, and the file is closed by the `with` block.
# The loop variable names are unchanged: they stay visible at module level
# after the loop.
times_knl_multinode={}
knl_multinode_line_re = re.compile(r".*/nbp(\d*)_s(\d*)_p(\d*)_t(\d*)/out:.*:\s*(\d*.\d*)")
with open("data/out_knl_multinode") as knl_multinode_file:
    for line in knl_multinode_file:
        nbp,split,nb_proc,threads_v,time = knl_multinode_line_re.match(line).groups()
        times_knl_multinode[int(nbp),int(split),int(nb_proc),int(threads_v)] = float(time)

times_knl_multinode

{(160, 2, 40, 8): 1261.0159,
 (160, 8, 640, 4): 264.6752,
 (160, 2, 40, 4): 2342.1782,
 (80, 1, 10, 4): 1945.2871,
 (40, 8, 640, 4): 68.6094,
 (160, 4, 160, 4): 645.6453,
 (40, 1, 10, 8): 420.7622,
 (40, 2, 40, 4): 257.2646,
 (80, 1, 10, 8): 1121.5099,
 (160, 8, 640, 8): 153.0664,
 (160, 2, 40, 16): 818.5843,
 (80, 4, 160, 8): 147.0689,
 (80, 4, 160, 4): 210.2384,
 (80, 8, 160, 8): 259.6045,
 (160, 4, 160, 16): 354.7174,
 (160, 4, 160, 8): 381.0508,
 (40, 4, 160, 16): 76.5834,
 (80, 2, 40, 16): 292.1051,
 (40, 2, 40, 8): 169.242,
 (40, 4, 160, 4): 133.4371,
 (160, 8, 160, 8): 501.9349,
 (80, 4, 160, 16): 143.3839,
 (40, 4, 160, 8): 86.2381,
 (40, 8, 640, 8): 53.997,
 (160, 16, 640, 8): 257.5874,
 (80, 8, 640, 4): 118.7904,
 (80, 8, 640, 8): 92.4125,
 (40, 1, 10, 16): 329.9034,
 (80, 1, 10, 16): 857.7385,
 (80, 2, 40, 4): 611.6195,
 (40, 2, 40, 16): 139.4846,
 (80, 2, 40, 8): 363.0068,
 (40, 1, 10, 4): 510.0136,
 (160, 16, 640, 4): 351.9924}

Les valeurs de référence pour le speedup et la scalabilité sont les cas avec le plus petit nombre de CPUs qui remplissent complètement les noeuds (ex : 40 proc x 8 threads = 320 CPUs = 5 noeuds x 64 CPUs)

# Average node occupancy: for each thread count, the number of cores used per
# node when processes x threads cores are spread over the smallest number of
# 64-core nodes that can hold them.
thread_count=np.array([4,8,16])
split=np.array([1,2,4,8])
# Number of processes for each split value.
proc_count=split*split*10

for t in thread_count:
    # Cores requested divided by the number of nodes needed to host them.
    util=(proc_count*t)/np.ceil((proc_count*t)/64)
    plt.plot(proc_count*t,util,'-d',label=str(t))
plt.xlim([1,1000])
plt.ylim(0)
plt.ylabel("Nombre de CPUs par noeud")
# The x values are processes x threads, i.e. a core count rather than a
# process count, so the axis is labelled accordingly.
plt.xlabel("Nombre de coeurs")
plt.title("Occupation moyenne des noeuds")
plt.legend(title="Threads")
plt.show()

# Original note: 10 processes x 8 threads or 40 processes x 4 threads are the
# first configurations to fill the nodes (64 threads per KNL node).
# NOTE(review): the reference actually used below is 40 processes x 8 threads
# for every resolution - confirm which statement is intended.
ref={40:(40,8),80:(40,8),160:(40,8)}
# Alternative reference runs (disabled):
#ref={40:(10,4),80:(10,4),160:(40,4)}
# 100*64 is passed as the maximum core count of the plots.
multinode_scal(times_knl_multinode, "KNL", 100*64, ref)

def get_best(times):
    """Print, for each resolution, the fastest run and the cores it used.

    ``times`` maps ``(nbp, nsplit, processes, threads)`` to an elapsed time.
    For every ``nbp`` present in ``times``, in increasing order, one line is
    printed with the core count (processes x threads) of the fastest run,
    the largest core count measured for that ``nbp``, and the fastest time.

    The resolutions are taken from the data instead of a fixed list, so a
    missing resolution is skipped rather than raising on an empty sequence.
    """
    for nbp in sorted({n for (n, _, _, _) in times}):
        # (time, core count) pairs: min() picks the fastest run, and breaks
        # ties in favour of the smallest core count.
        runs = [(times[n, s, p, t], p*t) for (n, s, p, t) in times if n == nbp]
        max_proc = max(cores for (_, cores) in runs)
        (time_min, nb_proc_time_min) = min(runs)
        print("nbp:", nbp, " proc:", nb_proc_time_min, "/", max_proc, " time:", time_min)

# Report the fastest run per resolution for each architecture.
print("knl")
get_best(times_knl_multinode)
print("skylake")
get_best(times_skylake_multinode)

knl
nbp: 40  proc: 5120 / 5120  time: 53.997
nbp: 80  proc: 5120 / 5120  time: 92.4125
nbp: 160  proc: 5120 / 5120  time: 153.0664
skylake
nbp: 40  proc: 3840 / 3840  time: 18.3139
nbp: 80  proc: 2560 / 3840  time: 42.5942
nbp: 160  proc: 5120 / 5120  time: 109.3143