#!/bin/bash
#SBATCH --job-name=nsight_systems    # Arbitrary name of the Slurm job
#SBATCH --output=%x.%j.out           # Standard output file of the job
#SBATCH --error=%x.%j.err            # Standard error file of the job
#SBATCH --ntasks=4                   # Number of MPI processes requested
#SBATCH --ntasks-per-node=4          # Number of MPI tasks per node (= number of GPUs per node)
#SBATCH --gres=gpu:4                 # Number of GPUs per node
#SBATCH --cpus-per-task=10           # Number of CPU cores per task (a quarter of the node here)
# /!\ Important: the following line is misleading, but in Slurm vocabulary
# "multithread" refers to hyperthreading.
#SBATCH --hint=nomultithread         # 1 MPI process per physical core (no hyperthreading)
#SBATCH --time=00:20:00              # Job time hh:mm:ss (20 min here)

# Load the modules of your choice
module load ...

# Load Nsight Systems
module load nvidia-nsight-systems/2021.2.1

# Echo the commands
set -x

# Do not use /tmp
export TMPDIR=$JOBSCRATCH

# To circumvent a bug in the current versions of Nsight Systems,
# it is also necessary to create a symbolic link so that the
# /tmp/nvidia directory points to TMPDIR
ln -s $JOBSCRATCH /tmp/nvidia

# Profiling in OpenACC mode with the generation of one results file
# per process ("report_rank0.qdrep", "report_rank1.qdrep", etc.)
srun nsys profile -t openacc -o "report_rank%q{SLURM_PROCID}" ./my_bin_exe
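
# Once the job has finished, the per-rank reports can be inspected.
# A minimal sketch, assuming the same nvidia-nsight-systems module is
# available where you run these commands (report names follow the -o
# pattern used above):
#
#   # Open one rank's report in the Nsight Systems GUI
#   nsys-ui report_rank0.qdrep
#
#   # Or print a command-line summary of the collected trace
#   nsys stats report_rank0.qdrep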