Skip to content

Commit a8654a3

Browse files
committed
refactor: had to change module loading due to looping over configs
1 parent 266788a commit a8654a3

File tree

1 file changed

+28
-20
lines changed
  • tutorials/scaling_up/scripts/hpc/leonardo

1 file changed

+28
-20
lines changed

tutorials/scaling_up/scripts/hpc/leonardo/job.sbatch

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,6 @@
99
#SBATCH --gres=gpu:4
1010
#SBATCH --exclusive
1111

12-
# Enable logging
13-
set -x # Every command and its arguments are printed to stderr before being executed.
14-
15-
# Setup environment
16-
module load cuda/12.3
17-
module load gcc/12.2.0
18-
module load binutils/2.42
19-
source /leonardo_scratch/fast/EUHPC_D21_101/max_lue/python_envs/working/leonardo_modalities/bin/activate # TODO replace "working" with "stable" if needed
20-
2112
#### Environment variables
2213
export CXX=g++
2314
export CC=gcc
@@ -34,6 +25,14 @@ export GLOO_SOCKET_IFNAME=ib0
3425
export CUDA_VISIBLE_DEVICES=0,1,2,3
3526
export NCCL_ASYNC_ERROR_HANDLING=1
3627

28+
# Enable logging
29+
set -x # Every command and its arguments are printed to stderr before being executed.
30+
31+
# Setup environment
32+
module load cuda/12.3
33+
source /leonardo_scratch/fast/EUHPC_D21_101/max_lue/python_envs/working/leonardo_modalities/bin/activate # TODO replace "working" with "stable" if needed
34+
35+
3736
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
3837
MASTER_PORT=6015
3938

@@ -49,7 +48,8 @@ modalities benchmark list_remaining_runs \
4948
--experiment_dir "$EXPERIMENT_ROOT/$NUM_RANKS" \
5049
--file_list_path "$CONFIG_LIST_FILE" \
5150
--expected_steps "$EXPECTED_STEPS" \
52-
--skip_exception_types "OutOfMemoryError,ValueError"
51+
--skip_exception_types "OutOfMemoryError,ValueError" \
52+
--new_folders_for_remaining
5353

5454

5555
# Step 2: Read configs into Bash array and loop
@@ -59,16 +59,24 @@ for config_file in "${config_files[@]}"; do
5959
echo "Processing config: $config_file"
6060
error_log_folder="$(dirname "$config_file")/error_logs"
6161

62-
srun torchrun \
63-
--rdzv-endpoint "$MASTER_ADDR:$MASTER_PORT" \
64-
--nnodes "$SLURM_JOB_NUM_NODES" \
65-
--nproc_per_node "$GPUS_PER_NODE" \
66-
--rdzv_backend c10d \
67-
--rdzv_conf join_timeout=120 \
68-
"$(which modalities)" run \
69-
--config_file_path "$config_file" \
70-
--experiment_id "" \
71-
--error_log_folder "$error_log_folder"
62+
# Launch one torchrun job step for the current config.
#
# The inner script is single-quoted, so the batch shell expands nothing inside
# it. Every value it needs is handed over as a positional parameter:
#   $1 MASTER_ADDR          $2 MASTER_PORT
#   $3 SLURM_JOB_NUM_NODES  $4 GPUS_PER_NODE
#   $5 config_file          $6 error_log_folder
# Passing values this way (instead of splicing them into the script text)
# keeps paths intact when they contain spaces or shell metacharacters and
# prevents them from being re-parsed as shell code by the inner bash.
# A failing step is reported and the surrounding loop moves on to the next
# config.
srun --exclusive bash -c '
    # Rebuild the environment inside the job step. Sourcing /etc/profile is
    # presumably what makes the module command available here (confirm on the
    # target cluster); USER falls back to whoami when it is not set.
    source /etc/profile
    export USER=${USER:-$(whoami)}
    module load cuda/12.3
    source /leonardo_scratch/fast/EUHPC_D21_101/max_lue/python_envs/working/leonardo_modalities/bin/activate

    # Strict mode is switched on only after the environment scripts ran,
    # matching the original ordering.
    set -euo pipefail

    # The modalities entry point is looked up here, after activation, so the
    # path comes from the environment the step actually runs in.
    torchrun \
        --rdzv-endpoint "$1:$2" \
        --nnodes "$3" \
        --nproc_per_node "$4" \
        --rdzv_backend c10d \
        --rdzv_conf join_timeout=120 \
        "$(command -v modalities)" run \
        --config_file_path "$5" \
        --experiment_id "" \
        --error_log_folder "$6"
' _ \
    "$MASTER_ADDR" \
    "$MASTER_PORT" \
    "$SLURM_JOB_NUM_NODES" \
    "$GPUS_PER_NODE" \
    "$config_file" \
    "$error_log_folder" \
    || echo "Config $config_file failed, continuing..."
7280
done
7381

7482
echo "END TIME: $(date)"

0 commit comments

Comments
 (0)