
Commit 038956e

Revert "fix ray templates by using --exclusive to launch ray nodes (#380) (#384)
1 parent f587ae0 commit 038956e

File tree

5 files changed: +42, -39 lines

nemo_run/run/ray/templates/ray.sub.j2
nemo_run/run/ray/templates/ray_enroot.sub.j2
test/core/execution/artifacts/expected_ray_cluster.sub
test/core/execution/artifacts/expected_ray_cluster_enroot.sub
test/core/execution/artifacts/expected_ray_cluster_ssh.sub

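In short, the revert drops Slurm's whole-node --exclusive flag from every Ray srun step and goes back to explicit CPU sizing (--cpus-per-task, plus --exact on workers), restores the earlier port defaults (PORT 6379, MAX_WORKER_PORT 54257, no +1 offsets on the head node's agent ports), and adds a short sleep before each worker joins. A minimal before/after sketch of the head-node launch, where $COMMON_SRUN_ARGS is a hypothetical placeholder standing in for the templated {{ common_srun_args }} and the log path is shortened:

# With the reverted change (#380) in place: reserve the whole node for the step
srun $COMMON_SRUN_ARGS --container-name=ray-head --exclusive --nodes=1 --ntasks=1 \
    -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &

# After this revert: size the step by CPUs instead of taking the node exclusively
CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}   # e.g. 128 CPUs with 8 GPUs per node
srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER \
    -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &

The per-file hunks below show the same pattern applied to the Jinja templates and the expected-output test artifacts.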

nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 12 additions & 11 deletions
@@ -28,7 +28,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}

 # Ports for the head node
-PORT=${PORT:-54514}
+PORT=${PORT:-6379}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
@@ -57,7 +57,7 @@ fi
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}

 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -98,6 +98,7 @@ rm -f $LOG_DIR/ENDED

 # Number of GPUs per node
 gpus_per_node=8
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}

 num_retries={{ num_retries }}

@@ -256,13 +257,12 @@ ray start --head \
     --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
     --dashboard-port=${DASHBOARD_PORT} \
     \
-    --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
-    --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
-    --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
-    --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
-    --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
-    --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
-    $RAY_DEBUGGER_ARGS \
+    --node-manager-port=${NODE_MANAGER_PORT} \
+    --object-manager-port=${OBJECT_MANAGER_PORT} \
+    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
+    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
+    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
+    --metrics-export-port=${METRICS_EXPORT_PORT} \
     \
     --block
 EOFINNER
@@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Wait for the head node container to start and for Ray to be ready
@@ -336,6 +336,7 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py

 cat <<EOFINNER | tee /launch-worker.sh
+sleep 5
 ray start --address "$ip_head" \
     --disable-usage-stats \
     --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -367,7 +368,7 @@ EOF
 if [[ $i -eq 0 ]]; then
   OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done
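For context on the numbers above, a small sketch restating the arithmetic the reverted defaults encode; the 16-CPUs-per-GPU ratio and the 257-port worker range are template defaults taken from the hunks, not hard Ray or Slurm requirements:

gpus_per_node=8
# Default CPUs requested per Ray node step: 8 GPUs * 16 CPUs/GPU = 128 (overridable via CPUS_PER_WORKER)
CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}
# Reverted worker port range: 54001..54257 leaves 257 ports for Ray worker processes
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
echo "cpus_per_worker=$CPUS_PER_WORKER worker_ports=$((MAX_WORKER_PORT - MIN_WORKER_PORT + 1))"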

nemo_run/run/ray/templates/ray_enroot.sub.j2

Lines changed: 2 additions & 2 deletions
@@ -279,7 +279,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun {{ common_srun_args }} --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -380,7 +380,7 @@ EOF
 if [[ $i -eq 0 ]]; then
   OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 13 additions & 12 deletions
@@ -30,7 +30,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}

 # Ports for the head node
-PORT=${PORT:-54514}
+PORT=${PORT:-6379}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
@@ -59,7 +59,7 @@ fi
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}

 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -100,6 +100,7 @@ rm -f $LOG_DIR/ENDED

 # Number of GPUs per node
 gpus_per_node=8
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}

 num_retries=1

@@ -250,13 +251,12 @@ ray start --head \
     --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
     --dashboard-port=${DASHBOARD_PORT} \
     \
-    --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
-    --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
-    --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
-    --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
-    --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
-    --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
-    $RAY_DEBUGGER_ARGS \
+    --node-manager-port=${NODE_MANAGER_PORT} \
+    --object-manager-port=${OBJECT_MANAGER_PORT} \
+    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
+    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
+    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
+    --metrics-export-port=${METRICS_EXPORT_PORT} \
     \
     --block
 EOFINNER
@@ -273,7 +273,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Wait for the head node container to start and for Ray to be ready
@@ -326,6 +326,7 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py

 cat <<EOFINNER | tee /launch-worker.sh
+sleep 5
 ray start --address "$ip_head" \
     --disable-usage-stats \
     --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -357,7 +358,7 @@ EOF
 if [[ $i -eq 0 ]]; then
   OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done
@@ -439,4 +440,4 @@ EOF
 chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh
 echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh"
 sleep infinity
-fi
+fi

test/core/execution/artifacts/expected_ray_cluster_enroot.sub

Lines changed: 2 additions & 2 deletions
@@ -273,7 +273,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -370,7 +370,7 @@ EOF
 if [[ $i -eq 0 ]]; then
   OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done

test/core/execution/artifacts/expected_ray_cluster_ssh.sub

Lines changed: 13 additions & 12 deletions
@@ -31,7 +31,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}

 # Ports for the head node
-PORT=${PORT:-54514}
+PORT=${PORT:-6379}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
@@ -60,7 +60,7 @@ fi
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}

 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -101,6 +101,7 @@ rm -f $LOG_DIR/ENDED

 # Number of GPUs per node
 gpus_per_node=8
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}

 num_retries=1

@@ -255,13 +256,12 @@ ray start --head \
     --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
     --dashboard-port=${DASHBOARD_PORT} \
     \
-    --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
-    --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
-    --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
-    --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
-    --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
-    --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
-    $RAY_DEBUGGER_ARGS \
+    --node-manager-port=${NODE_MANAGER_PORT} \
+    --object-manager-port=${OBJECT_MANAGER_PORT} \
+    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
+    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
+    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
+    --metrics-export-port=${METRICS_EXPORT_PORT} \
     \
     --block
 EOFINNER
@@ -278,7 +278,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!

 # Wait for the head node container to start and for Ray to be ready
@@ -333,6 +333,7 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py

 cat <<EOFINNER | tee /launch-worker.sh
+sleep 5
 ray start --address "$ip_head" \
     --disable-usage-stats \
     --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -364,7 +365,7 @@ EOF
 if [[ $i -eq 0 ]]; then
   OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
 fi
-srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done
@@ -446,4 +447,4 @@ EOF
 chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh
 echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh"
 sleep infinity
-fi
+fi
