@@ -31,7 +31,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
 
 # Ports for the head node
-PORT=${PORT:-54514}
+PORT=${PORT:-6379}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 # REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
@@ -60,7 +60,7 @@
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
 
 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -101,6 +101,7 @@ rm -f $LOG_DIR/ENDED
 
 # Number of GPUs per node
 gpus_per_node=8
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((gpus_per_node * 16))}
 
 num_retries=1
 
@@ -255,13 +256,12 @@ ray start --head \
     --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
     --dashboard-port=${DASHBOARD_PORT} \
     \
-    --node-manager-port=$(( ${NODE_MANAGER_PORT} + 1 )) \
-    --object-manager-port=$(( ${OBJECT_MANAGER_PORT} + 1 )) \
-    --runtime-env-agent-port=$(( ${RUNTIME_ENV_AGENT_PORT} + 1 )) \
-    --dashboard-agent-grpc-port=$(( ${DASHBOARD_AGENT_GRPC_PORT} + 1 )) \
-    --dashboard-agent-listen-port=$(( ${DASHBOARD_AGENT_LISTEN_PORT} + 1 )) \
-    --metrics-export-port=$(( ${METRICS_EXPORT_PORT} + 1 )) \
-    $RAY_DEBUGGER_ARGS \
+    --node-manager-port=${NODE_MANAGER_PORT} \
+    --object-manager-port=${OBJECT_MANAGER_PORT} \
+    --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
+    --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
+    --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
+    --metrics-export-port=${METRICS_EXPORT_PORT} \
     \
     --block
 EOFINNER
@@ -278,7 +278,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --exclusive --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!
 
 # Wait for the head node container to start and for Ray to be ready
@@ -333,6 +333,7 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
 
 cat <<EOFINNER | tee /launch-worker.sh
+sleep 5
 ray start --address "$ip_head" \
     --disable-usage-stats \
     --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -364,7 +365,7 @@
     if [[ $i -eq 0 ]]; then
         OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
     fi
-    srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exclusive --nodes=1 --ntasks=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+    srun --container-image=nvcr.io/nvidia/nemo:24.01 --no-container-mount-home --mpi=pmix -A=research_account -p=gpu_partition --gres=gpu:8 --container-mounts /data:/data,/models:/models,/nemo_run:/nemo_run,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training:/lustre/fsw/projects/research/jobs/multi-node-training,/lustre/fsw/projects/research/jobs/multi-node-training/logs:/lustre/fsw/projects/research/jobs/multi-node-training/logs --container-workdir=/workspace/training ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
     SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done
@@ -446,4 +447,4 @@
     chmod +x $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh
     echo "bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh"
     sleep infinity
-fi
+fi