Skip to content

Commit 33fce8e

Browse files
authored
[https://nvbugs/5405041][fix] Update wide ep doc (#6950)
Signed-off-by: Xianjie <[email protected]>
1 parent 550faa9 commit 33fce8e

File tree

1 file changed

+7
-5
lines changed

1 file changed

+7
-5
lines changed

examples/wide_ep/slurm_scripts/submit.sh

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
1111
model_dir=<model_dir> # Path to the model checkpoint
1212

1313
mtp_size=0
14-
ntasks_per_node=4 # 4 GPUs per GB200 node
14+
ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node
1515

1616
isl=1024
1717
osl=1024
@@ -22,8 +22,9 @@ streaming=true
2222
for b in 1 64 1024; do
2323
for eplb_num_slots in 0 256 288; do
2424
concurrency=$((b * 16))
25-
ctx_num=$(((concurrency + 5499)/5500))
26-
total_node_num=$((ctx_num + 4))
25+
ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
26+
ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
27+
total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
2728
ntasks=$((total_node_num * ntasks_per_node))
2829

2930
args=(
@@ -56,8 +57,9 @@ done
5657
# dep32 eplb288
5758
for b in 512; do
5859
concurrency=$((b * 32))
59-
ctx_num=$(((concurrency + 5499)/5500))
60-
total_node_num=$((ctx_num + 8))
60+
ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
61+
ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
62+
total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
6163
ntasks=$((total_node_num * ntasks_per_node))
6264
eplb_num_slots=288
6365

0 commit comments

Comments
 (0)