File tree: 1 file changed, +7 -5 lines changed
examples/wide_ep/slurm_scripts: 1 file changed, +7 -5 lines changed
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
11
11
model_dir=<model_dir> # Path to the model checkpoint
12
12
13
13
mtp_size=0
14
- ntasks_per_node=4 # 4 GPUs per GB200 node
14
+ ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node
15
15
16
16
isl=1024
17
17
osl=1024
@@ -22,8 +22,9 @@ streaming=true
22
22
for b in 1 64 1024; do
23
23
for eplb_num_slots in 0 256 288; do
24
24
concurrency=$(( b * 16 ))
25
- ctx_num=$(( (concurrency + 5499 )/ 5500 ))
26
- total_node_num=$(( ctx_num + 4 ))
25
+ ctx_node_num=$(( (concurrency + 5499 )/ 5500 )) # $(((concurrency + 10999)/11000)) for B200
26
+ ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
27
+ total_node_num=$(( ctx_node_num + 4 )) # $((ctx_node_num + 2)) for B200
27
28
ntasks=$(( total_node_num * ntasks_per_node))
28
29
29
30
args=(
56
57
# dep32 eplb288
57
58
for b in 512; do
58
59
concurrency=$(( b * 32 ))
59
- ctx_num=$(( (concurrency + 5499 )/ 5500 ))
60
- total_node_num=$(( ctx_num + 8 ))
60
+ ctx_node_num=$(( (concurrency + 5499 )/ 5500 )) # $(((concurrency + 10999)/11000)) for B200
61
+ ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
62
+ total_node_num=$(( ctx_node_num + 8 )) # $((ctx_node_num + 4)) for B200
61
63
ntasks=$(( total_node_num * ntasks_per_node))
62
64
eplb_num_slots=288
63
65
You can’t perform that action at this time.
0 commit comments