5 files changed: +25 −16 lines changed. Changed paths include examples/wide_ep/slurm_scripts and tests/integration/test_lists.
CODEOWNERS:

@@ -1,10 +1,5 @@
# This file defines code ownership rules for the repository.

- # The following rule should only be uncommented on release branches (e.g., release/0.19).
- # The rule below requires that any PR to release/**/* branches must be approved by at least one member
- # of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
- # Without approval from a member of this team, PRs cannot be merged to release branches.
- * @NVIDIA/trt-llm-release-branch-approval

# TensorRT-LLM Pytorch backend
/tensorrt_llm/_torch @NVIDIA/trt-llm-torch-devs

@@ -155,3 +150,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
# from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
/tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
/tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+ # The following rule should only be uncommented on release branches (e.g., release/0.19).
+ # The rule below requires that any PR to release/**/* branches must be approved by at least one member
+ # of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+ # Without approval from a member of this team, PRs cannot be merged to release branches.
+ * @NVIDIA/trt-llm-release-branch-approval
examples/wide_ep/slurm_scripts:

@@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint

mtp_size=0
- ntasks_per_node=4 # 4 GPUs per GB200 node
+ ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node

isl=1024
osl=1024

@@ -22,8 +22,9 @@ streaming=true
for b in 1 64 1024; do
    for eplb_num_slots in 0 256 288; do
        concurrency=$((b * 16))
-       ctx_num=$(((concurrency + 5499)/5500))
-       total_node_num=$((ctx_num + 4))
+       ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+       ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+       total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
        ntasks=$((total_node_num * ntasks_per_node))

        args=(

@@ -56,8 +57,9 @@
# dep32 eplb288
for b in 512; do
    concurrency=$((b * 32))
-   ctx_num=$(((concurrency + 5499)/5500))
-   total_node_num=$((ctx_num + 8))
+   ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+   ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+   total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
    ntasks=$((total_node_num * ntasks_per_node))
    eplb_num_slots=288

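The hunks above fold GB200 and B200 sizing into one script via inline comments. As a quick cross-check, the standalone sketch below (mine, not part of the PR; `plan_nodes` is a hypothetical helper) reproduces that arithmetic under the reading that a 4-GPU GB200 context node absorbs roughly 5500 concurrent requests (an 8-GPU B200 node roughly 11000), one context server spans 4 GPUs, and the generation side always takes 16 GPUs (4 GB200 nodes or 2 B200 nodes).

```python
import math


def plan_nodes(concurrency: int, gpus_per_node: int) -> dict:
    """Hypothetical helper mirroring the node arithmetic in the sweep above.

    Assumptions (mine): ~5500 concurrent requests per 4-GPU GB200 context node
    (~11000 per 8-GPU B200 node), one context server per 4 GPUs, and a fixed
    16-GPU generation pool.
    """
    if gpus_per_node == 4:  # GB200
        ctx_node_num = math.ceil(concurrency / 5500)
        ctx_num = ctx_node_num          # one 4-GPU context server per node
        gen_node_num = 4
    else:                               # B200, 8 GPUs per node
        ctx_node_num = math.ceil(concurrency / 11000)
        ctx_num = ctx_node_num * 2      # two 4-GPU context servers per node
        gen_node_num = 2
    total_node_num = ctx_node_num + gen_node_num
    return {
        "ctx_num": ctx_num,
        "total_node_num": total_node_num,
        "ntasks": total_node_num * gpus_per_node,
    }


# b=64 in the dep16 loop gives concurrency = 64 * 16 = 1024.
print(plan_nodes(1024, gpus_per_node=4))  # {'ctx_num': 1, 'total_node_num': 5, 'ntasks': 20}
print(plan_nodes(1024, gpus_per_node=8))  # {'ctx_num': 2, 'total_node_num': 3, 'ntasks': 24}
```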
@@ -182,6 +182,9 @@ def _forward_nope(
            attention_mask=attention_mask,
            mrope_config=mrope_config)

+       if isinstance(attn_output, tuple):
+           attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
+
        attn_output = self.o_proj(attn_output,
                                  all_reduce_params=all_reduce_params)

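To make the new branch's intent concrete, here is a rough sketch under the assumption that the attention backend can return its FP4-quantized output as a (packed data, block scales) tuple. `Fp4QuantizedTensorSketch` below is a hypothetical two-field stand-in used only for illustration; it is not the project's actual `Fp4QuantizedTensor` definition.

```python
from dataclasses import dataclass

import torch


@dataclass
class Fp4QuantizedTensorSketch:
    """Hypothetical stand-in: packed FP4 payload plus its block scaling factors."""
    fp4_tensor: torch.Tensor
    scaling_factor: torch.Tensor


# The attention call may hand back either a plain tensor or a (data, scales) pair.
attn_output = (torch.zeros(8, 64, dtype=torch.uint8), torch.ones(8, 4))
if isinstance(attn_output, tuple):
    # Wrap the pair so the following o_proj projection receives a single object.
    attn_output = Fp4QuantizedTensorSketch(attn_output[0], attn_output[1])
print(type(attn_output).__name__)  # Fp4QuantizedTensorSketch
```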
@@ -234,10 +234,11 @@
    help="Path where per request information is written to.",
)
@optgroup.option(
-   "--enable_chunked_context/--disable_chunked_context",
-   default=True,
-   help=
-   "Enable/disable chunking in prefill stage for enhanced throughput benchmark. "
+   "--enable_chunked_context",
+   is_flag=True,
+   default=None,
+   help="Enable chunking in prefill stage for enhanced throughput benchmark. "
+   "Default is False for PyTorch/AutoDeploy backend, True for TensorRT backend.",
)
@optgroup.option(
    "--scheduler_policy",

@@ -348,8 +349,11 @@ def throughput_command(
    kv_cache_percent = params.get("kv_cache_free_gpu_mem_fraction")
    beam_width = params.get("beam_width")
    streaming: bool = params.get("streaming")
-   enable_chunked_context: bool = params.get("enable_chunked_context")
    scheduler_policy: str = params.get("scheduler_policy")
+   enable_chunked_context: bool = params.get("enable_chunked_context")
+   if enable_chunked_context is None:
+       # Set default based on backend: True for TensorRT, False for others
+       enable_chunked_context = backend.lower() == "tensorrt"

    # Update configuration with runtime options
    exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
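Reduced to a standalone sketch, the pattern above is a flag whose unset state (None) is distinguishable from an explicit True, so the effective default can be chosen per backend at runtime. The sketch uses plain click rather than the benchmark's option groups, and the `bench` command and its options are illustrative, not the real CLI.

```python
import click


@click.command()
@click.option("--backend", default="pytorch",
              help="Illustrative backend name: pytorch, autodeploy, or tensorrt.")
@click.option("--enable_chunked_context", is_flag=True, default=None,
              help="Enable chunked prefill; leave unset to use the backend default.")
def bench(backend, enable_chunked_context):
    """Toy command showing the tri-state flag resolution."""
    if enable_chunked_context is None:
        # Flag not given on the command line: only TensorRT defaults to True.
        enable_chunked_context = backend.lower() == "tensorrt"
    click.echo(f"backend={backend} enable_chunked_context={enable_chunked_context}")


if __name__ == "__main__":
    bench()
```

One consequence of the new interface: chunking can only be enabled explicitly or left at the backend default; the old --disable_chunked_context form is no longer accepted.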
tests/integration/test_lists:

@@ -243,7 +243,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-re
examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414)
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)
- test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5410399)
unittest/trt/attention/test_gpt_attention.py -k "partition0" SKIP (https://nvbugs/5412456)
unittest/trt/attention/test_gpt_attention.py -k "partition1" SKIP (https://nvbugs/5412456)