
Commit 9743e5f

Merge branch 'release/1.0' into user/bo/release/1.0/fix-nixl-tests
2 parents e1e0e71 + 33fce8e commit 9743e5f


5 files changed, +25 -16 lines changed

.github/CODEOWNERS

Lines changed: 6 additions & 5 deletions
@@ -1,10 +1,5 @@
 # This file defines code ownership rules for the repository.
 
-# The following rule should only be uncommented on release branches (e.g., release/0.19).
-# The rule below requires that any PR to release/**/* branches must be approved by at least one member
-# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
-# Without approval from a member of this team, PRs cannot be merged to release branches.
-* @NVIDIA/trt-llm-release-branch-approval
 
 # TensorRT-LLM Pytorch backend
 /tensorrt_llm/_torch @NVIDIA/trt-llm-torch-devs
@@ -155,3 +150,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 # from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
 /tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
 /tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+# The following rule should only be uncommented on release branches (e.g., release/0.19).
+# The rule below requires that any PR to release/**/* branches must be approved by at least one member
+# of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+# Without approval from a member of this team, PRs cannot be merged to release branches.
+* @NVIDIA/trt-llm-release-branch-approval

examples/wide_ep/slurm_scripts/submit.sh

Lines changed: 7 additions & 5 deletions
@@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
 model_dir=<model_dir> # Path to the model checkpoint
 
 mtp_size=0
-ntasks_per_node=4 # 4 GPUs per GB200 node
+ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node
 
 isl=1024
 osl=1024
@@ -22,8 +22,9 @@ streaming=true
 for b in 1 64 1024; do
     for eplb_num_slots in 0 256 288; do
         concurrency=$((b * 16))
-        ctx_num=$(((concurrency + 5499)/5500))
-        total_node_num=$((ctx_num + 4))
+        ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+        ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+        total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
         ntasks=$((total_node_num * ntasks_per_node))
 
         args=(
@@ -56,8 +57,9 @@ done
 # dep32 eplb288
 for b in 512; do
     concurrency=$((b * 32))
-    ctx_num=$(((concurrency + 5499)/5500))
-    total_node_num=$((ctx_num + 8))
+    ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+    ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+    total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
     ntasks=$((total_node_num * ntasks_per_node))
     eplb_num_slots=288
 
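
Note: the updated script distinguishes the number of context nodes (ctx_node_num) from the number of context servers (ctx_num), since a B200 node carries 8 GPUs versus 4 on a GB200 node. Below is a minimal Python sketch of the same arithmetic, using the offsets from the first loop; the divisors, GPU counts, and offsets come from the script, while the function name and return layout are illustrative only.

# Minimal sketch (not part of the commit) of the node-count arithmetic above.
# Ceiling division (x + d - 1) // d mirrors the shell expression $(((concurrency + 5499)/5500)).

def plan_nodes(concurrency: int, platform: str) -> dict:
    if platform == "GB200":
        # 4 GPUs per node; divisor 5500 as in the script
        ctx_node_num = (concurrency + 5499) // 5500
        ctx_num = ctx_node_num
        total_node_num = ctx_node_num + 4
        ntasks_per_node = 4
    else:  # B200 variant, per the script's comments
        # 8 GPUs per node; divisor 11000 as in the script's B200 comment
        ctx_node_num = (concurrency + 10999) // 11000
        ctx_num = ctx_node_num * 2
        total_node_num = ctx_node_num + 2
        ntasks_per_node = 8
    return {
        "ctx_node_num": ctx_node_num,
        "ctx_num": ctx_num,
        "total_node_num": total_node_num,
        "ntasks": total_node_num * ntasks_per_node,
    }

# b=1024 in the first loop gives concurrency = 1024 * 16 = 16384
print(plan_nodes(16384, "GB200"))  # ctx_node_num=3, ctx_num=3, total_node_num=7, ntasks=28
print(plan_nodes(16384, "B200"))   # ctx_node_num=2, ctx_num=4, total_node_num=4, ntasks=32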

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 3 additions & 0 deletions
@@ -182,6 +182,9 @@ def _forward_nope(
             attention_mask=attention_mask,
             mrope_config=mrope_config)
 
+        if isinstance(attn_output, tuple):
+            attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
+
         attn_output = self.o_proj(attn_output,
             all_reduce_params=all_reduce_params)
 
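
Note: the added check covers attention paths that return a (quantized data, scale) tuple instead of a plain tensor, re-wrapping the pair before it reaches o_proj. A minimal, self-contained sketch of that pattern follows; the stand-in container and its field names are illustrative, not the actual Fp4QuantizedTensor from tensorrt_llm.

# Illustrative only: a stand-in container mirroring the tuple-unpacking pattern
# used in _forward_nope. The real code uses tensorrt_llm's Fp4QuantizedTensor.
from dataclasses import dataclass

import torch


@dataclass
class Fp4QuantizedTensorSketch:
    fp4_tensor: torch.Tensor      # packed FP4 payload (assumed field name)
    scaling_factor: torch.Tensor  # block scale factors (assumed field name)


def wrap_if_quantized(attn_output):
    # Some attention backends return (quantized_data, scales); others return a plain tensor.
    if isinstance(attn_output, tuple):
        return Fp4QuantizedTensorSketch(attn_output[0], attn_output[1])
    return attn_output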

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 9 additions & 5 deletions
@@ -234,10 +234,11 @@
     help="Path where per request information is written to.",
 )
 @optgroup.option(
-    "--enable_chunked_context/--disable_chunked_context",
-    default=True,
-    help=
-    "Enable/disable chunking in prefill stage for enhanced throughput benchmark. "
+    "--enable_chunked_context",
+    is_flag=True,
+    default=None,
+    help="Enable chunking in prefill stage for enhanced throughput benchmark. "
+    "Default is False for PyTorch/AutoDeploy backend, True for TensorRT backend.",
 )
 @optgroup.option(
     "--scheduler_policy",
@@ -348,8 +349,11 @@ def throughput_command(
     kv_cache_percent = params.get("kv_cache_free_gpu_mem_fraction")
     beam_width = params.get("beam_width")
     streaming: bool = params.get("streaming")
-    enable_chunked_context: bool = params.get("enable_chunked_context")
     scheduler_policy: str = params.get("scheduler_policy")
+    enable_chunked_context: bool = params.get("enable_chunked_context")
+    if enable_chunked_context is None:
+        # Set default based on backend: True for TensorRT, False for others
+        enable_chunked_context = backend.lower() == "tensorrt"
 
     # Update configuration with runtime options
     exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
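
Note: the option becomes a tri-state flag. It stays None unless the user passes --enable_chunked_context, and the command later resolves the default from the backend name. Below is a minimal sketch of the same pattern with plain click (optgroup.option from click-option-group forwards the same keyword arguments); the bench command and the --backend option are illustrative, not the real CLI.

# Illustrative sketch of the tri-state flag pattern used in throughput.py.
import click


@click.command()
@click.option(
    "--enable_chunked_context",
    is_flag=True,
    default=None,  # None means "not specified on the command line"
    help="Enable chunking in prefill stage for enhanced throughput benchmark.",
)
@click.option("--backend", default="pytorch")
def bench(enable_chunked_context, backend):
    if enable_chunked_context is None:
        # Resolve the default per backend: True for TensorRT, False for others
        enable_chunked_context = backend.lower() == "tensorrt"
    click.echo(f"enable_chunked_context={enable_chunked_context}")


if __name__ == "__main__":
    bench()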

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -243,7 +243,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-re
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414)
 test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)
-test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
 llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5410399)
 unittest/trt/attention/test_gpt_attention.py -k "partition0" SKIP (https://nvbugs/5412456)
 unittest/trt/attention/test_gpt_attention.py -k "partition1" SKIP (https://nvbugs/5412456)
