5 files changed: +25 −16 lines changed. Changed paths include examples/wide_ep/slurm_scripts and tests/integration/test_lists.
CODEOWNERS:

@@ -1,10 +1,5 @@
# This file defines code ownership rules for the repository.

- # The following rule should only be uncommented on release branches (e.g., release/0.19).
- # The rule below requires that any PR to release/**/* branches must be approved by at least one member
- # of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
- # Without approval from a member of this team, PRs cannot be merged to release branches.
- * @NVIDIA/trt-llm-release-branch-approval

# TensorRT-LLM Pytorch backend
/tensorrt_llm/_torch @NVIDIA/trt-llm-torch-devs

@@ -155,3 +150,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
# from a member of this team, PRs affecting public APIs cannot be merged to main or release branches.
/tests/unittest/api_stability/ @NVIDIA/trt-llm-noncommitted-api-review-committee
/tests/unittest/api_stability/references_committed/ @NVIDIA/trt-llm-committed-api-review-committee
+
+ # The following rule should only be uncommented on release branches (e.g., release/0.19).
+ # The rule below requires that any PR to release/**/* branches must be approved by at least one member
+ # of the NVIDIA/trt-llm-release-branch-approval team, regardless of who else approves the PR.
+ # Without approval from a member of this team, PRs cannot be merged to release branches.
+ * @NVIDIA/trt-llm-release-branch-approval
examples/wide_ep/slurm_scripts:

@@ -11,7 +11,7 @@ workdir=<workdir> # Path to disaggr_torch.slurm
model_dir=<model_dir> # Path to the model checkpoint

mtp_size=0
- ntasks_per_node=4 # 4 GPUs per GB200 node
+ ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node

isl=1024
osl=1024

@@ -22,8 +22,9 @@ streaming=true
for b in 1 64 1024; do
    for eplb_num_slots in 0 256 288; do
        concurrency=$((b * 16))
-       ctx_num=$(((concurrency + 5499)/5500))
-       total_node_num=$((ctx_num + 4))
+       ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+       ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+       total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
        ntasks=$((total_node_num * ntasks_per_node))

        args=(

@@ -56,8 +57,9 @@
# dep32 eplb288
for b in 512; do
    concurrency=$((b * 32))
-   ctx_num=$(((concurrency + 5499)/5500))
-   total_node_num=$((ctx_num + 8))
+   ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
+   ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
+   total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
    ntasks=$((total_node_num * ntasks_per_node))
    eplb_num_slots=288

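The hunks above fold GB200 and B200 sizing into one script via inline comments. As a quick cross-check, the standalone sketch below (mine, not part of the PR; `plan_nodes` is a hypothetical helper) reproduces that arithmetic under the reading that a 4-GPU GB200 context node absorbs roughly 5500 concurrent requests (an 8-GPU B200 node roughly 11000), one context server spans 4 GPUs, and the generation side always takes 16 GPUs (4 GB200 nodes or 2 B200 nodes).

```python
import math


def plan_nodes(concurrency: int, gpus_per_node: int) -> dict:
    """Hypothetical helper mirroring the node arithmetic in the sweep above.

    Assumptions (mine): ~5500 concurrent requests per 4-GPU GB200 context node
    (~11000 per 8-GPU B200 node), one context server per 4 GPUs, and a fixed
    16-GPU generation pool.
    """
    if gpus_per_node == 4:  # GB200
        ctx_node_num = math.ceil(concurrency / 5500)
        ctx_num = ctx_node_num          # one 4-GPU context server per node
        gen_node_num = 4
    else:                               # B200, 8 GPUs per node
        ctx_node_num = math.ceil(concurrency / 11000)
        ctx_num = ctx_node_num * 2      # two 4-GPU context servers per node
        gen_node_num = 2
    total_node_num = ctx_node_num + gen_node_num
    return {
        "ctx_num": ctx_num,
        "total_node_num": total_node_num,
        "ntasks": total_node_num * gpus_per_node,
    }


# b=64 in the dep16 loop gives concurrency = 64 * 16 = 1024.
print(plan_nodes(1024, gpus_per_node=4))  # {'ctx_num': 1, 'total_node_num': 5, 'ntasks': 20}
print(plan_nodes(1024, gpus_per_node=8))  # {'ctx_num': 2, 'total_node_num': 3, 'ntasks': 24}
```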
@@ -182,6 +182,9 @@ def _forward_nope(
            attention_mask=attention_mask,
            mrope_config=mrope_config)

+       if isinstance(attn_output, tuple):
+           attn_output = Fp4QuantizedTensor(attn_output[0], attn_output[1])
+
        attn_output = self.o_proj(attn_output,
                                  all_reduce_params=all_reduce_params)

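To make the new branch's intent concrete, here is a rough sketch under the assumption that the attention backend can return its FP4-quantized output as a (packed data, block scales) tuple. `Fp4QuantizedTensorSketch` below is a hypothetical two-field stand-in used only for illustration; it is not the project's actual `Fp4QuantizedTensor` definition.

```python
from dataclasses import dataclass

import torch


@dataclass
class Fp4QuantizedTensorSketch:
    """Hypothetical stand-in: packed FP4 payload plus its block scaling factors."""
    fp4_tensor: torch.Tensor
    scaling_factor: torch.Tensor


# The attention call may hand back either a plain tensor or a (data, scales) pair.
attn_output = (torch.zeros(8, 64, dtype=torch.uint8), torch.ones(8, 4))
if isinstance(attn_output, tuple):
    # Wrap the pair so the following o_proj projection receives a single object.
    attn_output = Fp4QuantizedTensorSketch(attn_output[0], attn_output[1])
print(type(attn_output).__name__)  # Fp4QuantizedTensorSketch
```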
@@ -234,10 +234,11 @@
    help="Path where per request information is written to.",
)
@optgroup.option(
-   "--enable_chunked_context/--disable_chunked_context",
-   default=True,
-   help=
-   "Enable/disable chunking in prefill stage for enhanced throughput benchmark. "
+   "--enable_chunked_context",
+   is_flag=True,
+   default=None,
+   help="Enable chunking in prefill stage for enhanced throughput benchmark. "
+   "Default is False for PyTorch/AutoDeploy backend, True for TensorRT backend.",
)
@optgroup.option(
    "--scheduler_policy",

@@ -348,8 +349,11 @@ def throughput_command(
    kv_cache_percent = params.get("kv_cache_free_gpu_mem_fraction")
    beam_width = params.get("beam_width")
    streaming: bool = params.get("streaming")
-   enable_chunked_context: bool = params.get("enable_chunked_context")
    scheduler_policy: str = params.get("scheduler_policy")
+   enable_chunked_context: bool = params.get("enable_chunked_context")
+   if enable_chunked_context is None:
+       # Set default based on backend: True for TensorRT, False for others
+       enable_chunked_context = backend.lower() == "tensorrt"

    # Update configuration with runtime options
    exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
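Reduced to a standalone sketch, the pattern above is a flag whose unset state (None) is distinguishable from an explicit True, so the effective default can be chosen per backend at runtime. The sketch uses plain click rather than the benchmark's option groups, and the `bench` command and its options are illustrative, not the real CLI.

```python
import click


@click.command()
@click.option("--backend", default="pytorch",
              help="Illustrative backend name: pytorch, autodeploy, or tensorrt.")
@click.option("--enable_chunked_context", is_flag=True, default=None,
              help="Enable chunked prefill; leave unset to use the backend default.")
def bench(backend, enable_chunked_context):
    """Toy command showing the tri-state flag resolution."""
    if enable_chunked_context is None:
        # Flag not given on the command line: only TensorRT defaults to True.
        enable_chunked_context = backend.lower() == "tensorrt"
    click.echo(f"backend={backend} enable_chunked_context={enable_chunked_context}")


if __name__ == "__main__":
    bench()
```

One consequence of the new interface: chunking can only be enabled explicitly or left at the backend default; the old --disable_chunked_context form is no longer accepted.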
tests/integration/test_lists:

@@ -243,7 +243,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-re
examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414)
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)
- test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420)
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5410399)
unittest/trt/attention/test_gpt_attention.py -k "partition0" SKIP (https://nvbugs/5412456)
unittest/trt/attention/test_gpt_attention.py -k "partition1" SKIP (https://nvbugs/5412456)