
Commit 8bc871e

build: Update trtllm commit to latest from release/1.1.0rc2 branch (#3023)
Signed-off-by: Ryan McCormick <[email protected]>
1 parent 0ba1486 commit 8bc871e

8 files changed: +74 −9 lines changed


components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@ CONTAINER_NAME=disaggr-test
 
 
 STREAMING=true
-CTX_GPU_FRAC=0.75
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
+CTX_GPU_FRAC=0.85
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
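
The new default uses Bash's ${VAR:-default} expansion, so 4608 now applies only when the caller has not already exported CACHE_TRANSCEIVER_MAX_NUM_TOKENS (submit_disagg.sh below exports 4608 for the 1k/1k sweeps and 8448 for the 8k/1k sweeps). A minimal sketch of the expansion behavior:

    # ${VAR:-default} falls back to the default only when VAR is unset or empty
    unset CACHE_TRANSCEIVER_MAX_NUM_TOKENS
    echo "${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}"    # prints 4608

    export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
    echo "${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}"    # prints 8448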

components/backends/trtllm/performance_sweeps/scripts/bench.sh

Lines changed: 6 additions & 4 deletions
@@ -155,8 +155,9 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
     --random-output-len ${osl} \
     --random-range-ratio 0.8 \
     --ignore-eos \
-    --backend "dynamo" \
-    --endpoint "/v1/chat/completions" \
+    --use-chat-template \
+    --backend "openai" \
+    --endpoint "/v1/completions" \
     --percentile-metrics ttft,tpot,itl,e2el \
     --max-concurrency "1" \
     --host ${hostname} \
@@ -179,8 +180,9 @@ for concurrency in ${concurrency_list}; do
     --random-range-ratio 0.8 \
     --use-chat-template \
     --ignore-eos \
-    --backend "dynamo" \
-    --endpoint "/v1/chat/completions" \
+    --use-chat-template \
+    --backend "openai" \
+    --endpoint "/v1/completions" \
     --percentile-metrics ttft,tpot,itl,e2el \
     --max-concurrency "$concurrency" \
     --host ${hostname} \
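
Both hunks retarget the load generator from the "dynamo" chat backend at /v1/chat/completions to the generic "openai" backend at /v1/completions. A quick way to sanity-check that the frontend actually serves that route before launching a sweep (a sketch only; the host, port, and model name below are placeholders, not values from this commit):

    curl -s http://localhost:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "placeholder-model", "prompt": "Hello", "max_tokens": 16}'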

components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py

Lines changed: 6 additions & 0 deletions
@@ -188,6 +188,7 @@ def gen_config_file(
         64,
         128,
         256,
+        384,
         512,
         768,
         1024,
@@ -209,6 +210,7 @@ def gen_config_file(
         "moe_expert_parallel_size": ctx_tp_size,
         "enable_attention_dp": ctx_enable_attention_dp,
         "pipeline_parallel_size": 1,
+        "cuda_graph_config": None,
         "print_iter_log": True,
         "disable_overlap_scheduler": True,
         "kv_cache_config": {
@@ -242,12 +244,16 @@
         },
         "moe_config": {
             "backend": gen_moe_backend,
+            "use_low_precision_moe_combine": True,
         },
         "cache_transceiver_config": {
             "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
             "backend": "DEFAULT",
         },
         "stream_interval": 20,
+        # Should be unused in Dynamo integration when TRTLLM detokenization
+        # is disabled, but set it here for config parity.
+        "num_postprocess_workers": 8,
     }
 
     if gen_tp_size == 8 and not gen_enable_attention_dp:
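
For reference, the new fields should surface in the generated engine YAML roughly as sketched below. This is a hand-written fragment, not output captured from gen_yaml.py; the surrounding keys, ordering, and the concrete moe backend value are produced by the script at generation time:

    # sketch: fragment the generated YAML is expected to contain
    cat <<'YAML'
    cuda_graph_config: null            # Python None serializes to YAML null
    moe_config:
      backend: <gen_moe_backend>       # placeholder for the script-chosen backend
      use_low_precision_moe_combine: true
    cache_transceiver_config:
      max_tokens_in_buffer: 4608       # cache_transceiver_max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 8
    YAML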

components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh

Lines changed: 7 additions & 0 deletions
@@ -40,12 +40,19 @@ echo " max_batch_size: ${max_batch_size}"
 echo " max_seq_len: ${max_seq_len}"
 
 export TLLM_LOG_LEVEL=INFO
+# NOTE: This var is default behavior in recent trtllm commits, and can
+# be removed. Keeping it here in case the script is run with older commits.
 export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
+# NOTE: This var was replaced with an LLM API / yaml engine config field
+# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
+# can be removed. Keeping it here in case the script is run with older commits.
+export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
 
 if [ "${enable_pdl}" = "true" ]; then
     export TRTLLM_ENABLE_PDL=1
 fi
 
+# NOTE: Set (or unset) these depending on what cluster you're using
 export TRTLLM_UCX_INTERFACE=enP6p9s0np0
 export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_3:1,mlx5_4:1,enP6p9s0np0
 

components/backends/trtllm/performance_sweeps/submit_disagg.sh

Lines changed: 48 additions & 1 deletion
@@ -293,6 +293,53 @@ main() {
             run_32_gpus_mtp
         fi
         ;;
+    "pareto")
+        # 1k/1k
+        export ISL=1024
+        export OSL=1024
+        export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608
+
+        if [[ "$mtp_mode" == "mtp=off" ]]; then
+            # 1k/1k mtp=off
+            run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141"
+            run_single 1 1 32 32 32 true "0.7" 0 0 "1075"
+            run_single 1 1 16 64 64 true "0.75" 0 0 "1075"
+            run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300"
+            run_single 1 1 8 512 512 true "0.8" 0 0 "4300"
+
+        else
+            # 1k/1k mtp=on
+            run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36"
+            run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075"
+            run_single 2 1 16 128 256 true "0.7" 1 0 "2150"
+            run_single 1 1 32 16 64 true "0.6" 3 0 "512"
+            run_single 1 1 8 256 512 true "0.8" 1 0 "2252"
+        fi
+
+        # 8k/1k
+        export ISL=8192
+        export OSL=1024
+        export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
+
+        if [[ "$mtp_mode" == "mtp=off" ]]; then
+            # 8k/1k mtp=off
+            run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34"
+            run_single 4 1 32 16 16 true "0.7" 0 0 "256 538"
+            run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if only 5 configs are needed
+            run_single 6 1 16 64 64 true "0.75" 0 0 "1075"
+            run_single 8 1 16 128 128 true "0.75" 0 0 "2150"
+            run_single 5 1 8 256 256 true "0.8" 0 0 "2150"
+        else
+            # 8k/1k mtp=on
+            run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18"
+            run_single 5 1 32 8 32 true "0.7" 3 0 "128 269"
+            run_single 8 1 32 16 64 true "0.7" 3 0 "538"
+            run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if only 5 configs are needed
+            run_single 8 1 16 64 256 true "0.75" 2 0 "1075"
+            run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if only 5 configs are needed
+            run_single 6 1 8 256 512 true "0.8" 1 0 "2150"
+        fi
+        ;;
     "4GPU")
         echo "Running 4 GPUs combinations for $mtp_mode mode..."
         if [[ "$mtp_mode" == "mtp=off" ]]; then
@@ -379,4 +426,4 @@ if [ $# -eq 0 ]; then
 fi
 
 # Run main function
-main "$@"
\ No newline at end of file
+main "$@"

components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 3 additions & 1 deletion
@@ -216,7 +216,9 @@ async def generate_locally(self, request: dict):
             )
 
             if res.finished and not out.get("finish_reason"):
-                logging.warning("Request finished with no finish reason set - this indicates a possible bug")
+                logging.warning(
+                    "Request finished with no finish reason set - this indicates a possible bug"
+                )
 
             # Yield the chunk to the client and update the token count for the next iteration.
             yield out

container/build.sh

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="9d6e87aed37b6f0b3b2be097c5fafe1497190a71"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="ef0d06df5812b510f9d3a03b3cbb6fbf6a06406f"
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -154,6 +154,7 @@ filterwarnings = [
     "ignore:.*unclosed.*socket.*:ResourceWarning", # Ignore unclosed socket warnings
     "ignore:.*unclosed event loop.*:ResourceWarning", # Ignore unclosed event loop warnings
     "ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
+    "ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
 ]
 
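
Each filterwarnings entry uses pytest's action:message-regex:category syntax; the new line matches the deprecation message the pynvml shim emits. To confirm locally that the warning still fires (a sketch; assumes the deprecated pynvml package is installed):

    # escalate FutureWarning to an error so the import fails loudly if the warning fires
    python3 -W error::FutureWarning -c "import pynvml"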
