
Commit 8bc871e

build: Update trtllm commit to latest from release/1.1.0rc2 branch (#3023)
Signed-off-by: Ryan McCormick <[email protected]>
1 parent 0ba1486 commit 8bc871e

8 files changed: +74 −9 lines changed


components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@ CONTAINER_NAME=disaggr-test
 
 
 STREAMING=true
-CTX_GPU_FRAC=0.75
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
+CTX_GPU_FRAC=0.85
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
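
The new default uses Bash's ${VAR:-default} expansion, so 4608 now applies only when the caller has not already exported CACHE_TRANSCEIVER_MAX_NUM_TOKENS (submit_disagg.sh below exports 4608 for the 1k/1k sweeps and 8448 for the 8k/1k sweeps). A minimal sketch of the expansion behavior:

    # ${VAR:-default} falls back to the default only when VAR is unset or empty
    unset CACHE_TRANSCEIVER_MAX_NUM_TOKENS
    echo "${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}"    # prints 4608

    export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
    echo "${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}"    # prints 8448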

components/backends/trtllm/performance_sweeps/scripts/bench.sh

Lines changed: 6 additions & 4 deletions
@@ -155,8 +155,9 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
     --random-output-len ${osl} \
     --random-range-ratio 0.8 \
     --ignore-eos \
-    --backend "dynamo" \
-    --endpoint "/v1/chat/completions" \
+    --use-chat-template \
+    --backend "openai" \
+    --endpoint "/v1/completions" \
     --percentile-metrics ttft,tpot,itl,e2el \
     --max-concurrency "1" \
     --host ${hostname} \
@@ -179,8 +180,9 @@ for concurrency in ${concurrency_list}; do
     --random-range-ratio 0.8 \
     --use-chat-template \
     --ignore-eos \
-    --backend "dynamo" \
-    --endpoint "/v1/chat/completions" \
+    --use-chat-template \
+    --backend "openai" \
+    --endpoint "/v1/completions" \
     --percentile-metrics ttft,tpot,itl,e2el \
     --max-concurrency "$concurrency" \
     --host ${hostname} \
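
Both hunks retarget the load generator from the "dynamo" chat backend at /v1/chat/completions to the generic "openai" backend at /v1/completions. A quick way to sanity-check that the frontend actually serves that route before launching a sweep (a sketch only; the host, port, and model name below are placeholders, not values from this commit):

    curl -s http://localhost:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "placeholder-model", "prompt": "Hello", "max_tokens": 16}'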

components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py

Lines changed: 6 additions & 0 deletions
@@ -188,6 +188,7 @@ def gen_config_file(
         64,
         128,
         256,
+        384,
         512,
         768,
         1024,
@@ -209,6 +210,7 @@ def gen_config_file(
         "moe_expert_parallel_size": ctx_tp_size,
         "enable_attention_dp": ctx_enable_attention_dp,
         "pipeline_parallel_size": 1,
+        "cuda_graph_config": None,
         "print_iter_log": True,
         "disable_overlap_scheduler": True,
         "kv_cache_config": {
@@ -242,12 +244,16 @@
         },
         "moe_config": {
             "backend": gen_moe_backend,
+            "use_low_precision_moe_combine": True,
         },
         "cache_transceiver_config": {
             "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
             "backend": "DEFAULT",
         },
         "stream_interval": 20,
+        # Should be unused in Dynamo integration when TRTLLM detokenization
+        # is disabled, but set it here for config parity.
+        "num_postprocess_workers": 8,
     }
 
     if gen_tp_size == 8 and not gen_enable_attention_dp:
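
For reference, the new fields should surface in the generated engine YAML roughly as sketched below. This is a hand-written fragment, not output captured from gen_yaml.py; the surrounding keys, ordering, and the concrete moe backend value are produced by the script at generation time:

    # sketch: fragment the generated YAML is expected to contain
    cat <<'YAML'
    cuda_graph_config: null            # Python None serializes to YAML null
    moe_config:
      backend: <gen_moe_backend>       # placeholder for the script-chosen backend
      use_low_precision_moe_combine: true
    cache_transceiver_config:
      max_tokens_in_buffer: 4608       # cache_transceiver_max_num_tokens
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 8
    YAML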

components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh

Lines changed: 7 additions & 0 deletions
@@ -40,12 +40,19 @@ echo " max_batch_size: ${max_batch_size}"
 echo " max_seq_len: ${max_seq_len}"
 
 export TLLM_LOG_LEVEL=INFO
+# NOTE: This var is default behavior in recent trtllm commits, and can
+# be removed. Keeping it here in case the script is run with older commits.
 export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
+# NOTE: This var was replaced with an LLM API / yaml engine config field
+# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
+# can be removed. Keeping it here in case the script is run with older commits.
+export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
 
 if [ "${enable_pdl}" = "true" ]; then
     export TRTLLM_ENABLE_PDL=1
 fi
 
+# NOTE: Set (or unset) these depending on what cluster you're using
 export TRTLLM_UCX_INTERFACE=enP6p9s0np0
 export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_3:1,mlx5_4:1,enP6p9s0np0
 

components/backends/trtllm/performance_sweeps/submit_disagg.sh

Lines changed: 48 additions & 1 deletion
@@ -293,6 +293,53 @@ main() {
             run_32_gpus_mtp
         fi
         ;;
+    "pareto")
+        # 1k/1k
+        export ISL=1024
+        export OSL=1024
+        export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608
+
+        if [[ "$mtp_mode" == "mtp=off" ]]; then
+            # 1k/1k mtp=off
+            run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141"
+            run_single 1 1 32 32 32 true "0.7" 0 0 "1075"
+            run_single 1 1 16 64 64 true "0.75" 0 0 "1075"
+            run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300"
+            run_single 1 1 8 512 512 true "0.8" 0 0 "4300"
+
+        else
+            # 1k/1k mtp=on
+            run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36"
+            run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075"
+            run_single 2 1 16 128 256 true "0.7" 1 0 "2150"
+            run_single 1 1 32 16 64 true "0.6" 3 0 "512"
+            run_single 1 1 8 256 512 true "0.8" 1 0 "2252"
+        fi
+
+        # 8k/1k
+        export ISL=8192
+        export OSL=1024
+        export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
+
+        if [[ "$mtp_mode" == "mtp=off" ]]; then
+            # 8k/1k mtp=off
+            run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34"
+            run_single 4 1 32 16 16 true "0.7" 0 0 "256 538"
+            run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if only 5 configs are needed
+            run_single 6 1 16 64 64 true "0.75" 0 0 "1075"
+            run_single 8 1 16 128 128 true "0.75" 0 0 "2150"
+            run_single 5 1 8 256 256 true "0.8" 0 0 "2150"
+        else
+            # 8k/1k mtp=on
+            run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18"
+            run_single 5 1 32 8 32 true "0.7" 3 0 "128 269"
+            run_single 8 1 32 16 64 true "0.7" 3 0 "538"
+            run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if only 5 configs are needed
+            run_single 8 1 16 64 256 true "0.75" 2 0 "1075"
+            run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if only 5 configs are needed
+            run_single 6 1 8 256 512 true "0.8" 1 0 "2150"
+        fi
+        ;;
     "4GPU")
         echo "Running 4 GPUs combinations for $mtp_mode mode..."
         if [[ "$mtp_mode" == "mtp=off" ]]; then
@@ -379,4 +426,4 @@ if [ $# -eq 0 ]; then
 fi
 
 # Run main function
-main "$@"
\ No newline at end of file
+main "$@"

components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 3 additions & 1 deletion
@@ -216,7 +216,9 @@ async def generate_locally(self, request: dict):
             )
 
             if res.finished and not out.get("finish_reason"):
-                logging.warning("Request finished with no finish reason set - this indicates a possible bug")
+                logging.warning(
+                    "Request finished with no finish reason set - this indicates a possible bug"
+                )
 
             # Yield the chunk to the client and update the token count for the next iteration.
             yield out

container/build.sh

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="9d6e87aed37b6f0b3b2be097c5fafe1497190a71"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="ef0d06df5812b510f9d3a03b3cbb6fbf6a06406f"
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -154,6 +154,7 @@ filterwarnings = [
     "ignore:.*unclosed.*socket.*:ResourceWarning", # Ignore unclosed socket warnings
     "ignore:.*unclosed event loop.*:ResourceWarning", # Ignore unclosed event loop warnings
     "ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
+    "ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
 ]
 
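
Each filterwarnings entry uses pytest's action:message-regex:category syntax; the new line matches the deprecation message the pynvml shim emits. To confirm locally that the warning still fires (a sketch; assumes the deprecated pynvml package is installed):

    # escalate FutureWarning to an error so the import fails loudly if the warning fires
    python3 -W error::FutureWarning -c "import pynvml"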
