tensorrt_llm/llmapi/llm_args.py (29 additions, 1 deletion)
@@ -2240,6 +2240,18 @@ class TorchLlmArgs(BaseLlmArgs):
         "If greater than 0, the request queue might wait up to batch_wait_timeout_ms to receive max_batch_size requests, if fewer than max_batch_size requests are currently available. If 0, no waiting occurs.",
         status="prototype")
 
+    batch_wait_timeout_iters: int = Field(
+        default=0,
+        description=
+        "Maximum number of iterations the scheduler will wait to accumulate incoming requests, improving GPU utilization. If greater than 0, the scheduler delays batch processing to gather more requests, up to the specified iteration limit. If 0, iteration-based batching delays are disabled.",
+        status="prototype")
+
+    batch_wait_max_tokens_ratio: float = Field(
+        default=0,
+        description=
+        "Token accumulation threshold ratio for batch scheduling. If greater than 0, the scheduler accumulates requests locally until the total token count reaches batch_wait_max_tokens_ratio * max_num_tokens. This improves GPU utilization by ensuring adequate batch sizes. If 0, token-based batching delays are disabled.",
+        status="prototype")