Skip to content

Commit 84bb95c

Browse files
Shunkang
authored and committed
Add ScheduleParams
Signed-off-by: Shunkang <[email protected]>
1 parent ee45e0c commit 84bb95c

File tree

5 files changed

+31
-1
lines changed

5 files changed

+31
-1
lines changed

tensorrt_llm/executor/executor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
print_colored_debug)
3030
from ..sampling_params import (BatchedLogitsProcessor, LogprobParams,
3131
SamplingParams)
32+
from ..schedule_params import ScheduleParams
3233
from .ipc import FusedIpcQueue
3334
from .postproc_worker import PostprocParams, PostprocWorkerConfig
3435
from .request import GenerationRequest, LoRARequest, PromptAdapterRequest
@@ -120,6 +121,7 @@ def generate_async(
120121
disaggregated_params: Optional[DisaggregatedParams] = None,
121122
postproc_params: Optional[PostprocParams] = None,
122123
multimodal_params: Optional[MultimodalParams] = None,
124+
schedule_params: Optional[ScheduleParams] = None,
123125
) -> GenerationResult:
124126
"""Generate output for the given prompt token ids in the asynchronous mode.
125127
Asynchronous generation accepts single prompt only.
@@ -142,7 +144,8 @@ def generate_async(
142144
streaming=streaming,
143145
kv_cache_retention_config=kv_cache_retention_config,
144146
disaggregated_params=disaggregated_params,
145-
multimodal_params=multimodal_params)
147+
multimodal_params=multimodal_params,
148+
schedule_params=schedule_params)
146149
result = self.submit(request)
147150
# release memory in time
148151
if hasattr(request, "multimodal_params"):

tensorrt_llm/executor/request.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from ..disaggregated_params import DisaggregatedParams
1111
from ..llmapi.llm_utils import KvCacheRetentionConfig
1212
from ..sampling_params import SamplingParams
13+
from ..schedule_params import ScheduleParams
1314
from .postproc_worker import PostprocParams
1415

1516
__all__ = [
@@ -86,6 +87,7 @@ def __init__(
8687
disaggregated_params: Optional[DisaggregatedParams] = None,
8788
postproc_params: Optional[PostprocParams] = None,
8889
multimodal_params: Optional[MultimodalParams] = None,
90+
schedule_params: Optional[ScheduleParams] = None,
8991
):
9092
if isinstance(prompt_token_ids, list):
9193
self.prompt_token_ids = prompt_token_ids
@@ -110,6 +112,7 @@ def __init__(
110112
self.kv_cache_retention_config = kv_cache_retention_config
111113
self.id: Optional[int] = None
112114
self.disaggregated_params = disaggregated_params
115+
self.schedule_params = schedule_params
113116

114117
def set_id(self, id):
115118
assert self.id is None, f"Request ID is already set: {self.id}"

tensorrt_llm/executor/worker.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,9 @@ def _deduce_max_tokens(request: GenerationRequest,
508508
executor_request.py_logits_post_processors = lp if isinstance(
509509
lp, list) else [lp]
510510

511+
if self._is_pytorch_backend and request.schedule_params is not None:
512+
executor_request.py_schedule_params = request.schedule_params
513+
511514
if request.query_token_ids is not None:
512515
# pytorch star attention workflow
513516
# a workaround to avoid public interface update

tensorrt_llm/llmapi/llm.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
create_input_processor_with_hash, prompt_inputs)
3131
from ..logger import logger
3232
from ..sampling_params import SamplingParams
33+
from ..schedule_params import ScheduleParams
3334
from .llm_args import (TORCH_LLMARGS_EXPLICIT_DOCSTRING,
3435
TRT_LLMARGS_EXPLICIT_DOCSTRING, PybindMirror,
3536
TorchLlmArgs, TrtLlmArgs)
@@ -235,6 +236,8 @@ def generate(
235236
KvCacheRetentionConfig, Sequence[KvCacheRetentionConfig]]] = None,
236237
disaggregated_params: Optional[Union[
237238
DisaggregatedParams, Sequence[DisaggregatedParams]]] = None,
239+
schedule_params: Optional[Union[ScheduleParams,
240+
List[ScheduleParams]]] = None,
238241
) -> Union[RequestOutput, List[RequestOutput]]:
239242
"""Generate output for the given prompts in the synchronous mode.
240243
Synchronous generation accepts either single prompt or batched prompts.
@@ -282,6 +285,7 @@ def _item_at(maybe_batched: Union[Any, Sequence[Any]], pos: int) -> Any:
282285
kv_cache_retention_config=_item_at(kv_cache_retention_config,
283286
i),
284287
disaggregated_params=_item_at(disaggregated_params, i),
288+
schedule_params=_item_at(schedule_params, i),
285289
streaming=False)
286290
futures.append(future)
287291

@@ -307,6 +311,7 @@ def generate_async(
307311
kv_cache_retention_config: Optional[KvCacheRetentionConfig] = None,
308312
disaggregated_params: Optional[DisaggregatedParams] = None,
309313
_postproc_params: Optional[PostprocParams] = None,
314+
schedule_params: Optional[ScheduleParams] = None,
310315
) -> RequestOutput:
311316
"""Generate output for the given prompt in the asynchronous mode.
312317
Asynchronous generation accepts single prompt only.
@@ -417,6 +422,7 @@ def generate_async(
417422
disaggregated_params=disaggregated_params,
418423
postproc_params=_postproc_params,
419424
multimodal_params=multimodal_params,
425+
schedule_params=schedule_params,
420426
)
421427

422428
return RequestOutput._from_generation_result(result, prompt,

tensorrt_llm/schedule_params.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from dataclasses import dataclass
2+
from typing import Optional
3+
4+
5+
@dataclass(slots=True, kw_only=True)
6+
class ScheduleParams:
7+
"""Schedule parameters.
8+
9+
Args:
10+
attention_dp_rank (int): The rank of target attention dp
11+
attention_dp_relax (bool): Whether to allow the request to be scheduled to other attention dp for better throughput
12+
"""
13+
14+
attention_dp_rank: Optional[int] = None
15+
attention_dp_relax: Optional[bool] = None

0 commit comments

Comments
 (0)