@@ -263,10 +263,6 @@ def __init__(
        lora_config: Optional[LoraConfig] = None,
        is_draft_model: bool = False,
    ):
-        # Set deterministic seed for consistent multi-GPU sampling using PyTorch RNG
-        # operations that avoid torch.multinomial's CPU-GPU sync overhead
-        torch.manual_seed(0)
-
        self.ub_buffers = None
        self.batch_size = batch_size
        self.max_num_tokens = max_num_tokens
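
The constructor no longer calls torch.manual_seed(0), so building the engine no longer clobbers the process-wide RNG state. If deterministic multi-GPU sampling is still wanted, one way to keep it without touching the global seed is a sampler-owned torch.Generator. This is only an illustrative sketch of that option, not code from this PR:

import torch


def make_sampling_generator(seed: int = 0,
                            device: str = "cuda") -> torch.Generator:
    # Hypothetical helper: every rank seeds the same value, so sampling stays
    # consistent across GPUs, while the global torch RNG is left untouched.
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)
    return gen


# Pass it explicitly to the RNG ops used for sampling, e.g.
#   noise = torch.rand(logits.shape, device="cuda", generator=gen)
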
@@ -381,23 +377,6 @@ def __init__(
            self.without_logits = self.spec_config.spec_dec_mode.without_logits(
            )
            self.max_draft_len = spec_config.max_draft_len
-
-            if self.is_advanced_mtp_sampler:
-                mtp_total_sampling_size = self.batch_size * (
-                    self.max_draft_len + 1)
-                self.temperatures_cuda = torch.empty(
-                    (mtp_total_sampling_size, ),
-                    dtype=torch.float,
-                    device='cuda')
-                self.top_k_cuda = torch.empty((mtp_total_sampling_size, ),
-                                              dtype=torch.int,
-                                              device='cuda')
-                self.top_p_cuda = torch.empty((mtp_total_sampling_size, ),
-                                              dtype=torch.float,
-                                              device='cuda')
-                self.min_p_cuda = torch.empty((mtp_total_sampling_size, ),
-                                              dtype=torch.float,
-                                              device='cuda')
        else:
            self.without_logits = False
            self.max_draft_len = 0
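
The buffers removed above are flat, one slot per request per sampled position: batch_size * (max_draft_len + 1) entries (the target token plus each draft token). A later hunk calls spec_metadata._set_up_advanced_mtp_sampling(self.batch_size, self.max_draft_len), so the allocation presumably moves onto the MTP spec metadata. The sketch below shows what such a setup could look like under that assumption; only the method name and arguments appear in this diff, the body is illustrative:

import torch


class _MtpSamplingBuffers:
    # Hypothetical stand-in for the spec-metadata side of the refactor.

    def _set_up_advanced_mtp_sampling(self, batch_size: int,
                                      max_draft_len: int) -> None:
        # One sampling slot per request for the target token plus each draft token.
        mtp_total_sampling_size = batch_size * (max_draft_len + 1)
        self.temperatures_cuda = torch.empty((mtp_total_sampling_size, ),
                                             dtype=torch.float,
                                             device='cuda')
        self.top_k_cuda = torch.empty((mtp_total_sampling_size, ),
                                      dtype=torch.int,
                                      device='cuda')
        self.top_p_cuda = torch.empty((mtp_total_sampling_size, ),
                                      dtype=torch.float,
                                      device='cuda')
        self.min_p_cuda = torch.empty((mtp_total_sampling_size, ),
                                      dtype=torch.float,
                                      device='cuda')
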
@@ -1196,38 +1175,50 @@ def _prepare_tp_inputs(
        top_p = []
        min_p = []

-        def get_request_temperature(request: LlmRequest) -> float:
-            if not request.sampling_config.temperature:
-                return 1.0
-            temperature = request.sampling_config.temperature[0]
-            if 0 < temperature < 1e-2:
-                # temperature less than 0.01 may cause numerical errors
-                temperature = 0.01
-            return temperature
-
-        def get_request_top_k(request: LlmRequest) -> int:
-            if not request.sampling_config.top_k:
-                top_k = 0
-            else:
-                top_k = request.sampling_config.top_k[0]
+        # advanced mtp sampling's request preprocessing helper functions
+        def collect_req_mtp_sampling_params(request: LlmRequest,
+                                            draft_len: int = 0):
+
+            def get_request_temperature(request: LlmRequest) -> float:
+                if not request.sampling_config.temperature:
+                    return 1.0
+                temperature = request.sampling_config.temperature[0]
+                if 0 < temperature < 1e-2:
+                    # temperature less than 0.01 may cause numerical errors
+                    temperature = 0.01
+                return temperature
+
+            def get_request_top_k(request: LlmRequest) -> int:
+                if not request.sampling_config.top_k:
+                    top_k = 0
+                else:
+                    top_k = request.sampling_config.top_k[0]

-            if top_k <= 0:
-                top_k = 2147483647
-            return top_k
+                # set k to a very large value (larger than vocab size) to disable top_k sampling
+                TOP_K_DISABLED = (1 << 31) - 1
+                if top_k <= 0:
+                    top_k = TOP_K_DISABLED
+                return top_k

-        def get_request_top_p(request: LlmRequest) -> float:
-            if not request.sampling_config.top_p:
-                top_p = 1.0
-            else:
-                top_p = request.sampling_config.top_p[0]
-            return top_p
+            def get_request_top_p(request: LlmRequest) -> float:
+                if not request.sampling_config.top_p:
+                    top_p = 1.0
+                else:
+                    top_p = request.sampling_config.top_p[0]
+                return top_p

-        def get_request_min_p(request: LlmRequest) -> float:
-            if not request.sampling_config.min_p:
-                min_p = 0.0
-            else:
-                min_p = request.sampling_config.min_p[0]
-            return min_p
+            def get_request_min_p(request: LlmRequest) -> float:
+                if not request.sampling_config.min_p:
+                    min_p = 0.0
+                else:
+                    min_p = request.sampling_config.min_p[0]
+                return min_p
+
+            temperatures.extend([get_request_temperature(request)] *
+                                (draft_len + 1))
+            top_k.extend([get_request_top_k(request)] * (draft_len + 1))
+            top_p.extend([get_request_top_p(request)] * (draft_len + 1))
+            min_p.extend([get_request_min_p(request)] * (draft_len + 1))

        for request in scheduled_requests.context_requests:
            request_ids.append(request.py_request_id)
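
collect_req_mtp_sampling_params broadcasts one request's sampling settings across its target token and draft_len draft positions, so each flattened list ends up with draft_len + 1 entries per request, in the order the sampler consumes them. A self-contained illustration of that extend pattern follows; the stub request is hypothetical and mirrors only the sampling_config fields read above:

from types import SimpleNamespace

# Hypothetical stand-in for LlmRequest: empty lists mean "use the default".
request = SimpleNamespace(sampling_config=SimpleNamespace(
    temperature=[0.7], top_k=[], top_p=[0.95], min_p=[]))

temperatures, top_k, top_p, min_p = [], [], [], []
draft_len = 3  # e.g. MTP proposing three draft tokens per step

# Same broadcast the helper performs: one entry per target + draft position.
temperatures.extend([request.sampling_config.temperature[0]] * (draft_len + 1))
top_k.extend([(1 << 31) - 1] * (draft_len + 1))  # missing top_k -> disabled sentinel
top_p.extend([request.sampling_config.top_p[0]] * (draft_len + 1))
min_p.extend([0.0] * (draft_len + 1))  # missing min_p -> 0.0 default

assert temperatures == [0.7, 0.7, 0.7, 0.7]
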
@@ -1263,10 +1254,7 @@ def get_request_min_p(request: LlmRequest) -> float:
                    multimodal_params_list.append(multimodal_params)

            if self.is_advanced_mtp_sampler:
-                temperatures.append(get_request_temperature(request))
-                top_k.append(get_request_top_k(request))
-                top_p.append(get_request_top_p(request))
-                min_p.append(get_request_min_p(request))
+                collect_req_mtp_sampling_params(request)

            request.py_batch_idx = request.py_seq_slot

@@ -1352,14 +1340,7 @@ def get_request_min_p(request: LlmRequest) -> float:
                request_ids.append(request.py_request_id)

                if self.is_advanced_mtp_sampler:
-                    temperatures.extend([get_request_temperature(request)] *
-                                        (num_draft_tokens + 1))
-                    top_k.extend([get_request_top_k(request)] *
-                                 (num_draft_tokens + 1))
-                    top_p.extend([get_request_top_p(request)] *
-                                 (num_draft_tokens + 1))
-                    min_p.extend([get_request_min_p(request)] *
-                                 (num_draft_tokens + 1))
+                    collect_req_mtp_sampling_params(request, num_draft_tokens)

                # update batch index
                request.py_batch_idx = request.py_seq_slot
@@ -1391,14 +1372,7 @@ def get_request_min_p(request: LlmRequest) -> float:
                request_ids.append(request.py_request_id)

                if self.is_advanced_mtp_sampler:
-                    temperatures.extend([get_request_temperature(request)] *
-                                        (self.max_draft_len + 1))
-                    top_k.extend([get_request_top_k(request)] *
-                                 (self.max_draft_len + 1))
-                    top_p.extend([get_request_top_p(request)] *
-                                 (self.max_draft_len + 1))
-                    min_p.extend([get_request_min_p(request)] *
-                                 (self.max_draft_len + 1))
+                    collect_req_mtp_sampling_params(request, self.max_draft_len)

        for request in generation_requests:
            beam_width = request.sampling_config.beam_width
@@ -1433,14 +1407,7 @@ def get_request_min_p(request: LlmRequest) -> float:
            gen_request_seq_slots.append(request.py_seq_slot)

            if self.is_advanced_mtp_sampler:
-                temperatures.extend([get_request_temperature(request)] *
-                                    (self.max_draft_len + 1))
-                top_k.extend([get_request_top_k(request)] *
-                             (self.max_draft_len + 1))
-                top_p.extend([get_request_top_p(request)] *
-                             (self.max_draft_len + 1))
-                min_p.extend([get_request_min_p(request)] *
-                             (self.max_draft_len + 1))
+                collect_req_mtp_sampling_params(request, self.max_draft_len)

            request.py_batch_idx = request.py_seq_slot

@@ -1561,21 +1528,6 @@ def previous_seq_slots_device():
            self.gather_ids_cuda[:len(gather_ids)].copy_(torch.tensor(
                gather_ids, dtype=torch.int, pin_memory=True),
                                                         non_blocking=True)
-            if self.is_advanced_mtp_sampler:
-                self.temperatures_cuda[:len(temperatures)].copy_(
-                    torch.tensor(temperatures,
-                                 dtype=torch.float,
-                                 pin_memory=True),
-                    non_blocking=True)
-                self.top_k_cuda[:len(top_k)].copy_(torch.tensor(
-                    top_k, dtype=torch.int, pin_memory=True),
-                                                   non_blocking=True)
-                self.top_p_cuda[:len(top_p)].copy_(torch.tensor(
-                    top_p, dtype=torch.float, pin_memory=True),
-                                                   non_blocking=True)
-                self.min_p_cuda[:len(min_p)].copy_(torch.tensor(
-                    min_p, dtype=torch.float, pin_memory=True),
-                                                   non_blocking=True)

        if not attn_metadata.is_cuda_graph:
            # Assumes seq lens do not change between CUDA graph invocations. This applies
@@ -1651,11 +1603,8 @@ def previous_seq_slots_device():
            spec_metadata.gather_ids = self.gather_ids_cuda[:len(gather_ids)]

            if self.is_advanced_mtp_sampler:
-                spec_metadata.temperatures = self.temperatures_cuda[:len(
-                    temperatures)]
-                spec_metadata.top_k = self.top_k_cuda[:len(top_k)]
-                spec_metadata.top_p = self.top_p_cuda[:len(top_p)]
-                spec_metadata.min_p = self.min_p_cuda[:len(min_p)]
+                spec_metadata.update_advanced_mtp_sampling_params(
+                    temperatures, top_k, top_p, min_p)

            spec_metadata.num_generations = len(
                scheduled_requests.generation_requests)
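
Only the call site of update_advanced_mtp_sampling_params is visible in this diff. Assuming the preallocated *_cuda buffers now live on the spec metadata (see the setup sketch after the __init__ hunk), a plausible body would stage the host lists in pinned memory, issue non-blocking copies, and expose batch-sized views, mirroring the copies and slices removed in the two hunks above. Everything here except the method name and argument order is an assumption:

from typing import List

import torch


class _MtpSamplingBuffersMixin:
    # Hypothetical continuation of the earlier setup sketch.

    def update_advanced_mtp_sampling_params(self, temperatures: List[float],
                                            top_k: List[int],
                                            top_p: List[float],
                                            min_p: List[float]) -> None:

        def _stage(dst: torch.Tensor, values: list,
                   dtype: torch.dtype) -> torch.Tensor:
            # Pinned host staging + async copy into the preallocated buffer,
            # then return a view trimmed to the current batch.
            src = torch.tensor(values, dtype=dtype, pin_memory=True)
            dst[:len(values)].copy_(src, non_blocking=True)
            return dst[:len(values)]

        self.temperatures = _stage(self.temperatures_cuda, temperatures,
                                   torch.float)
        self.top_k = _stage(self.top_k_cuda, top_k, torch.int)
        self.top_p = _stage(self.top_p_cuda, top_p, torch.float)
        self.min_p = _stage(self.min_p_cuda, min_p, torch.float)
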
@@ -2205,6 +2154,10 @@ def forward(
                spec_metadata.is_spec_dec_tree,
                spec_metadata.is_spec_dec_dynamic_tree,
                spec_metadata.max_draft_len)
+
+            if self.is_advanced_mtp_sampler:
+                spec_metadata._set_up_advanced_mtp_sampling(
+                    self.batch_size, self.max_draft_len)
        else:
            spec_resource_manager = None
            spec_metadata = None