
Commit 7f31555

code style cleanup and minor tweaks (no logic change)
Signed-off-by: Xuanyu Chen <[email protected]>
1 parent 1d243b9 commit 7f31555

File tree

3 files changed: +24 -25 lines changed


tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 14 additions & 15 deletions
```diff
@@ -380,22 +380,21 @@ def __init__(
             self.max_draft_len = spec_config.max_draft_len
 
             if self.is_advanced_mtp_sampler:
+                mtp_total_sampling_size = self.batch_size * (
+                    self.max_draft_len + 1)
                 self.temperatures_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.float,
-                    device='cuda')
-                self.top_k_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.int,
-                    device='cuda')
-                self.top_p_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.float,
-                    device='cuda')
-                self.min_p_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
+                    (mtp_total_sampling_size, ),
                     dtype=torch.float,
                     device='cuda')
+                self.top_k_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.int,
+                                              device='cuda')
+                self.top_p_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.float,
+                                              device='cuda')
+                self.min_p_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.float,
+                                              device='cuda')
         else:
             self.without_logits = False
             self.max_draft_len = 0
```
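
The only change in this hunk is hoisting the repeated `self.batch_size * (self.max_draft_len + 1)` size into a single `mtp_total_sampling_size` local; the buffers themselves are identical. For readers outside the codebase, here is a minimal, self-contained sketch of the same preallocation pattern, with `batch_size` and `max_draft_len` as stand-in values rather than the engine's real attributes:

```python
import torch

# Stand-in sizes; the real values come from the engine's batch and speculative-decoding config.
batch_size = 8
max_draft_len = 3

# Flat size: one sampling-parameter slot per request for each of the (max_draft_len + 1) sampled positions.
mtp_total_sampling_size = batch_size * (max_draft_len + 1)

# Preallocate flat buffers once; later steps only write into them.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
temperatures = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)
top_k = torch.empty((mtp_total_sampling_size, ), dtype=torch.int, device=device)
top_p = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)
min_p = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)

print(temperatures.shape)  # torch.Size([32]) for 8 requests * (3 drafts + 1)
```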
```diff
@@ -1162,7 +1161,7 @@ def _prepare_tp_inputs(
 
         def get_request_temperature(request: LlmRequest) -> float:
             if not request.sampling_config.temperature:
-                return 0.7
+                return 1.0
             temperature = request.sampling_config.temperature[0]
             if 0 < temperature < 1e-2:
                 # temperature less than 0.01 may cause numerical errors
@@ -1174,7 +1173,7 @@ def get_request_top_k(request: LlmRequest) -> int:
                 top_k = 0
             else:
                 top_k = request.sampling_config.top_k[0]
-            # flashinfer expects k > d for no top_k filter
+
             if top_k <= 0:
                 top_k = 2147483647
             return top_k
```
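
Both helpers above normalize per-request sampling parameters before they are copied into those preallocated buffers: a request without an explicit temperature now falls back to the neutral 1.0 instead of 0.7, and a non-positive top_k is mapped to 2147483647 (INT32_MAX), which downstream kernels treat as "no top-k filtering". Below is a standalone sketch of that logic; the `SamplingConfig` dataclass is a toy stand-in (the real `LlmRequest.sampling_config` carries more fields), and the clamp value for tiny temperatures is an assumption, since the body of that branch is not shown in the hunk above.

```python
from dataclasses import dataclass, field
from typing import List

# Toy stand-in for request.sampling_config.
@dataclass
class SamplingConfig:
    temperature: List[float] = field(default_factory=list)
    top_k: List[int] = field(default_factory=list)

TOP_K_DISABLED = 2147483647  # INT32_MAX sentinel: effectively no top-k filtering

def get_request_temperature(cfg: SamplingConfig) -> float:
    if not cfg.temperature:
        return 1.0  # neutral default when the request sets no temperature
    temperature = cfg.temperature[0]
    if 0 < temperature < 1e-2:
        # temperature below 0.01 may cause numerical errors; clamp it
        # (assumed handling; the exact branch body is outside the hunk above)
        temperature = 1e-2
    return temperature

def get_request_top_k(cfg: SamplingConfig) -> int:
    top_k = cfg.top_k[0] if cfg.top_k else 0
    if top_k <= 0:
        top_k = TOP_K_DISABLED
    return top_k

print(get_request_temperature(SamplingConfig()))      # 1.0
print(get_request_top_k(SamplingConfig(top_k=[0])))   # 2147483647
```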

tensorrt_llm/llmapi/llm_args.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -414,11 +414,11 @@ def supports_backend(self, backend: str) -> bool:
 
 
 class MTPDecodingConfig(DecodingBaseConfig):
-    num_nextn_predict_layers: Optional[int] = 1
-    use_relaxed_acceptance_for_thinking: Optional[bool] = False
-    relaxed_topk: Optional[int] = 1
-    relaxed_delta: Optional[float] = 0.
-    use_mtp_vanilla: Optional[bool] = False
+    num_nextn_predict_layers: int = 1
+    use_relaxed_acceptance_for_thinking: bool = False
+    relaxed_topk: int = 1
+    relaxed_delta: float = 0.
+    use_mtp_vanilla: bool = False
     use_advanced_mtp_sampler: Optional[bool] = False
 
     # TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`
```
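
Dropping `Optional[...]` from these annotations does not change the defaults; assuming `DecodingBaseConfig` is a pydantic model (as the annotated-field-with-default style suggests), it means `None` is no longer an accepted value for these fields. A toy illustration of that difference, not the real `MTPDecodingConfig`:

```python
from typing import Optional
from pydantic import BaseModel, ValidationError

class LooseConfig(BaseModel):
    relaxed_topk: Optional[int] = 1   # old style: None passes validation

class StrictConfig(BaseModel):
    relaxed_topk: int = 1             # new style: must be an int (default still 1)

print(LooseConfig(relaxed_topk=None).relaxed_topk)   # None slips through
print(StrictConfig().relaxed_topk)                   # 1
try:
    StrictConfig(relaxed_topk=None)
except ValidationError as exc:
    print(f"None rejected with {len(exc.errors())} validation error")
```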

tensorrt_llm/llmapi/tokenizer.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -96,11 +96,11 @@ def convert_ids_to_tokens(
             skip_special_tokens: bool = False) -> Union[str, List[str]]:
         # DeepSeek vocabulary has token ids not mapped to any tokens, these will get converted to None
         # by the tokenizer. We need to filter them out.
-        return [
-            token for token in self.tokenizer.convert_ids_to_tokens(
-                ids, skip_special_tokens=skip_special_tokens)
-            if token is not None
-        ]
+        tokens = self.tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens)
+        if isinstance(ids, int):
+            return tokens  # Single token, return as-is (could be None)
+        return [token for token in tokens if token is not None]
 
     def convert_tokens_to_string(
         self,
```
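
The rewritten method distinguishes the two input shapes the underlying tokenizer supports: a list of ids still comes back as a list with the unmapped (`None`) entries filtered out, while a single int id is returned as-is, which may legitimately be `None`. A small behavioral sketch, using a toy class standing in for the real Hugging Face tokenizer:

```python
from typing import List, Optional, Union

# Toy stand-in for the wrapped tokenizer: id 3 plays the role of an unmapped DeepSeek-style id.
class FakeTokenizer:
    _vocab = {0: '<s>', 1: 'Hello', 2: 'world'}

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        if isinstance(ids, int):
            return self._vocab.get(ids)           # single id -> token or None
        return [self._vocab.get(i) for i in ids]  # list of ids -> list (may contain None)

def convert_ids_to_tokens(tokenizer, ids) -> Union[Optional[str], List[str]]:
    tokens = tokenizer.convert_ids_to_tokens(ids)
    if isinstance(ids, int):
        return tokens                               # single token, possibly None
    return [t for t in tokens if t is not None]     # drop unmapped ids from lists

tok = FakeTokenizer()
print(convert_ids_to_tokens(tok, [1, 3, 2]))  # ['Hello', 'world']: None filtered out
print(convert_ids_to_tokens(tok, 3))          # None: passed through for a single id
```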
