Commit 0ee79b3

Change the logic to reuse enable_mixed_sampler flag; remove mtp advanced sampling flag
Signed-off-by: Xuanyu Chen <[email protected]>
1 parent 3be14a6 commit 0ee79b3

File tree

6 files changed: +16 lines, -11 lines

examples/llm-api/quickstart_advanced.py (1 addition, 5 deletions)

@@ -112,9 +112,6 @@ def add_llm_args(parser):
     parser.add_argument('--draft_model_dir', type=str, default=None)
     parser.add_argument('--max_matching_ngram_size', type=int, default=5)
     parser.add_argument('--use_one_model', default=False, action='store_true')
-    parser.add_argument('--use_advanced_mtp_sampler',
-                        default=False,
-                        action='store_true')

     # Relaxed acceptance
     parser.add_argument('--use_relaxed_acceptance_for_thinking',
@@ -166,8 +163,7 @@ def setup_llm(args, **kwargs):
             use_relaxed_acceptance_for_thinking=args.
             use_relaxed_acceptance_for_thinking,
             relaxed_topk=args.relaxed_topk,
-            relaxed_delta=args.relaxed_delta,
-            use_advanced_mtp_sampler=args.use_advanced_mtp_sampler)
+            relaxed_delta=args.relaxed_delta)
     elif spec_decode_algo == "EAGLE3":
         spec_config = EagleDecodingConfig(
             max_draft_len=args.spec_decode_max_draft_len,
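With the dedicated --use_advanced_mtp_sampler flag removed, the quickstart builds MTPDecodingConfig without it, and per-request sampling for MTP is now controlled by the backend-wide enable_mixed_sampler option. A minimal sketch of the resulting caller-side usage, assuming the PyTorch-backend LLM constructor accepts enable_mixed_sampler and forwards it to pytorch_backend_config (the forwarding this commit adds in model_engine.py); the model name and draft depth are illustrative:

# Sketch only: the exact plumbing of enable_mixed_sampler into the LLM
# constructor is assumed, not shown in this diff; the MTPDecodingConfig
# usage mirrors quickstart_advanced.py.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import MTPDecodingConfig

spec_config = MTPDecodingConfig(
    num_nextn_predict_layers=3,      # illustrative draft depth
    # use_advanced_mtp_sampler=True  <- removed by this commit
)

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # illustrative model
    speculative_config=spec_config,
    enable_mixed_sampler=True,        # assumed to reach pytorch_backend_config
)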

tensorrt_llm/_torch/model_config.py (4 additions, 0 deletions)

@@ -89,6 +89,10 @@ class ModelConfig(Generic[TConfig]):
     # Allow models to select op according to whether CUDA Graphs are used.
     use_cuda_graph: bool = False

+    # If true, iterate over sampling_params of each request and use the corresponding sampling strategy.
+    # Currently only used for DeepSeek-MTP.
+    enable_mixed_sampler: bool = False
+
     force_dynamic_quantization: bool = False

     extra_attrs: Dict = field(default_factory=dict, repr=False, init=False)
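The new ModelConfig comment describes the "mixed sampler" behavior: instead of one global greedy argmax, the sampler walks the sampling params of each request and applies the matching strategy. An illustrative-only sketch of that idea (none of these helper names exist in TensorRT-LLM; they are placeholders):

import torch

def sample_token(logits: torch.Tensor, temperature: float, top_k: int) -> int:
    """Pick one token id from a 1D logits tensor for a single request."""
    if temperature == 0.0:           # greedy request
        return int(torch.argmax(logits).item())
    logits = logits / temperature
    if top_k > 0:                    # restrict to the k most likely tokens
        topk_vals, topk_idx = torch.topk(logits, k=min(top_k, logits.numel()))
        probs = torch.softmax(topk_vals, dim=-1)
        return int(topk_idx[torch.multinomial(probs, 1)].item())
    probs = torch.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, 1).item())

def mixed_sample(batch_logits: torch.Tensor, requests: list[dict]) -> list[int]:
    """Iterate over per-request sampling params, one row of logits per request."""
    return [
        sample_token(row, req.get("temperature", 0.0), req.get("top_k", 0))
        for row, req in zip(batch_logits, requests)
    ]

# Example: one greedy request and one top-k sampled request in the same batch.
logits = torch.randn(2, 32000)
print(mixed_sample(logits, [{"temperature": 0.0}, {"temperature": 0.8, "top_k": 50}]))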

tensorrt_llm/_torch/pyexecutor/model_engine.py (3 additions, 2 deletions)

@@ -280,7 +280,7 @@ def __init__(
         self.is_spec_decode = spec_config is not None
         self.is_draft_model = is_draft_model
         self.is_advanced_mtp_sampler = self.is_spec_decode and self.spec_config.spec_dec_mode.is_mtp(
-        ) and self.spec_config.use_advanced_mtp_sampler
+        ) and self.pytorch_backend_config.enable_mixed_sampler

         self.in_warmup = False

@@ -298,6 +298,7 @@ def __init__(
             max_num_tokens=max_num_tokens,
             moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens,
             moe_load_balancer=pytorch_backend_config.moe_load_balancer,
+            enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler,
             lora_config=lora_config)
         # In case that some tests use stub models and override `_load_model`.
         if not hasattr(self.model, 'extra_attrs'):
@@ -1195,7 +1196,7 @@ def get_request_top_k(request: LlmRequest) -> int:
                 top_k = request.sampling_config.top_k[0]

             # set k to a very large value (larger than vocab size) to disable top_k sampling
-            TOP_K_DISABLED = (1 << 31) - 1
+            TOP_K_DISABLED = torch.iinfo(torch.int32).max
             if top_k <= 0:
                 top_k = TOP_K_DISABLED
             return top_k
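The TOP_K_DISABLED change is a readability refactor, not a behavior change; both expressions evaluate to the same 32-bit sentinel:

import torch

# 2**31 - 1 == 2147483647, the maximum value of a signed 32-bit integer.
assert torch.iinfo(torch.int32).max == (1 << 31) - 1 == 2147483647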

tensorrt_llm/_torch/speculative/mtp.py (6 additions, 1 deletion)

@@ -376,6 +376,11 @@ def __init__(self, spec_config: "MTPDecodingConfig", model_config=None):
         self.model_config = model_config
         self.is_thop = False

+        # Default to greedy mode. If true, use advanced pytorch sampling strategy.
+        self.enable_mixed_sampler = False
+        if self.model_config is not None:
+            self.enable_mixed_sampler = self.model_config.enable_mixed_sampler
+
     def forward(
         self,
         input_ids,
@@ -891,7 +896,7 @@ def sample_and_accept_draft_tokens(
                 logits, spec_metadata.draft_tokens, target_tokens_cache,
                 mtp_num_modules, batch_size, num_contexts, logits.shape[-1])
         else:
-            if self.spec_config.use_advanced_mtp_sampler:
+            if self.enable_mixed_sampler:
                 # Do advanced sampling for the input logits
                 # target_log_probs currently unused but kept for future log probs support in MTP
                 target_tokens, target_log_probs = sampling_batch(
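After this change MTPWorker no longer consults the spec config for the sampling mode; it caches the flag from the ModelConfig it is constructed with and branches on it at sampling time. A condensed sketch of that flag resolution and dispatch, assuming greedy argmax as the default path (not shown in this hunk) and using a placeholder sampling_batch in place of the real per-request sampler:

import torch

# Condensed sketch (not verbatim TensorRT-LLM source).
def sampling_batch(logits: torch.Tensor) -> torch.Tensor:
    """Placeholder per-request sampler: multinomial over softmax probabilities."""
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

class MTPWorkerSketch:
    def __init__(self, model_config=None):
        # Default to greedy mode; opt in only when ModelConfig carries the flag.
        self.enable_mixed_sampler = False
        if model_config is not None:
            self.enable_mixed_sampler = model_config.enable_mixed_sampler

    def sample(self, logits: torch.Tensor) -> torch.Tensor:
        if self.enable_mixed_sampler:
            return sampling_batch(logits)       # advanced, per-request path
        return torch.argmax(logits, dim=-1)     # default greedy path

# Greedy by default; pass a ModelConfig (or set the attribute, as the unit
# test below does) to exercise the mixed-sampling branch.
tokens = MTPWorkerSketch().sample(torch.randn(2, 128))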

tensorrt_llm/llmapi/llm_args.py (0 additions, 1 deletion)

@@ -478,7 +478,6 @@ class MTPDecodingConfig(DecodingBaseConfig):
     relaxed_topk: int = 1
     relaxed_delta: float = 0.
     use_mtp_vanilla: bool = False
-    use_advanced_mtp_sampler: Optional[bool] = False

     # TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`
     # Now we need a flag when MTPDecodingConfig is updated by PyTorchModelEngine.

tests/unittest/_torch/speculative/test_mtp.py (2 additions, 2 deletions)

@@ -342,8 +342,7 @@ def test_sample_and_accept_draft_tokens_adv_torch_sampler_greedy_mode(
     batch_size = len(draft_len)
     # enable advanced pytorch sampler
     spec_config = MTPDecodingConfig(
-        num_nextn_predict_layers=mtp_num_modules,
-        use_advanced_mtp_sampler=True)
+        num_nextn_predict_layers=mtp_num_modules)

     # attention metedata
     attn_metadata = TrtllmAttentionMetadata(max_num_requests=batch_size,
@@ -389,6 +388,7 @@ def test_sample_and_accept_draft_tokens_adv_torch_sampler_greedy_mode(
     # mtp worker
     # is_thop default to False for advanced pytorch sampler testing only
     mtpworker = MTPWorker(spec_config)
+    mtpworker.enable_mixed_sampler = True

     # Test advanced torch sampler
     accepted_tokens, num_accepted_tokens = mtpworker.sample_and_accept_draft_tokens(
