
Commit 4a68f67

add advanced torch sampler flag; add test; cleanup and rebase
Signed-off-by: Xuanyu Chen <[email protected]>
1 parent 5cdb0b9 commit 4a68f67

File tree

7 files changed: +205, -127 lines changed


examples/llm-api/quickstart_advanced.py

Lines changed: 5 additions & 1 deletion
@@ -112,6 +112,9 @@ def add_llm_args(parser):
     parser.add_argument('--draft_model_dir', type=str, default=None)
     parser.add_argument('--max_matching_ngram_size', type=int, default=5)
     parser.add_argument('--use_one_model', default=False, action='store_true')
+    parser.add_argument('--use_advanced_mtp_sampler',
+                        default=False,
+                        action='store_true')

     # Relaxed acceptance
     parser.add_argument('--use_relaxed_acceptance_for_thinking',
@@ -163,7 +166,8 @@ def setup_llm(args, **kwargs):
             use_relaxed_acceptance_for_thinking=args.
             use_relaxed_acceptance_for_thinking,
             relaxed_topk=args.relaxed_topk,
-            relaxed_delta=args.relaxed_delta)
+            relaxed_delta=args.relaxed_delta,
+            use_advanced_mtp_sampler=args.use_advanced_mtp_sampler)
     elif spec_decode_algo == "EAGLE3":
         spec_config = EagleDecodingConfig(
             max_draft_len=args.spec_decode_max_draft_len,
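Note: a minimal sketch of how the new store_true flag behaves, using only the argparse pattern shown in this hunk; in the script, setup_llm() then forwards the parsed value into the MTP speculative-decoding config. The helper name and the example arguments below are illustrative, not part of the commit.

import argparse

def add_sampler_flag(parser):  # illustrative helper; the script's add_llm_args() does the same
    parser.add_argument('--use_advanced_mtp_sampler',
                        default=False,
                        action='store_true')
    return parser

args = add_sampler_flag(argparse.ArgumentParser()).parse_args(
    ['--use_advanced_mtp_sampler'])
print(args.use_advanced_mtp_sampler)  # True; setup_llm() passes this into the MTP config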

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 92 additions & 55 deletions
@@ -17,8 +17,8 @@
 import tensorrt_llm.bindings.internal.userbuffers as ub
 from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
     BaseCheckpointLoader
-from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
+from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
 from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
                                  torch_dtype_to_str, trace_func)
@@ -261,7 +261,10 @@ def __init__(
         lora_config: Optional[LoraConfig] = None,
         is_draft_model: bool = False,
     ):
+        # Set deterministic seed for consistent multi-GPU sampling using PyTorch RNG
+        # operations that avoid torch.multinomial's CPU-GPU sync overhead
         torch.manual_seed(0)
+
         self.ub_buffers = None
         self.batch_size = batch_size
         self.max_num_tokens = max_num_tokens
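Note: a small illustration of why the constructor seeds PyTorch's RNG. With a fixed seed, every rank draws identical exponential noise in the sampler, so tensor-parallel ranks agree on sampled tokens without extra synchronization. This is a standalone sketch, not engine code.

import torch

def noise(seed: int) -> torch.Tensor:
    torch.manual_seed(seed)
    return torch.empty(4).exponential_()  # the same RNG op the sampler relies on

assert torch.equal(noise(0), noise(0))  # identical draws for the same seed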
@@ -278,6 +281,8 @@ def __init__(
         self.spec_config = spec_config
         self.is_spec_decode = spec_config is not None
         self.is_draft_model = is_draft_model
+        self.is_advanced_mtp_sampler = self.is_spec_decode and self.spec_config.spec_dec_mode.is_mtp(
+        ) and self.spec_config.use_advanced_mtp_sampler

         self.in_warmup = False

@@ -373,18 +378,24 @@ def __init__(
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
             )
             self.max_draft_len = spec_config.max_draft_len
-            self.temperatures_cuda = torch.empty((self.batch_size * (self.max_draft_len + 1), ),
-                                                 dtype=torch.float,
-                                                 device='cuda')
-            self.top_k_cuda = torch.empty((self.batch_size * (self.max_draft_len + 1), ),
-                                          dtype=torch.int,
-                                          device='cuda')
-            self.top_p_cuda = torch.empty((self.batch_size * (self.max_draft_len + 1), ),
-                                          dtype=torch.float,
-                                          device='cuda')
-            self.min_p_cuda = torch.empty((self.batch_size * (self.max_draft_len + 1), ),
-                                          dtype=torch.float,
-                                          device='cuda')
+
+            if self.is_advanced_mtp_sampler:
+                self.temperatures_cuda = torch.empty(
+                    (self.batch_size * (self.max_draft_len + 1), ),
+                    dtype=torch.float,
+                    device='cuda')
+                self.top_k_cuda = torch.empty(
+                    (self.batch_size * (self.max_draft_len + 1), ),
+                    dtype=torch.int,
+                    device='cuda')
+                self.top_p_cuda = torch.empty(
+                    (self.batch_size * (self.max_draft_len + 1), ),
+                    dtype=torch.float,
+                    device='cuda')
+                self.min_p_cuda = torch.empty(
+                    (self.batch_size * (self.max_draft_len + 1), ),
+                    dtype=torch.float,
+                    device='cuda')
         else:
             self.without_logits = False
             self.max_draft_len = 0
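Note: the four buffers above are sized so that each request contributes one slot per draft position plus one for the target token. A quick sketch with illustrative numbers (the real buffers live on the GPU):

import torch

batch_size, max_draft_len = 4, 3                 # illustrative values
num_slots = batch_size * (max_draft_len + 1)     # 16 sampling-parameter slots
temperatures_cuda = torch.empty((num_slots, ), dtype=torch.float)  # device='cuda' in the engine
print(num_slots)  # 16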
@@ -1142,11 +1153,12 @@ def _prepare_tp_inputs(
         draft_lens = []
         multimodal_params_list = []
         gen_request_seq_slots = []  # per generation request
-
-        temperatures = []
-        top_k = []
-        top_p = []
-        min_p = []
+
+        if self.is_advanced_mtp_sampler:
+            temperatures = []
+            top_k = []
+            top_p = []
+            min_p = []

         def get_request_temperature(request: LlmRequest) -> float:
             if not request.sampling_config.temperature:
@@ -1213,12 +1225,13 @@ def get_request_min_p(request: LlmRequest) -> float:

             if multimodal_params.has_content():
                 multimodal_params_list.append(multimodal_params)
-
-            temperatures.append(get_request_temperature(request))
-            top_k.append(get_request_top_k(request))
-            top_p.append(get_request_top_p(request))
-            min_p.append(get_request_min_p(request))
-
+
+            if self.is_advanced_mtp_sampler:
+                temperatures.append(get_request_temperature(request))
+                top_k.append(get_request_top_k(request))
+                top_p.append(get_request_top_p(request))
+                min_p.append(get_request_min_p(request))
+
             request.py_batch_idx = request.py_seq_slot

         num_ctx_requests = len(scheduled_requests.context_requests)
@@ -1300,10 +1313,17 @@ def get_request_min_p(request: LlmRequest) -> float:
                         past_seen_token_num + 1 + num_draft_tokens)))
                 num_cached_tokens_per_seq.append(past_seen_token_num)
                 request_ids.append(request.py_request_id)
-                temperatures.extend([get_request_temperature(request)] * (num_draft_tokens + 1))
-                top_k.extend([get_request_top_k(request)] * (num_draft_tokens + 1))
-                top_p.extend([get_request_top_p(request)] * (num_draft_tokens + 1))
-                min_p.extend([get_request_min_p(request)] * (num_draft_tokens + 1))
+
+                if self.is_advanced_mtp_sampler:
+                    temperatures.extend([get_request_temperature(request)] *
+                                        (num_draft_tokens + 1))
+                    top_k.extend([get_request_top_k(request)] *
+                                 (num_draft_tokens + 1))
+                    top_p.extend([get_request_top_p(request)] *
+                                 (num_draft_tokens + 1))
+                    min_p.extend([get_request_min_p(request)] *
+                                 (num_draft_tokens + 1))
+
                 # update batch index
                 request.py_batch_idx = request.py_seq_slot
             else:
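Note: the extend() calls above flatten per-request sampling parameters into per-token lists; each request's value is repeated once per draft token plus once for the target token, so the lists line up row-for-row with the flattened logits. A minimal sketch with made-up values:

num_draft_tokens = 2
request_temperatures = [1.0, 0.7]    # two hypothetical requests
temperatures = []
for t in request_temperatures:
    temperatures.extend([t] * (num_draft_tokens + 1))
print(temperatures)  # [1.0, 1.0, 1.0, 0.7, 0.7, 0.7]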
@@ -1332,10 +1352,16 @@ def get_request_min_p(request: LlmRequest) -> float:
                                            self.max_draft_len + 1)
                 prompt_lengths.append(request.py_prompt_len)
                 request_ids.append(request.py_request_id)
-                temperatures.extend([get_request_temperature(request)] * (self.max_draft_len + 1))
-                top_k.extend([get_request_top_k(request)] * (self.max_draft_len + 1))
-                top_p.extend([get_request_top_p(request)] * (self.max_draft_len + 1))
-                min_p.extend([get_request_min_p(request)] * (self.max_draft_len + 1))
+
+                if self.is_advanced_mtp_sampler:
+                    temperatures.extend([get_request_temperature(request)] *
+                                        (self.max_draft_len + 1))
+                    top_k.extend([get_request_top_k(request)] *
+                                 (self.max_draft_len + 1))
+                    top_p.extend([get_request_top_p(request)] *
+                                 (self.max_draft_len + 1))
+                    min_p.extend([get_request_min_p(request)] *
+                                 (self.max_draft_len + 1))

         for request in generation_requests:
             beam_width = request.sampling_config.beam_width
@@ -1368,11 +1394,17 @@ def get_request_min_p(request: LlmRequest) -> float:

             request_ids.append(request.py_request_id)
             gen_request_seq_slots.append(request.py_seq_slot)
-
-            temperatures.extend([get_request_temperature(request)] * (self.max_draft_len + 1))
-            top_k.extend([get_request_top_k(request)] * (self.max_draft_len + 1))
-            top_p.extend([get_request_top_p(request)] * (self.max_draft_len + 1))
-            min_p.extend([get_request_min_p(request)] * (self.max_draft_len + 1))
+
+            if self.is_advanced_mtp_sampler:
+                temperatures.extend([get_request_temperature(request)] *
+                                    (self.max_draft_len + 1))
+                top_k.extend([get_request_top_k(request)] *
+                             (self.max_draft_len + 1))
+                top_p.extend([get_request_top_p(request)] *
+                             (self.max_draft_len + 1))
+                min_p.extend([get_request_min_p(request)] *
+                             (self.max_draft_len + 1))
+
             request.py_batch_idx = request.py_seq_slot

         previous_batch_len = len(previous_batch_indices)
@@ -1476,18 +1508,21 @@ def previous_seq_slots_device():
         self.gather_ids_cuda[:len(gather_ids)].copy_(torch.tensor(
             gather_ids, dtype=torch.int, pin_memory=True),
                                                      non_blocking=True)
-        self.temperatures_cuda[:len(temperatures)].copy_(torch.tensor(
-            temperatures, dtype=torch.float, pin_memory=True),
-                                                         non_blocking=True)
-        self.top_k_cuda[:len(top_k)].copy_(torch.tensor(
-            top_k, dtype=torch.int, pin_memory=True),
-                                           non_blocking=True)
-        self.top_p_cuda[:len(top_p)].copy_(torch.tensor(
-            top_p, dtype=torch.float, pin_memory=True),
-                                           non_blocking=True)
-        self.min_p_cuda[:len(min_p)].copy_(torch.tensor(
-            min_p, dtype=torch.float, pin_memory=True),
-                                           non_blocking=True)
+        if self.is_advanced_mtp_sampler:
+            self.temperatures_cuda[:len(temperatures)].copy_(
+                torch.tensor(temperatures,
+                             dtype=torch.float,
+                             pin_memory=True),
+                non_blocking=True)
+            self.top_k_cuda[:len(top_k)].copy_(torch.tensor(
+                top_k, dtype=torch.int, pin_memory=True),
+                                               non_blocking=True)
+            self.top_p_cuda[:len(top_p)].copy_(torch.tensor(
+                top_p, dtype=torch.float, pin_memory=True),
+                                               non_blocking=True)
+            self.min_p_cuda[:len(min_p)].copy_(torch.tensor(
+                min_p, dtype=torch.float, pin_memory=True),
+                                               non_blocking=True)

         if not attn_metadata.is_cuda_graph:
             # Assumes seq lens do not change between CUDA graph invocations. This applies
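Note: the copies above follow the same pattern as the existing gather_ids path: stage the Python list in a pinned CPU tensor and copy it into a slice of the preallocated CUDA buffer with non_blocking=True so the transfer can overlap other stream work. A standalone sketch (requires a CUDA device):

import torch

if torch.cuda.is_available():
    temperatures = [1.0, 1.0, 0.7, 0.7]          # illustrative per-token values
    temperatures_cuda = torch.empty((8, ), dtype=torch.float, device='cuda')
    temperatures_cuda[:len(temperatures)].copy_(
        torch.tensor(temperatures, dtype=torch.float, pin_memory=True),
        non_blocking=True)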
@@ -1562,12 +1597,14 @@ def previous_seq_slots_device():
                                                          total_draft_lens]
             spec_metadata.request_ids = request_ids
             spec_metadata.gather_ids = self.gather_ids_cuda[:len(gather_ids)]
-            spec_metadata.temperatures = self.temperatures_cuda[:len(temperatures)]
-            spec_metadata.top_k = self.top_k_cuda[:len(top_k)]
-            spec_metadata.top_p = self.top_p_cuda[:len(top_p)]
-            spec_metadata.min_p = self.min_p_cuda[:len(min_p)]
-            # if attn_metadata.is_cuda_graph and not torch.cuda.is_current_stream_capturing():
-            #     spec_metadata.generator = torch.Generator(device='cpu').manual_seed(0)
+
+            if self.is_advanced_mtp_sampler:
+                spec_metadata.temperatures = self.temperatures_cuda[:len(
+                    temperatures)]
+                spec_metadata.top_k = self.top_k_cuda[:len(top_k)]
+                spec_metadata.top_p = self.top_p_cuda[:len(top_p)]
+                spec_metadata.min_p = self.min_p_cuda[:len(min_p)]
+
             spec_metadata.num_generations = len(
                 scheduled_requests.generation_requests)
             spec_metadata.num_tokens = total_num_tokens
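Note: slicing the preallocated buffers, as above, hands spec_metadata views rather than copies, so no per-step CUDA allocations occur and buffer addresses stay stable across iterations. Sketch with a CPU tensor standing in for temperatures_cuda:

import torch

buf = torch.empty(16, dtype=torch.float)   # stands in for self.temperatures_cuda
active_tokens = 6
view = buf[:active_tokens]                 # a view into the same storage, not a copy
assert view.data_ptr() == buf.data_ptr()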

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 17 additions & 60 deletions
@@ -4,7 +4,6 @@
 from typing import Literal, Optional

 import torch
-import flashinfer

 from tensorrt_llm._torch.pyexecutor.handle_logits import HandleLogits
 from tensorrt_llm._torch.pyexecutor.make_decoding_batch_input_output import \
@@ -151,42 +150,6 @@ def top_p_sampling_batch(logits: torch.Tensor, top_p: float = 0.9):
     next_tokens = torch.multinomial(softmax, num_samples=1).squeeze(-1)
     return next_tokens, softmax

-def flashinfer_sample(
-    logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
-    generator: Optional[torch.Generator] = None,
-) -> torch.Tensor:
-    """Sample from the logits using FlashInfer.
-
-    Statistically, this function is equivalent to the `random_sample` function.
-    However, this function is faster because it avoids sorting the logits tensor
-    via rejection sampling.
-
-    NOTE: The outputs of this function do not necessarily match the outputs of
-    the `random_sample` function. It only guarantees that the outputs are
-    statistically equivalent.
-
-    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
-    does not. Call this function at the end of the forward pass to minimize
-    the synchronization overhead.
-    """
-    assert not (k is None and p is None)
-    if k is None:
-        # Top-p only.
-        probs = logits.softmax(dim=-1, dtype=torch.float32)
-        next_token_ids = flashinfer.sampling.top_p_sampling_from_probs(
-            probs, p, deterministic=True, generator=generator)
-    elif p is None:
-        # Top-k only.
-        probs = logits.softmax(dim=-1, dtype=torch.float32)
-        next_token_ids = flashinfer.sampling.top_k_sampling_from_probs(
-            probs, k, deterministic=True, generator=generator)
-    else:
-        # Both top-k and top-p.
-        next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits(
-            logits, k, p, deterministic=True, generator=generator)
-    return next_token_ids.view(-1).long()

 def forward_native(
     logits: torch.Tensor,
@@ -202,9 +165,8 @@ def forward_native(
     probs = logits.softmax(dim=-1, dtype=torch.float32)
     return random_sample(probs)

-def random_sample(
-    probs: torch.Tensor,
-) -> torch.Tensor:
+
+def random_sample(probs: torch.Tensor, ) -> torch.Tensor:
     """Randomly sample from the probabilities.

     We use this function instead of torch.multinomial because torch.multinomial
@@ -214,6 +176,7 @@ def random_sample(
     q.exponential_()
     return probs.div_(q).argmax(dim=-1).view(-1)

+
 def apply_min_p(
     logits: torch.Tensor,
     min_p: torch.Tensor,
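Note: the exponential trick in random_sample works because, for probabilities p_i and independent q_i ~ Exponential(1), argmax_i p_i / q_i = argmin_i q_i / p_i, and q_i / p_i is Exponential with rate p_i, so the argmin is distributed as Categorical(p). A quick standalone check:

import torch

probs = torch.tensor([[0.1, 0.2, 0.7]])
q = torch.empty_like(probs).exponential_()   # q_i ~ Exp(1)
token = probs.div(q).argmax(dim=-1)          # distributed as Categorical([0.1, 0.2, 0.7])
print(token)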
@@ -224,9 +187,7 @@ def apply_min_p(
     # Convert logits to probability distribution
    probability_values = torch.nn.functional.softmax(logits, dim=-1)
     # Calculate maximum probabilities per sequence
-    max_probabilities = torch.amax(probability_values,
-                                   dim=-1,
-                                   keepdim=True)
+    max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
     # Reshape min_p for broadcasting
     adjusted_min_p = min_p.unsqueeze(1) * max_probabilities
     # Identify valid tokens using threshold comparison
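Note: a worked example of the thresholding above: with min_p = 0.2 and a maximum token probability of roughly 0.71, anything below 0.2 * 0.71 ≈ 0.14 is masked to -inf. Values are illustrative:

import torch

logits = torch.tensor([[2.0, 1.0, -1.0]])
min_p = torch.tensor([0.2])
probs = torch.softmax(logits, dim=-1)                        # ~[0.71, 0.26, 0.04]
threshold = min_p.unsqueeze(1) * probs.amax(dim=-1, keepdim=True)
print(logits.masked_fill(probs < threshold, -float('inf')))  # last token masked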
@@ -235,6 +196,7 @@ def apply_min_p(
     logits[~valid_token_mask] = -float('inf')
     return logits

+
 def apply_top_k_top_p(
     logits: torch.Tensor,
     k: Optional[torch.Tensor],
@@ -268,44 +230,39 @@ def apply_top_k_top_p(
     logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
     return logits

+
 def greedy_sample(logits: torch.Tensor) -> torch.Tensor:
     return logits.argmax(dim=-1).view(-1)

+
 def apply_temperature(
     logits: torch.Tensor,
     temp: torch.Tensor,
 ) -> torch.Tensor:
     # Use in-place division to avoid creating a new tensor.
     return logits.div_(temp.unsqueeze(dim=1))

-def sampling_batch(
-    logits: torch.Tensor,
-    temperatures: torch.Tensor,
-    top_k: torch.Tensor,
-    top_p: torch.Tensor,
-    min_p: torch.Tensor
-) -> tuple[torch.Tensor, torch.Tensor]:
+
+def sampling_batch(logits: torch.Tensor, temperatures: torch.Tensor,
+                   top_k: torch.Tensor, top_p: torch.Tensor,
+                   min_p: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     raw_probs = torch.softmax(logits, dim=-1)
     greedy_sampled = greedy_sample(logits)
     logits = apply_temperature(logits, temperatures)
     logits = apply_min_p(logits, min_p)
-    # if not torch.cuda.is_current_stream_capturing():
-    #     generator = torch.Generator(device="cuda")
-    #     generator.manual_seed(0)
-    #     next_tokens = flashinfer_sample(adjusted_logits, top_k, top_p, generator)
-    # logits = apply_top_k_top_p(logits, top_k, top_p)
     random_sampled = forward_native(logits, top_k, top_p)
     next_tokens = torch.where(
-        temperatures < 1e-5,
-        greedy_sampled,
-        random_sampled,
-        out=greedy_sampled,  # Reuse tensor
-    )
+        temperatures <= 1e-2,  # Match the clamping threshold
+        greedy_sampled,
+        random_sampled,
+        out=greedy_sampled,  # Reuse tensor
+    )
     token_probs = torch.gather(raw_probs, dim=1,
                                index=next_tokens.unsqueeze(1)).squeeze(-1)
     log_probs = torch.log(token_probs)
     return next_tokens, log_probs

+
 def greedy_search_sampling_batch(logits):
     next_tokens = torch.argmax(logits, dim=-1)
     softmax = torch.softmax(logits, dim=-1)
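Note: a hypothetical call of sampling_batch for two rows of logits, assuming a build that includes this commit so the function is importable from the module above. The first row uses a near-zero temperature (<= 1e-2) and so takes the greedy branch of torch.where; the second is sampled after temperature, min-p, top-k, and top-p are applied. The input values are made up.

import torch

from tensorrt_llm._torch.pyexecutor.sampler import sampling_batch

logits = torch.tensor([[4.0, 1.0, 0.5, 0.2],
                       [1.0, 1.1, 0.9, 1.05]])
temperatures = torch.tensor([1e-3, 0.8])
top_k = torch.tensor([4, 2])
top_p = torch.tensor([1.0, 0.9])
min_p = torch.tensor([0.0, 0.0])

next_tokens, log_probs = sampling_batch(logits, temperatures, top_k, top_p, min_p)
print(next_tokens)  # first entry is the argmax of row 0; second is a top-k/top-p sample
print(log_probs)    # log-probabilities of the chosen tokens under the unmodified softmax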
