
Commit f2af5a0

feat: sampling using FlashInfer.sampling
Signed-off-by: ixlmar <[email protected]>
1 parent a0024f4 commit f2af5a0

7 files changed: +625, -18 lines
pyproject.toml

Lines changed: 4 additions & 0 deletions
@@ -37,6 +37,7 @@ extend_skip_glob = [
     "tests/unittest/_torch/sampler/test_torch_sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampling_utils.py",
+    "tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py",
 ]

 [tool.yapf]
@@ -71,6 +72,7 @@ ignore_patterns = [
     "tests/unittest/_torch/sampler/test_torch_sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampling_utils.py",
+    "tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py",
 ]

 [tool.codespell]
@@ -108,6 +110,7 @@ exclude = [
     "tests/unittest/_torch/models/checkpoints/hf/test_weight_loader.py",
     "tensorrt_llm/_torch/pyexecutor/sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampling_utils.py",
+    "tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py",
 ]


@@ -155,6 +158,7 @@ include = [
     "tests/unittest/_torch/sampler/test_torch_sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampler.py",
     "tensorrt_llm/_torch/pyexecutor/sampling_utils.py",
+    "tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py",
 ]
 exclude = [
     "**3rdparty/**",

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 9 additions & 3 deletions
@@ -820,7 +820,8 @@ def create_py_executor_instance(
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
                               max_batch_size: int,
                               speculative_config: SpeculativeConfig,
-                              max_beam_width: int):
+                              max_beam_width: int,
+                              disable_flash_infer_sampling: bool):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)
@@ -838,6 +839,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
         max_total_draft_tokens=max_total_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=max_beam_width,
+        disable_flash_infer_sampling=disable_flash_infer_sampling,
     )


@@ -847,13 +849,17 @@ def instantiate_sampler(engine: PyTorchModelEngine,
                         max_seq_len: int, mm_encoder_only: bool,
                         speculative_config: SpeculativeConfig,
                         decoding_config: trtllm.DecodingConfig,
-                        kv_cache_config: KvCacheConfig):
+                        kv_cache_config: KvCacheConfig,
+                        disable_flash_infer_sampling: bool,
+                        ):
     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
         max_batch_size=max_batch_size,
         speculative_config=speculative_config,
-        max_beam_width=max_beam_width)
+        max_beam_width=max_beam_width,
+        disable_flash_infer_sampling=disable_flash_infer_sampling,
+    )
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 2 deletions
@@ -201,7 +201,7 @@ def get_guided_decoding_config(guided_decoding_backend: str,

 def create_py_executor(
     llm_args: TorchLlmArgs,
-    checkpoint_dir: str = None,
+    checkpoint_dir: Optional[str] = None,
     tokenizer: Optional[TokenizerBase] = None,
     profiling_stage_data: Optional[dict] = None,
 ) -> PyExecutor:
@@ -501,7 +501,9 @@ def drafting_loop_wrapper(model):
         mm_encoder_only=mm_encoder_only,
         speculative_config=spec_config,
         decoding_config=decoding_config,
-        kv_cache_config=kv_cache_config)
+        kv_cache_config=kv_cache_config,
+        disable_flash_infer_sampling=llm_args._disable_flash_infer_sampling,
+    )
     logger.info(f"Using Sampler: {type(sampler).__name__}")

     if kv_connector_config is not None:
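
Taken together with tensorrt_llm/_torch/pyexecutor/_util.py above, the new flag flows from the LLM-API arguments down to the sampler. A condensed, comment-only summary of that call chain (the real signatures carry many more parameters than shown here):

# create_py_executor(llm_args, ...)                                  [py_executor_creator.py]
#   -> instantiate_sampler(...,
#          disable_flash_infer_sampling=llm_args._disable_flash_infer_sampling)   [_util.py]
#     -> create_torch_sampler_args(..., disable_flash_infer_sampling=...)         [_util.py]
#       -> TorchSampler.Args(disable_flash_infer_sampling=...)                    [sampler.py]
#         -> TorchSampler.__init__ selects FlashInferGroupedStrategySampler when
#            FlashInfer is importable and the flag is left at its default (False),
#            otherwise SimpleGroupedStrategySampler.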

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 18 additions & 5 deletions
@@ -19,7 +19,7 @@
 from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import repeat
-from typing import Any, Callable, List, Optional, TypeVar, cast
+from typing import Any, Callable, List, Optional, Type, TypeVar, cast

 import torch
 import torch.nn.functional as F
@@ -53,13 +53,15 @@
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.sampling_params import SamplingParams

+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
 from ..speculative.spec_tree_manager import SpecTreeManager
 from .finish_reason import FinishedState
 from .llm_request import LlmRequest, LlmRequestState, get_draft_token_length
 from .resource_manager import ResourceManager, ResourceManagerType
 from .sampling_utils import (
     GREEDY,
     GenericStrategyKeyType,
+    GroupedStrategySampler,
     SimpleGroupedStrategySampler,
     Strategy,
     UtilsSamplingParams,
@@ -71,6 +73,9 @@
 )
 from .scheduler import ScheduledRequests

+if IS_FLASHINFER_AVAILABLE:
+    from .sampling_utils_flashinfer import FlashInferGroupedStrategySampler
+
 if sys.version_info[:2] >= (3, 12):
     from typing import override
 else:
@@ -266,7 +271,7 @@ def _request_strategy(request: LlmRequest, *, vocab_size: int) -> Strategy:
 def _group_requests_by_strategy_key(
     requests: Iterable[LlmRequest],
     *,
-    strategy_to_key: Callable[[Strategy], GenericStrategyKeyType],
+    strategy_to_key: Callable[[Strategy, bool], GenericStrategyKeyType],
     pin_memory: bool = False,
     vocab_size: int,
 ) -> dict[tuple[GenericStrategyKeyType, bool], tuple[torch.Tensor, List[Strategy]]]:
@@ -276,8 +281,8 @@ def _group_requests_by_strategy_key(
     )
     for req_index, req in enumerate(requests):
         strategy = _request_strategy(req, vocab_size=vocab_size)
-        strategy_key = strategy_to_key(strategy)
         speculation_needs_probs = req.py_draft_logits is not None and strategy is not GREEDY
+        strategy_key = strategy_to_key(strategy, speculation_needs_probs)
         group_dict_entry = group_dict[(strategy_key, speculation_needs_probs)]
         group_dict_entry[0].append(req_index)
         group_dict_entry[1].append(strategy)
@@ -586,6 +591,7 @@ class Args:
         max_num_sequences: int
         max_beam_width: int
         max_total_draft_tokens: int
+        disable_flash_infer_sampling: bool = False

     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
@@ -602,6 +608,13 @@ def __init__(self, args: Args):
         with torch.inference_mode(False):
             self.store = self.create_store()

+        self._grouped_sampler_cls: Type[GroupedStrategySampler]
+        if IS_FLASHINFER_AVAILABLE and not args.disable_flash_infer_sampling:
+            cls_not_possibly_unbound = FlashInferGroupedStrategySampler  # type: ignore
+            self._grouped_sampler_cls = cls_not_possibly_unbound
+        else:
+            self._grouped_sampler_cls = SimpleGroupedStrategySampler
+
         # Initialize seed for multi-GPU consistency
         self._global_seed = 42
         self._generator = None
@@ -1181,7 +1194,7 @@ def _sample_batched_by_strategy(
             requests,
             pin_memory=True,
             vocab_size=logits_cuda.size(1),
-            strategy_to_key=SimpleGroupedStrategySampler.strategy_grouping_key,
+            strategy_to_key=self._grouped_sampler_cls.strategy_grouping_key,
         )
         generator_cuda = self.get_generator(cuda_device)

@@ -1238,7 +1251,7 @@ def _sample_batched_by_strategy(
             for _ in range(steps)
         ]
         group_next_tokens_cuda, group_softmax_cuda = (
-            SimpleGroupedStrategySampler.sample_grouped_strategies(
+            self._grouped_sampler_cls.sample_grouped_strategies(
                 strategy_key,
                 group_strategies_per_step,
                 group_logits_cuda,
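
The grouping helper now forwards each request's speculation_needs_probs flag to the key callback, so a grouped sampler can fold "are probabilities needed?" into its grouping decision. Below is a minimal, self-contained sketch of the new two-argument key in use; the hard-coded strategies and needs_probs values are illustrative stand-ins for what _request_strategy and py_draft_logits would produce.

from collections import defaultdict

from tensorrt_llm._torch.pyexecutor.sampling_utils import (
    GREEDY,
    SimpleGroupedStrategySampler,
)

# Illustrative stand-ins for per-request sampling strategies and for
# "request has draft logits and is not greedy" (speculation_needs_probs).
strategies = [("top_p", 0.9, 1.0), ("top_p", 0.9, 1.0), GREEDY]
needs_probs = [False, True, False]

groups = defaultdict(list)
for req_index, (strategy, probs) in enumerate(zip(strategies, needs_probs)):
    # New contract: the key callback also receives the probs requirement.
    key = SimpleGroupedStrategySampler.strategy_grouping_key(strategy, probs)
    groups[(key, probs)].append(req_index)

# SimpleGroupedStrategySampler keys on the full strategy tuple, so the two
# top_p requests only split because one of them needs probabilities for
# speculative decoding; a FlashInfer-backed sampler may key differently.
print(dict(groups))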

tensorrt_llm/_torch/pyexecutor/sampling_utils.py

Lines changed: 8 additions & 8 deletions
@@ -33,13 +33,13 @@
 from typing_extensions import override


-TemperatureOnly = tuple[Literal["temperature"], float]
-TopK = tuple[Literal["top_k"], int, float]
-TopP = tuple[Literal["top_p"], float, float]
-TopKTopP = tuple[Literal["top_k_top_p"], int, float, float]
-Greedy = tuple[Literal["greedy"], None]
+TemperatureOnly: TypeAlias = tuple[Literal["temperature"], float]
+TopK: TypeAlias = tuple[Literal["top_k"], int, float]
+TopP: TypeAlias = tuple[Literal["top_p"], float, float]
+TopKTopP: TypeAlias = tuple[Literal["top_k_top_p"], int, float, float]
+Greedy: TypeAlias = tuple[Literal["greedy"], None]
 GREEDY: Greedy = ("greedy", None)
-Strategy = TopK | TopP | Greedy | TopKTopP | TemperatureOnly
+Strategy: TypeAlias = TopK | TopP | Greedy | TopKTopP | TemperatureOnly


 @dataclass(frozen=True, kw_only=True)
@@ -292,7 +292,7 @@ def sample(
 class GroupedStrategySampler(Generic[GenericStrategyKeyType], abc.ABC):
     @staticmethod
     @abc.abstractmethod
-    def strategy_grouping_key(strategy: Strategy) -> GenericStrategyKeyType:
+    def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> GenericStrategyKeyType:
         raise NotImplementedError

     @staticmethod
@@ -314,7 +314,7 @@ class SimpleGroupedStrategySampler(GroupedStrategySampler[Strategy]):

     @override
     @staticmethod
-    def strategy_grouping_key(strategy: Strategy) -> STRATEGY_KEY_TYPE:
+    def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> STRATEGY_KEY_TYPE:
         return strategy

     @override
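
The abstract strategy_grouping_key now also receives return_probs, so an implementation can fold the probability requirement into its key. The actual FlashInferGroupedStrategySampler in sampling_utils_flashinfer.py is not shown in this commit view; the subclass below is a purely hypothetical sketch of the new contract (CoarseGroupedStrategySampler and COARSE_KEY_TYPE are invented names), grouping only by strategy kind plus the probs requirement and omitting sample_grouped_strategies.

from typing_extensions import override

from tensorrt_llm._torch.pyexecutor.sampling_utils import (
    GroupedStrategySampler,
    Strategy,
)

# Hypothetical key type: (strategy kind, whether probabilities are needed).
COARSE_KEY_TYPE = tuple[str, bool]


class CoarseGroupedStrategySampler(GroupedStrategySampler[COARSE_KEY_TYPE]):
    """Illustrative sketch only; not the FlashInfer implementation."""

    @override
    @staticmethod
    def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> COARSE_KEY_TYPE:
        # strategy is a tuple such as ("top_p", 0.9, 1.0) or ("greedy", None);
        # its first element is the strategy kind.
        return strategy[0], return_probs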
