Commit 21cfc3d (parent: c61979d)

Add AutoDecodingConfig to apply the default spec_decoding heuristic with Ngram.

Signed-off-by: Simeng Liu <[email protected]>

File tree: 6 files changed, 67 additions & 33 deletions

examples/llm-api/quickstart_advanced.py

Lines changed: 11 additions & 10 deletions
@@ -1,10 +1,10 @@
 import argparse
 
 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (CudaGraphConfig, DraftTargetDecodingConfig,
-                                 EagleDecodingConfig, KvCacheConfig, MoeConfig,
-                                 MTPDecodingConfig, NGramDecodingConfig,
-                                 TorchCompileConfig)
+from tensorrt_llm.llmapi import (AutoDecodingConfig, CudaGraphConfig,
+                                 DraftTargetDecodingConfig, EagleDecodingConfig,
+                                 KvCacheConfig, MoeConfig, MTPDecodingConfig,
+                                 NGramDecodingConfig, TorchCompileConfig)
 
 example_prompts = [
     "Hello, my name is",
@@ -107,7 +107,11 @@ def add_llm_args(parser):
     parser.add_argument('--max_beam_width', type=int, default=1)
 
     # Speculative decoding
-    parser.add_argument('--spec_decode_algo', type=str, default=None)
+    parser.add_argument(
+        '--spec_decode_algo',
+        type=str,
+        default=None,
+        choices=['MTP', 'EAGLE3', 'DRAFT_TARGET', 'NGRAM', 'AUTO'])
     parser.add_argument('--spec_decode_max_draft_len', type=int, default=0)
     parser.add_argument('--draft_model_dir', type=str, default=None)
     parser.add_argument('--max_matching_ngram_size', type=int, default=0)
@@ -152,11 +156,6 @@ def setup_llm(args, **kwargs):
     spec_decode_algo = args.spec_decode_algo.upper(
     ) if args.spec_decode_algo is not None else None
 
-    # Update spec_decode_max_draft_len to 1 if unset by the user for non-NGRAM spec_decode_algo
-    # NGRAM spec_decode_algo will use default heuristic to set spec_decode_max_draft_len and max_matching_ngram_size
-    if spec_decode_algo != "NGRAM" and args.spec_decode_max_draft_len == 0:
-        args.spec_decode_max_draft_len = 1
-
     if spec_decode_algo == 'MTP':
         if not args.use_one_model:
             print(
@@ -186,6 +185,8 @@ def setup_llm(args, **kwargs):
             is_use_oldest=True,
             is_public_pool=True,
         )
+    elif spec_decode_algo == "AUTO":
+        spec_config = AutoDecodingConfig()
     else:
         spec_config = None

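Usage note: with this change, passing --spec_decode_algo AUTO to the quickstart script constructs an AutoDecodingConfig with no extra tuning flags. Below is a minimal sketch of the equivalent direct LLM API call, assuming only names this commit introduces (the model path is a placeholder):

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import AutoDecodingConfig

# AUTO takes no tuning knobs; the NGram heuristic is applied later,
# inside LLM._build_model (see tensorrt_llm/llmapi/llm.py below).
llm = LLM(model="/path/to/model",  # placeholder path
          speculative_config=AutoDecodingConfig())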
tensorrt_llm/_torch/speculative/interface.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ class SpeculativeDecodingMode(IntEnum):
     DRAFT_TARGET = auto()
     USER_PROVIDED = auto()
     NONE = auto()
+    AUTO = auto()
 
     def is_mtp(self):
         return self == SpeculativeDecodingMode.MTP or self == SpeculativeDecodingMode.MTP_EAGLE

tensorrt_llm/llmapi/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -8,11 +8,11 @@
                         CapacitySchedulerPolicy, ContextChunkingPolicy,
                         CudaGraphConfig, DraftTargetDecodingConfig,
                         DynamicBatchConfig, EagleDecodingConfig,
-                        ExtendedRuntimePerfKnobConfig, KvCacheConfig, LlmArgs,
-                        LookaheadDecodingConfig, MedusaDecodingConfig, MoeConfig,
-                        MTPDecodingConfig, NGramDecodingConfig, SchedulerConfig,
-                        TorchCompileConfig, TorchLlmArgs, TrtLlmArgs,
-                        UserProvidedDecodingConfig)
+                        ExtendedRuntimePerfKnobConfig, KvCacheConfig,
+                        LlmArgs, LookaheadDecodingConfig, MedusaDecodingConfig,
+                        MoeConfig, MTPDecodingConfig, NGramDecodingConfig,
+                        SchedulerConfig, TorchCompileConfig, TorchLlmArgs,
+                        TrtLlmArgs, UserProvidedDecodingConfig)
 from .llm_utils import (BuildConfig, KvCacheRetentionConfig, QuantAlgo,
                         QuantConfig)
 from .mpi_session import MpiCommSession

tensorrt_llm/llmapi/llm.py

Lines changed: 17 additions & 17 deletions
@@ -31,8 +31,8 @@
 from ..logger import logger
 from ..sampling_params import SamplingParams
 from .llm_args import (TORCH_LLMARGS_EXPLICIT_DOCSTRING,
-                       TRT_LLMARGS_EXPLICIT_DOCSTRING, PybindMirror,
-                       TorchLlmArgs, TrtLlmArgs)
+                       TRT_LLMARGS_EXPLICIT_DOCSTRING, NGramDecodingConfig,
+                       PybindMirror, TorchLlmArgs, TrtLlmArgs)
 from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
                         LlmBuildStats, ModelLoader, _ModelRuntimeContext)
 from .mpi_session import MpiPoolSession, external_mpi_comm_available
@@ -959,30 +959,30 @@ def _build_model(self):
 
         spec_config = self.args.speculative_config
         max_batch_size = self._executor_config.max_batch_size
-        # Apply heuristic to incomplete NGramDecodingConfig based on benchmark results
+        # Apply default heuristic to AutoDecodingConfig based on benchmark results
         # With concurrency <= 4, max_draft_len = 5, max_matching_ngram_size = 3
         # With concurrency <= 32, max_draft_len = 3, max_matching_ngram_size = 5
-        if spec_config.spec_dec_mode() == "NGRAM" and max_batch_size <= 32:
+        # With concurrency > 32, speculative decoding is disabled.
+        if spec_config is not None and spec_config.decoding_type == "AUTO" and max_batch_size <= 32:
             if not self.args.disable_overlap_scheduler:
                 logger.info(
-                    "Disable overlap scheduler to enable NGram speculative decoding."
+                    "Disable overlap scheduler to enable Auto speculative decoding with Ngram."
                 )
                 # From benchmark results, we found that NGram speculative decoding provides better performance than overlap scheduler with low concurrency <= 32.
                 # Therefore, we disable overlap scheduler to enable NGram speculative decoding.
                 self.args.disable_overlap_scheduler = True
 
-            if spec_config.max_draft_len != 0 and spec_config.max_matching_ngram_size != 0:
-                pass
-            else:
-                if max_batch_size <= 4:
-                    spec_config.max_draft_len = 5 if spec_config.max_draft_len == 0 else spec_config.max_draft_len
-                    spec_config.max_matching_ngram_size = 3 if spec_config.max_matching_ngram_size == 0 else spec_config.max_matching_ngram_size
-                elif max_batch_size <= 32:
-                    spec_config.max_draft_len = 3 if spec_config.max_draft_len == 0 else spec_config.max_draft_len
-                    spec_config.max_matching_ngram_size = 5 if spec_config.max_matching_ngram_size == 0 else spec_config.max_matching_ngram_size
-                logger.info(
-                    f"Apply heuristic to incomplete NGramDecodingConfig: max_draft_len={spec_config.max_draft_len}, max_matching_ngram_size={spec_config.max_matching_ngram_size}"
-                )
+            spec_config = NGramDecodingConfig(
+                max_draft_len=5 if max_batch_size <= 4 else 3,
+                max_matching_ngram_size=3 if max_batch_size <= 4 else 5,
+                is_keep_all=True,
+                is_use_oldest=True,
+                is_public_pool=True,
+            )
+
+            logger.info(
+                f"Apply heuristic to incomplete NGramDecodingConfig: max_draft_len={spec_config.max_draft_len}, max_matching_ngram_size={spec_config.max_matching_ngram_size}"
+            )
 
         update_executor_config(
             self._executor_config,

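For readability, the heuristic above reduces to a small pure function. The following is a restatement of the committed logic, not library API; the function name is illustrative:

def resolve_auto_ngram_params(max_batch_size: int):
    # AUTO only enables speculation when the executor's max_batch_size
    # (treated as the expected concurrency) is at most 32.
    if max_batch_size > 32:
        return None  # speculative decoding stays disabled
    if max_batch_size <= 4:
        return {"max_draft_len": 5, "max_matching_ngram_size": 3}
    return {"max_draft_len": 3, "max_matching_ngram_size": 5}

The same branch also sets disable_overlap_scheduler = True, since the benchmarks cited in the comments found NGram speculation outperforms the overlap scheduler at concurrency <= 32.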
tensorrt_llm/llmapi/llm_args.py

Lines changed: 30 additions & 1 deletion
@@ -262,6 +262,7 @@ def from_dict(cls, data: dict):
             "NGram": NGramDecodingConfig,
             "DraftTarget": DraftTargetDecodingConfig,
             "UserProvided": UserProvidedDecodingConfig,
+            "AUTO": AutoDecodingConfig,
         }
 
         config_class = config_classes.get(decoding_type)
@@ -458,6 +459,29 @@ def update_from_model_config(self, model_config):
             self.num_extra_kv_tokens = self.num_nextn_predict_layers - 1
 
 
+class AutoDecodingConfig(DecodingBaseConfig):
+    """
+    Configuration for auto speculative decoding.
+
+    This config is used to automatically select the best speculative decoding algorithm.
+
+    According to benchmark results, the best algorithm in general is NGRAM with low concurrency <= 32.
+    Default heuristic:
+    With concurrency <= 4, max_draft_len = 5, max_matching_ngram_size = 3
+    With concurrency <= 32, max_draft_len = 3, max_matching_ngram_size = 5
+    With concurrency > 32, speculative decoding is disabled.
+    """
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+    decoding_type: ClassVar[str] = "AUTO"
+
+    def supports_backend(self, backend: str) -> bool:
+        return backend == "pytorch"
+
+
 class PybindMirror(ABC):
     ''' A class containing the utilities for mirroring Python classes to
     pybinding classes.
@@ -761,6 +785,7 @@ def supports_backend(self, backend: str) -> bool:
     MTPDecodingConfig,
     NGramDecodingConfig,
     UserProvidedDecodingConfig,
+    AutoDecodingConfig,
 ]]
@@ -1178,7 +1203,6 @@ def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs":
             tensorrt_llm.llmapi.llm_utils.BaseLlmArgs: The `BaseLlmArgs` instance.
         """
         kwargs = BaseLlmArgs._check_consistency(dict(kwargs))
-
         ret = cls(**kwargs)
         return ret

@@ -1507,6 +1531,11 @@ def validate_speculative_config(self):
             self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.USER_PROVIDED
             self.build_config.max_draft_len = self.speculative_config.max_draft_len
 
+        elif isinstance(self.speculative_config, AutoDecodingConfig):
+            assert self.backend in ['pytorch', '_autodeploy']
+            self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.AUTO
+            self.build_config.max_draft_len = self.speculative_config.max_draft_len
+
         else:
             raise ValueError(
                 f"Unrecognized speculative config type {type(self.speculative_config)}"

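A quick sketch of the new config class in isolation, assuming the fields inherited from DecodingBaseConfig (such as max_draft_len) all carry defaults, which the validate_speculative_config branch above relies on:

from tensorrt_llm.llmapi.llm_args import AutoDecodingConfig

cfg = AutoDecodingConfig.from_dict({})  # this commit adds no required fields
assert cfg.decoding_type == "AUTO"      # class-level key used by the from_dict dispatch table
assert cfg.supports_backend("pytorch")  # per the diff, only "pytorch" returns True
assert not cfg.supports_backend("tensorrt")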
tensorrt_llm/models/modeling_utils.py

Lines changed: 3 additions & 0 deletions
@@ -98,6 +98,7 @@ class SpeculativeDecodingMode(IntFlag):
     EAGLE = auto()
     NGRAM = auto()
     USER_PROVIDED = auto()
+    AUTO = auto()
 
     @staticmethod
     def from_arguments(args: argparse.Namespace):
@@ -117,6 +118,8 @@ def from_arguments(args: argparse.Namespace):
             return SpeculativeDecodingMode.NGRAM
         elif args.speculative_decoding_mode == "user_provided":
             return SpeculativeDecodingMode.USER_PROVIDED
+        elif args.speculative_decoding_mode == "auto":
+            return SpeculativeDecodingMode.AUTO
         else:
             assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode

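And a matching sketch for the builder-side flag mapping (a hand-built argparse.Namespace stands in for real CLI parsing):

import argparse

from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode

ns = argparse.Namespace(speculative_decoding_mode="auto")
assert SpeculativeDecodingMode.from_arguments(ns) == SpeculativeDecodingMode.AUTO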