Commit 34ffebd

wip for vlm
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent ae89163 commit 34ffebd

24 files changed: +1014 −560 lines changed

examples/auto_deploy/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -2,3 +2,5 @@
 !.vscode
 benchmark_results.json
 *.png
+# ignore config files that users might put here for debugging
+*.yaml

examples/auto_deploy/build_and_run_ad.py

Lines changed: 42 additions & 7 deletions
@@ -26,6 +26,9 @@
 # Global torch config, set the torch compile cache to fix up to llama 405B
 torch._dynamo.config.cache_size_limit = 20
 
+# simple string, TRT-LLM style text-only prompt or full-scale HF message template
+PromptInput = Union[str, Dict, List[Dict]]
+
 
 class PromptConfig(BaseModel):
     """Prompt configuration.
@@ -35,13 +38,27 @@ class PromptConfig(BaseModel):
     """
 
     batch_size: int = Field(default=2, description="Number of queries")
-    queries: Union[str, List[str]] = Field(
+    queries: Union[PromptInput, List[PromptInput]] = Field(
         default_factory=lambda: [
+            # OPTION 1: simple text prompt
            "How big is the universe? ",
-            "In simple words and in a single sentence, explain the concept of gravity: ",
-            "How to fix slicing in golf? ",
-            "Where is the capital of Iceland? ",
-        ]
+            # OPTION 2: wrapped text prompt for TRT-LLM
+            {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},
+            # OPTION 3: a full-scale HF message template (this one works for text-only models!)
+            # Learn more about chat templates: https://huggingface.co/docs/transformers/en/chat_templating
+            # and multi-modal templates: https://huggingface.co/docs/transformers/en/chat_templating_multimodal
+            [
+                {
+                    "role": "user",
+                    "content": "How to fix slicing in golf?",
+                }
+            ],
+            # More prompts...
+            {"prompt": "Where is the capital of Iceland? "},
+        ],
+        description="Example queries to prompt the model with. We support both TRT-LLM text-only "
+        "queries via the 'prompt' key and full-scale HF message template called via "
+        "apply_chat_template.",
     )
     sp_kwargs: Dict[str, Any] = Field(
         default_factory=lambda: {"max_tokens": 100, "top_k": 200, "temperature": 1.0},
@@ -55,10 +72,28 @@ def model_post_init(self, __context: Any):
         NOTE (lucaslie): has to be done with model_post_init to ensure it's always run. field
         validators are only run if a value is provided.
         """
-        queries = [self.queries] if isinstance(self.queries, str) else self.queries
+        queries = self.queries if isinstance(self.queries, list) else [self.queries]
         batch_size = self.batch_size
         queries = queries * (batch_size // len(queries) + 1)
-        self.queries = queries[:batch_size]
+        queries = queries[:batch_size]
+
+        # now let's standardize the queries for the LLM api to understand them
+        queries_processed = []
+        for query in queries:
+            if isinstance(query, str):
+                queries_processed.append({"prompt": query})
+            elif isinstance(query, dict):
+                queries_processed.append(query)
+            elif isinstance(query, list):
+                queries_processed.append(
+                    {
+                        "prompt": "Fake prompt. Check out messages field for the HF chat template.",
+                        "messages": query,  # contains the actual HF chat template
+                    }
+                )
+            else:
+                raise ValueError(f"Invalid query type: {type(query)}")
+        self.queries = queries_processed
 
     @field_validator("sp_kwargs", mode="after")
     @classmethod
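The three prompt styles accepted by queries above all normalize to the same dict shape in model_post_init. A minimal sketch of that equivalence, using only the default values shown in the diff:

queries = [
    "How big is the universe? ",  # OPTION 1: plain string
    {"prompt": "In simple words and a single sentence, explain the concept of gravity: "},  # OPTION 2
    [{"role": "user", "content": "How to fix slicing in golf?"}],  # OPTION 3: HF chat messages
]
# After model_post_init, every entry is a dict the LLM API understands:
#   OPTION 1 -> {"prompt": "How big is the universe? "}
#   OPTION 2 -> passed through unchanged
#   OPTION 3 -> {"prompt": "Fake prompt. Check out messages field for the HF chat template.",
#                "messages": [{"role": "user", "content": "How to fix slicing in golf?"}]}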

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 401 additions & 317 deletions
Large diffs are not rendered by default.

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py

Lines changed: 3 additions & 1 deletion
@@ -63,6 +63,7 @@ def scaled_dot_product_attention(
     dropout_p: float = 0.0,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    enable_gqa: bool = False,
 ) -> torch.Tensor:
     """A carbon copy of torch.nn.functional.scaled_dot_product_attention as custom op.
 
@@ -78,12 +79,13 @@ def scaled_dot_product_attention(
         dropout_p=dropout_p,
         is_causal=is_causal,
         scale=scale,
+        enable_gqa=enable_gqa,
     )
 
 
 @scaled_dot_product_attention.register_fake
 def scaled_dot_product_attention_fake(
-    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
+    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, enable_gqa=False
 ):
     """Fake implementation of scaled_dot_product_attention."""
     return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()
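For context, enable_gqa is the flag torch.nn.functional.scaled_dot_product_attention uses to accept grouped-query attention inputs, where key/value carry fewer heads than query. A minimal shape sketch, assuming a PyTorch build that exposes enable_gqa (2.5 or newer):

import torch
import torch.nn.functional as F

# 8 query heads share 2 key/value heads; n_q_heads must be divisible by n_kv_heads.
q = torch.randn(1, 8, 16, 64)  # (batch, n_q_heads, seq_len, head_dim)
k = torch.randn(1, 2, 16, 64)  # (batch, n_kv_heads, seq_len, head_dim)
v = torch.randn(1, 2, 16, 64)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)
print(out.shape)  # torch.Size([1, 8, 16, 64])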

tensorrt_llm/_torch/auto_deploy/export/export.py

Lines changed: 2 additions & 13 deletions
@@ -18,7 +18,7 @@
 )
 from ..utils.logger import ad_logger
 from ..utils.node_utils import is_op
-from .interface import ExportPatchRegistry, apply_export_patches
+from .interface import apply_export_patches
 
 try:
     from modelopt.torch.quantization.utils import export_torch_mode as torch_export_context
@@ -229,20 +229,9 @@ def torch_export_to_gm(
         patch_list: Optional list of patch names to apply with default settings.
             Cannot be used together with patch_configs.
     """
-    # Validate that both patch_configs and patch_list are not provided simultaneously
-    if patch_configs is not None and patch_list is not None:
-        raise ValueError("Cannot specify both patch_configs and patch_list. Use only one.")
-
-    # Handle patch configuration
-    if patch_list is not None:
-        # Convert patch_list to patch_configs format
-        patch_configs = {patch_name: {} for patch_name in patch_list}
-    elif patch_configs is None:
-        # Default patch configurations - apply all registered patches with default settings
-        patch_configs = {patch_name: {} for patch_name in ExportPatchRegistry.list_patches()}
 
     # run export with patches and lifted to meta
-    with apply_export_patches(patch_configs), lift_to_meta(model) as state_dict:
+    with apply_export_patches(patch_configs, patch_list), lift_to_meta(model) as state_dict:
         # clean up args, kwargs and move to correct device
         args, kwargs = tree_to((args, kwargs or {}), device="meta")
 

tensorrt_llm/_torch/auto_deploy/export/interface.py

Lines changed: 19 additions & 8 deletions
@@ -5,7 +5,7 @@
 
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Callable, Dict, List, Type, Union, final
+from typing import Any, Callable, Dict, List, Optional, Type, Union, final
 
 from pydantic import BaseModel, Field
 
@@ -183,6 +183,8 @@ def inner(patch_cls: Type[BaseExportPatch]) -> Type[BaseExportPatch]:
     @classmethod
     def get(cls, name: str) -> Type[BaseExportPatch]:
         """Get a patch class by name."""
+        if not cls.has(name):
+            raise ValueError(f"Unknown patch: {name}")
         return cls._registry[name]
 
     @classmethod
@@ -212,20 +214,29 @@ def list_patches(cls) -> List[str]:
 
 
 @contextmanager
-def apply_export_patches(patch_configs: Dict[str, Union[ExportPatchConfig, Dict[str, Any]]]):
+def apply_export_patches(
+    patch_configs: Optional[Dict[str, Union[ExportPatchConfig, Dict[str, Any]]]] = None,
+    patch_list: Optional[List[str]] = None,
+):
     """Context manager to apply multiple patches.
 
     Args:
         patch_configs: Dict mapping patch names to their configurations.
     """
-    patches = []
+    # Validate that both patch_configs and patch_list are not provided simultaneously
+    if patch_configs is not None and patch_list is not None:
+        raise ValueError("Cannot specify both patch_configs and patch_list. Use only one.")
+
+    # Handle patch configuration
+    if patch_list is not None:
+        # Convert patch_list to patch_configs format
+        patch_configs = {patch_name: {} for patch_name in patch_list}
+    elif patch_configs is None:
+        # Default patch configurations - apply all registered patches with default settings
+        patch_configs = {patch_name: {} for patch_name in ExportPatchRegistry.list_patches()}
 
     # Create patch instances
-    for name, config in patch_configs.items():
-        if not ExportPatchRegistry.has(name):
-            raise ValueError(f"Unknown patch: {name}")
-        patch = ExportPatchRegistry.create_patch(name, config)
-        patches.append(patch)
+    patches = [ExportPatchRegistry.create_patch(k, conf) for k, conf in patch_configs.items()]
 
     # Apply patches using nested context managers
     if not patches:
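As a usage sketch of the refactored entry point, callers can now hand apply_export_patches either a config dict or just a list of patch names (the names below are placeholders, not actual registered patches):

# Apply selected patches by name with default settings:
with apply_export_patches(patch_list=["patch_a", "patch_b"]):
    ...  # run export/tracing under the patches

# Or pass explicit configurations:
with apply_export_patches(patch_configs={"patch_a": {}, "patch_b": {}}):
    ...

# Passing both arguments raises ValueError; passing neither applies every patch
# registered in ExportPatchRegistry with default settings.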

tensorrt_llm/_torch/auto_deploy/llm.py

Lines changed: 87 additions & 7 deletions
@@ -1,19 +1,92 @@
 import types
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from ...executor.result import CompletionOutput
-from ...inputs.registry import create_input_processor
+from ...inputs.registry import DefaultInputProcessor, ExtraProcessedInputs
 from ...llmapi.llm import RequestOutput, _TorchLLM
-from ...llmapi.tokenizer import TokenizerBase, tokenizer_factory
+from ...llmapi.tokenizer import TokenizerBase, TransformersTokenizer, tokenizer_factory
+from ...sampling_params import SamplingParams
 from .distributed import common as dist_ad
 from .llm_args import LlmArgs
+from .models.factory import ModelFactory
 from .shim.demollm import DemoGenerationExecutor
 
 
+class ADInputProcessor(DefaultInputProcessor):
+    """Input processor for AutoDeploy backend.
+
+    This is a wrapper to either support standard TRT-LLM text-only input processing or use HF's
+    message chat template system to process multimodal inputs.
+    """
+
+    def __init__(self, tokenizer: Optional[TokenizerBase], processor: Optional[Any] = None):
+        super().__init__(None, None, tokenizer)
+        # NOTE: HF's tokenizer/processor that has the apply_chat_template method
+        self.processor = processor or getattr(tokenizer, "tokenizer", None)
+
+    def __call__(
+        self, inputs: Dict[str, Any], sampling_params: SamplingParams
+    ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
+        if self.processor is None:
+            raise ValueError("processor is required to tokenize inputs")
+
+        # construct kwargs to reflect DefaultInputProcessor
+        kwargs = {
+            "add_special_tokens": sampling_params.add_special_tokens,
+        }
+        if sampling_params.truncate_prompt_tokens is not None:
+            kwargs = {
+                "truncation": True,
+                "max_length": sampling_params.truncate_prompt_tokens,
+            }
+        # check for messages field and if yes, use the apply_chat_template method
+        if "messages" in inputs:
+            # TODO: we don't really need this but it makes for a good sanity check. Consider
+            # removing this in the future if we need to speed things up.
+            prompt = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            inputs["prompt"] = prompt
+
+            all_args = self.processor.apply_chat_template(
+                inputs["messages"],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+                padding=False,  # there shouldn't be a need for padding ever...
+                return_attention_mask=False,
+                **kwargs,
+            )
+            # TODO: is there a more reliable way to avoid the attention_mask here?
+            all_args.pop("attention_mask", None)
+
+            # TODO: can we avoid the extra tolist() here eventually?
+            token_ids = all_args.pop("input_ids")
+            assert token_ids.shape[0] == 1, "messages should be unbatched at this point."
+            if all_args:
+                extra_processed_inputs = {"multimodal_data": all_args}
+            else:
+                extra_processed_inputs = None
+            return token_ids[0].tolist(), extra_processed_inputs
+        else:
+            token_ids = self.tokenizer.encode(inputs["prompt"], **kwargs)
+            return token_ids, None
+
+
 class LLM(_TorchLLM):
     """LLM class is the main class for running an LLM model using AutoDeploy backend."""
 
     args: LlmArgs
+    _factory: ModelFactory
+
+    @property
+    def factory(self) -> ModelFactory:
+        if not getattr(self, "_factory", None):
+            self._factory = self.args.create_factory()
+        return self._factory
 
     def __init__(self, *args, **kwargs):
         kwargs["backend"] = "_autodeploy"
@@ -23,16 +96,18 @@ def _try_load_tokenizer(self) -> Optional[TokenizerBase]:
         if self.args.skip_tokenizer_init:
             return None
 
-        factory = self.args.create_factory()
-        return tokenizer_factory(factory.init_tokenizer())
+        return tokenizer_factory(self.factory.init_tokenizer())
 
     def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
         """We don't need to validate args for AutoDeploy backend for now."""
         pass
 
+    def _create_input_processor(self) -> ADInputProcessor:
+        return ADInputProcessor(self.tokenizer, self.factory.init_processor())
+
     def _prefetch_model(self):
         """Prefetch the model for the LLM."""
-        self.args.create_factory().prefetch_checkpoint()
+        self.factory.prefetch_checkpoint()
 
     def _build_model(self):
         """Build the model for the LLM.
@@ -47,6 +122,11 @@ def _build_model(self):
         # _autodeploy backend.
         super()._build_model()
 
+        # now correct input processor
+        assert isinstance(self.input_processor, DefaultInputProcessor)
+        assert self.tokenizer is None or isinstance(self.tokenizer, TransformersTokenizer)
+        self.input_processor = self._create_input_processor()
+
 
 class DemoLLM(LLM):
     """A simple LLM class to demo the LLM interface while debugging the e2e workflow.
@@ -63,7 +143,7 @@ def __init__(self, **kwargs):
         # prefetch model and load tokenizer
         self._prefetch_model()
         self._tokenizer = self._try_load_tokenizer()
-        self.input_processor = create_input_processor(None, self.tokenizer)
+        self.input_processor = self._create_input_processor()
 
         # construct demo executor + engine
         self._executor = DemoGenerationExecutor(
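For background on the messages path in ADInputProcessor.__call__: tokenization is delegated to HF's apply_chat_template, and whatever the processor returns besides input_ids (e.g. pixel_values for images) is forwarded as multimodal_data. A rough standalone sketch of that call, assuming a recent transformers release and an illustrative VLM checkpoint:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")  # illustrative model
messages = [{"role": "user", "content": [{"type": "text", "text": "How to fix slicing in golf?"}]}]

batch = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
token_ids = batch.pop("input_ids")[0].tolist()  # prompt token ids for the request
extra = {"multimodal_data": batch} if len(batch) > 0 else None  # e.g. pixel_values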

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):
         description="The path to the model checkpoint or the model name from the Hugging Face Hub."
     )
 
-    model_factory: Literal["AutoModelForCausalLM", "AutoModelForImageTextToText"] = Field(
+    model_factory: str = Field(
         default="AutoModelForCausalLM",
         description="The model factory to use for loading the model.",
     )
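Relaxing model_factory from a fixed Literal to a plain str lets additional factories (such as a VLM-oriented one) be selected through user config without touching this type annotation. A hedged sketch, assuming the checkpoint field preceding it is named model and with illustrative values:

config = AutoDeployConfig(
    model="Qwen/Qwen2.5-VL-3B-Instruct",          # illustrative HF checkpoint
    model_factory="AutoModelForImageTextToText",  # any factory name known to AutoDeploy
)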
