[feat] Improve 2-model perf

mikeiovine · mikeiovine · commit 4721451075b7 · 2025-08-25T12:58:42.000-07:00
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Generic, Optional, Tuple
+from typing import Dict, Generic, Optional, Tuple
 
 import torch
 from torch import nn
@@ -293,18 +293,6 @@ def load_weights_from_target_model(self,
         if self.load_lm_head_from_target:
             self.lm_head = target_model.lm_head
 
-    # TODO: should input/position IDs be included in this? Keeping it implicit
-    # for now since the shapes/dtypes are the same across all models we have.
-    def get_warmup_extra_inputs(self, batch_size: int,
-                                num_tokens: int) -> Dict[str, Any]:
-
-        hidden_states = torch.empty(batch_size * num_tokens,
-                                    self.model.hidden_size,
-                                    dtype=self.model.dtype,
-                                    device='cuda')
-
-        return {'hidden_states': hidden_states}
-
     def apply_eagle3_fc(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """
         Hack for eagle3. We might need to run a matmul to reduce
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -9,7 +9,7 @@
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple
 
 import torch
 import torch._dynamo.config
@@ -274,6 +274,8 @@ def __init__(
         spec_config: Optional["DecodingBaseConfig"] = None,
         lora_config: Optional[LoraConfig] = None,
         is_draft_model: bool = False,
+        drafting_loop_wrapper: Optional[Callable[[torch.nn.Module],
+                                                 torch.nn.Module]] = None,
     ):
         self.ub_buffers = None
         self.batch_size = batch_size
@@ -309,7 +311,8 @@ def __init__(
             max_num_tokens=max_num_tokens,
             moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens,
             moe_load_balancer=pytorch_backend_config.moe_load_balancer,
-            lora_config=lora_config)
+            lora_config=lora_config,
+            drafting_loop_wrapper=drafting_loop_wrapper)
         # In case that some tests use stub models and override `_load_model`.
         if not hasattr(self.model, 'extra_attrs'):
             self.model.extra_attrs = {}
@@ -402,7 +405,7 @@ def __init__(
                                                              dtype=torch.int,
                                                              device='cuda')
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
-            )
+            ) or self.model_is_wrapped
             self.max_draft_len = spec_config.max_draft_len
         else:
             self.without_logits = False
@@ -902,6 +905,8 @@ def _load_model(self,
                     moe_max_num_tokens: Optional[int] = None,
                     moe_load_balancer: Optional[MoeLoadBalancerConfig] = None,
                     lora_config: Optional[LoraConfig] = None,
+                    drafting_loop_wrapper: Optional[Callable[
+                        [torch.nn.Module], torch.nn.Module]] = None,
                     **kwargs) -> DecoderModelForCausalLM:
         config = checkpoint_loader.load_config(
             checkpoint_dir,
@@ -1005,6 +1010,13 @@ def init_meta_tensor(t: torch.Tensor):
                 logger.info("moe_load_balancer finalize model done")
 
             torch.cuda.current_stream().synchronize()
+
+        if drafting_loop_wrapper is not None:
+            model = drafting_loop_wrapper(model)
+            self.model_is_wrapped = True
+        else:
+            self.model_is_wrapped = False
+
         return model
 
     def _call_load_weights(self, load_method, weights, weight_mapper):
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -252,13 +252,28 @@ def create_py_executor(
         with mem_monitor.observe_creation_stage(
                 _ExecutorCreationStage.MODEL_ENGINE_DRAFT):
             draft_spec_config = copy.copy(spec_config)
-            draft_pytorch_backend_config = copy.copy(pytorch_backend_config)
-            if spec_config.load_format == "dummy":
-                draft_pytorch_backend_config.load_format = LoadFormat.DUMMY
             # The draft model won't have any draft tokens attached to
             # generation requests when we invoke it autoregressively
             draft_spec_config.max_draft_len = 0
 
+            use_chain_drafter = (
+                executor_config.guided_decoding_config is None
+                and not pytorch_backend_config.enable_mixed_sampler)
+
+            if use_chain_drafter:
+
+                def drafting_loop_wrapper(model):
+                    from tensorrt_llm._torch.speculative.drafting_loops import \
+                        ChainDrafter
+
+                    return ChainDrafter(spec_config.max_draft_len, model)
+            else:
+                drafting_loop_wrapper = None
+
+            draft_pytorch_backend_config = copy.copy(pytorch_backend_config)
+            if spec_config.load_format == "dummy":
+                draft_pytorch_backend_config.load_format = LoadFormat.DUMMY
+
             draft_model_engine = PyTorchModelEngine(
                 model_path=spec_config.speculative_model_dir,
                 pytorch_backend_config=draft_pytorch_backend_config,
@@ -274,6 +289,7 @@ def create_py_executor(
                 spec_config=draft_spec_config,
                 checkpoint_loader=executor_config.checkpoint_loader,
                 is_draft_model=True,
+                drafting_loop_wrapper=drafting_loop_wrapper,
             )
             draft_model_engine.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER
             draft_model_engine.load_weights_from_target_model(
diff --git a/tensorrt_llm/_torch/speculative/drafting_loops.py b/tensorrt_llm/_torch/speculative/drafting_loops.py
@@ -0,0 +1,169 @@
+"""
+This module contains capturable drafting loops for speculative decoding.
+
+These are torch modules that wrap another draft model. The wrapper module
+is supposed to invoke the draft model autoregressively and invoke
+a sampling algorithm to obtain draft tokens. By structuring the code
+like this, we are able to avoid host overhead: the entire drafting process
+for speculation can be launched as a single CUDA graph.
+"""
+
+from contextlib import contextmanager
+from typing import Iterator
+
+import torch
+
+from tensorrt_llm._torch.attention_backend.interface import AttentionMetadata
+from tensorrt_llm._torch.speculative.eagle3 import Eagle3SpecMetadata
+from tensorrt_llm._torch.speculative.interface import SpecMetadata
+
+
+@contextmanager
+def save_metadata_state(attn_metadata: AttentionMetadata,
+                        spec_metadata: SpecMetadata) -> Iterator[None]:
+    """
+    Snapshot the per-sequence lengths on entry and restore them on exit.
+
+    The snapshot/restore of `_seq_lens` and `_seq_lens_cuda` only happens when
+    `attn_metadata.is_cuda_graph` is set. `spec_metadata.reset()` is called on
+    exit in every case, including when the body raises.
+    """
+    batch_size = attn_metadata.num_seqs
+    if attn_metadata.is_cuda_graph:
+        seq_len = attn_metadata._seq_lens[:batch_size].clone()
+        seq_len_cuda = attn_metadata._seq_lens_cuda[:batch_size].clone()
+
+    try:
+        yield
+    finally:
+        if attn_metadata.is_cuda_graph:
+            attn_metadata._seq_lens[:batch_size].copy_(seq_len[:batch_size])
+            attn_metadata._seq_lens_cuda[:batch_size].copy_(
+                seq_len_cuda[:batch_size])
+
+        spec_metadata.reset()
+
+
+def prepare_for_generation(attn_metadata: AttentionMetadata,
+                           spec_metadata: SpecMetadata,
+                           last_tokens_idx: torch.Tensor) -> None:
+    """
+    Mutate the metadata in place so the next forward pass handles exactly one
+    token per sequence.
+
+    Args:
+        attn_metadata: Attention metadata. Its sequence lengths are set to 1,
+            `kv_lens_cuda` is advanced by one and `num_contexts` is zeroed.
+        spec_metadata: Speculative decoding metadata. `num_tokens` is set to
+            the batch size; Eagle3 metadata also gets its hidden state
+            read/write indices rewritten.
+        last_tokens_idx: Index of each sequence's last token, as computed by
+            the caller from the cumulative sequence lengths.
+    """
+    batch_size = attn_metadata.num_seqs
+    attn_metadata._seq_lens[:batch_size].fill_(1)
+    attn_metadata._seq_lens_cuda[:batch_size].fill_(1)
+    attn_metadata.on_update()
+    attn_metadata.kv_lens_cuda[:batch_size] += 1
+
+    # NOTE(review): 1 presumably marks a generation request - confirm against
+    # the attention backend.
+    attn_metadata.host_request_types[:attn_metadata.num_contexts].fill_(1)
+    attn_metadata.num_contexts = 0
+
+    spec_metadata.num_tokens = batch_size
+
+    if isinstance(spec_metadata, Eagle3SpecMetadata):
+        spec_metadata.eagle3_resource_manager.is_first_draft = False
+        spec_metadata.is_first_draft = False
+
+        old_write_indices = spec_metadata.hidden_states_write_indices
+
+        # The read indices must be filled before the write indices are
+        # overwritten: old_write_indices aliases the same tensor.
+        spec_metadata.hidden_states_read_indices[:batch_size].copy_(
+            old_write_indices[last_tokens_idx])
+        spec_metadata.hidden_states_write_indices[:batch_size].copy_(
+            torch.arange(
+                batch_size,
+                dtype=spec_metadata.hidden_states_write_indices.dtype,
+                device=spec_metadata.hidden_states_write_indices.device))
+
+
+class ChainDrafter(torch.nn.Module):
+    """
+    Drafting loop that calls the wrapped draft model autoregressively and
+    picks each draft token with argmax.
+    """
+
+    def __init__(self, max_draft_len: int, draft_model: torch.nn.Module):
+        super().__init__()
+        self.draft_model = draft_model
+        # Re-export the wrapped model's configs on the wrapper.
+        self.config = self.draft_model.config
+        self.model_config = self.draft_model.model_config
+        self.max_draft_len = max_draft_len
+
+    def forward(self, input_ids: torch.Tensor, position_ids: torch.Tensor,
+                attn_metadata: AttentionMetadata,
+                spec_metadata: SpecMetadata, **kwargs) -> torch.Tensor:
+        """
+        Run the draft model on the given inputs, then `max_draft_len - 1`
+        more times on its own samples.
+
+        Extra keyword arguments are accepted but not forwarded to the draft
+        model.
+
+        Returns:
+            The tokens sampled at every step, stacked along a new leading
+            dimension.
+        """
+        logits = self.draft_model.forward(input_ids=input_ids,
+                                          position_ids=position_ids,
+                                          attn_metadata=attn_metadata,
+                                          spec_metadata=spec_metadata)
+
+        new_draft_tokens = [self.sample(logits)]
+
+        with save_metadata_state(attn_metadata, spec_metadata):
+            batch_size = attn_metadata.num_seqs
+            last_tokens_idx = torch.cumsum(
+                attn_metadata.seq_lens_cuda, dim=0, dtype=torch.long) - 1
+            new_position_ids = position_ids[0, last_tokens_idx] + 1
+
+            prepare_for_generation(attn_metadata, spec_metadata,
+                                   last_tokens_idx)
+
+            for i in range(self.max_draft_len - 1):
+                logits = self.draft_model.forward(
+                    input_ids=new_draft_tokens[-1],
+                    position_ids=new_position_ids,
+                    attn_metadata=attn_metadata,
+                    spec_metadata=spec_metadata)
+                new_draft_tokens.append(self.sample(logits))
+                new_position_ids += 1
+                attn_metadata.kv_lens_cuda[:batch_size] += 1
+                # Only Eagle3 metadata has hidden state indices; same guard
+                # as in prepare_for_generation.
+                if i == 0 and isinstance(spec_metadata, Eagle3SpecMetadata):
+                    spec_metadata.hidden_states_read_indices[:batch_size].copy_(
+                        spec_metadata.hidden_states_write_indices[:batch_size])
+
+        return torch.stack(new_draft_tokens)
+
+    def sample(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Return the argmax token of `logits` over the last dimension, shifted
+        by the draft model's `d2t` table when it has one.
+        """
+        tokens = torch.argmax(logits, dim=-1)
+        if hasattr(self.draft_model.model, "d2t"):
+            d2t = self.draft_model.model.d2t.data
+            return tokens + d2t[tokens]
+
+        return tokens
+
+    def load_weights_from_target_model(self,
+                                       target_model: torch.nn.Module) -> None:
+        """Delegate to the wrapped draft model."""
+        self.draft_model.load_weights_from_target_model(target_model)
diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py
@@ -185,6 +185,10 @@ def get_hidden_states(self):
             hidden_states = hidden_states[:, :self.hidden_size]
         return hidden_states
 
+    def reset(self) -> None:  # re-arm the first-draft flag on both holders
+        self.is_first_draft = True
+        self.eagle3_resource_manager.is_first_draft = True
+
 
 @dataclass
 class Eagle3OneModelSpecMetadata(SpecMetadata):
diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py
@@ -209,3 +209,9 @@ def all_rank_num_tokens(self, value: Optional[List[int]]):
         value = value if value is not SpecMetadata.all_rank_num_tokens else None
         self._all_rank_num_tokens = value
         self.all_rank_max_num_tokens = max(value) if value is not None else None
+
+    def reset(self) -> None:
+        """
+        Currently used by 2-model static drafting loops only. Used to reset any spec metadata
+        to its original state after the drafting loop exits. Does nothing by default.
+        """
diff --git a/tensorrt_llm/_torch/speculative/model_drafter.py b/tensorrt_llm/_torch/speculative/model_drafter.py
@@ -72,6 +72,12 @@ def __init__(
             self._request_draft_logits = sampler.enable_mixed_sampler
         self.guided_decoder = guided_decoder
 
+        self.use_static_draft_loop = draft_model_engine.model_is_wrapped
+        if self.use_static_draft_loop:
+            # TODO: enable sampling/guided decoding on static draft loop
+            assert guided_decoder is None
+            assert not sampler.enable_mixed_sampler
+
     def _create_draft_request(self, request: LlmRequest,
                               input_tokens: Optional[List]) -> LlmRequest:
         """Create a draft request with common parameters."""
@@ -237,6 +243,8 @@ def _should_disable_cuda_graph(
         """Check if CUDA graph should be disabled for the current forward pass."""
         if previous_batch is not None:
             return False
+        if self.use_static_draft_loop:
+            return False
         return self.spec_config.spec_dec_mode.needs_kv_cache_recompute()
 
     def _forward_draft_model(
@@ -256,8 +264,10 @@ def _forward_draft_model(
                 resource_manager,
                 new_tensors_device=new_tensors_device)
 
-        # Handle d2t data if available
-        if hasattr(self.draft_model_engine.model.model, 'd2t'):
+        # Handle d2t data if available. Static drafting loops should incorporate d2t
+        # in their implementations.
+        if not self.use_static_draft_loop and hasattr(
+                self.draft_model_engine.model.model, 'd2t'):
             outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
 
         return outputs
@@ -377,6 +387,17 @@ def prepare_draft_tokens(
 
             # Initial forward pass
             outputs = self._forward_draft_model(draft_batch, resource_manager)
+
+            if self.use_static_draft_loop:
+                outputs_host = outputs.cpu()
+                for token_idx in range(self.max_draft_tokens):
+                    for req_idx, req in enumerate(draft_batch.all_requests()):
+                        target_req = req_id_to_old_request[req.py_request_id]
+                        target_req.py_draft_tokens.append(
+                            outputs_host[token_idx][req_idx])
+
+                return
+
             self._execute_guided_decoder(draft_batch,
                                          outputs['logits'],
                                          d2t=outputs.get('d2t'))