
Commit 1a42cf9

[feat] Improve 2-model perf
Signed-off-by: Mike Iovine <[email protected]>
Parent: 907bc22

6 files changed, 120 insertions(+), 24 deletions(-)

tensorrt_llm/_torch/models/modeling_speculative.py

Lines changed: 0 additions & 12 deletions
@@ -290,18 +290,6 @@ def load_weights_from_target_model(self,
         if self.load_lm_head_from_target:
             self.lm_head = target_model.lm_head
 
-    # TODO: should input/position IDs be included in this? Keeping it implicit
-    # for now since the shapes/dtypes are the same across all models we have.
-    def get_warmup_extra_inputs(self, batch_size: int,
-                                num_tokens: int) -> Dict[str, Any]:
-
-        hidden_states = torch.empty(batch_size * num_tokens,
-                                    self.model.hidden_size,
-                                    dtype=self.model.dtype,
-                                    device='cuda')
-
-        return {'hidden_states': hidden_states}
-
     def apply_eagle3_fc(self, hidden_states: torch.Tensor) -> torch.Tensor:
         """
         Hack for eagle3. We might need to run a matmul to reduce

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 13 additions & 6 deletions
@@ -388,7 +388,7 @@ def __init__(
             self.spec_metadata = None
             update_spec_config_from_model_config(self.spec_config,
                                                  self.model.config)
-            max_num_draft_tokens = self.spec_config.max_draft_len * batch_size
+            max_num_draft_tokens = self.spec_config.max_draft_len * batch_size if not self.is_draft_model else 0
             self.draft_tokens_cuda = torch.empty((max_num_draft_tokens, ),
                                                  dtype=torch.int,
                                                  device='cuda')
@@ -402,9 +402,11 @@ def __init__(
             self.previous_kv_lens_offsets_cuda = torch.zeros((batch_size, ),
                                                              dtype=torch.int,
                                                              device='cuda')
+            # TODO undo this hack
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
-            )
-            self.max_draft_len = spec_config.max_draft_len
+            ) or (self.is_draft_model
+                  and self.spec_config.spec_dec_mode.is_eagle3())
+            self.max_draft_len = spec_config.max_draft_len if not self.is_draft_model else 0
         else:
             self.without_logits = False
             self.max_draft_len = 0
@@ -466,7 +468,7 @@ def __init__(
 
     @property
     def runtime_draft_len(self):
-        return self.max_draft_len if self.enable_spec_decode else 0
+        return self.max_draft_len if self.enable_spec_decode and not self.is_draft_model else 0
 
     def set_lora_model_config(self,
                               lora_target_modules: list[str],
@@ -989,7 +991,7 @@ def _maybe_get_cuda_graph(
         if ExpertStatistic.set_iter(self.iter_counter):
             return None
 
-        draft_len = self.spec_config.max_draft_len if self.enable_spec_decode else 0
+        draft_len = self.spec_config.max_draft_len if self.enable_spec_decode and not self.is_draft_model else 0
         can_run_cuda_graph = batch.can_run_cuda_graph
         batch_size = len(batch.generation_requests)
         if self._run_cuda_graphs and self.enable_attention_dp and self.mapping.tp_size > 1:
@@ -1022,6 +1024,8 @@ def _maybe_get_cuda_graph(
         if self.enable_spec_decode:
             spec_metadata = self.spec_metadata.create_cuda_graph_metadata(
                 num_sequences_in_batch)
+            if self.is_draft_model:
+                spec_metadata.max_draft_len = 0
             spec_metadata.draft_tokens = self.draft_tokens_cuda
         else:
             spec_metadata = None
@@ -1153,6 +1157,9 @@ def init_meta_tensor(t: torch.Tensor):
             logger.info("moe_load_balancer finalize model done")
 
         torch.cuda.current_stream().synchronize()
+        if self.spec_config is not None and self.is_draft_model:
+            model = self.spec_config.get_draft_model_wrapper(model) or model
+
         return model
 
     def _call_load_weights(self, load_method, weights, weight_mapper):
@@ -1411,7 +1418,7 @@ def _prepare_tp_inputs(
                 past_seen_token_num = request.max_beam_num_tokens - 1
             draft_lens.append(num_draft_tokens)
 
-            if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx(
+            if self.enable_spec_decode and not self.is_draft_model and spec_config.spec_dec_mode.extend_ctx(
                     self.attn_backend):
                 # We're treating the prompt lengths as context requests here, so
                 # the the prompt lens should not include the cached tokens.
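
The `get_draft_model_wrapper(model) or model` line added to the model-load path above relies on the base decoding config returning None (see the llm_args.py change below), so draft models for other decoding modes are left unwrapped. Below is a minimal, self-contained sketch of that fallback idiom; the class and function names are illustrative stand-ins, not the real TensorRT-LLM types.

# Hedged sketch of the `get_draft_model_wrapper(model) or model` fallback.
# Illustrative names only, not the TensorRT-LLM API.

class NoWrapperConfig:
    def get_draft_model_wrapper(self, model):
        return None  # base behavior: this decoding mode does not wrap the draft model


class WrappingConfig:
    def get_draft_model_wrapper(self, model):
        # Stands in for `ChainDrafter(self.max_draft_len, model)`.
        return ("wrapped", model)


def finalize_draft_model(spec_config, model, is_draft_model):
    if spec_config is not None and is_draft_model:
        # `or model` keeps the unwrapped model when the config returns None.
        model = spec_config.get_draft_model_wrapper(model) or model
    return model


assert finalize_draft_model(NoWrapperConfig(), "draft_model", True) == "draft_model"
assert finalize_draft_model(WrappingConfig(), "draft_model", True) == ("wrapped", "draft_model")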

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 0 additions & 3 deletions
@@ -252,9 +252,6 @@ def create_py_executor(
         with mem_monitor.observe_creation_stage(
                 _ExecutorCreationStage.MODEL_ENGINE_DRAFT):
             draft_spec_config = copy.copy(spec_config)
-            # The draft model won't have any draft tokens attached to
-            # generation requests when we invoke it autoregressively
-            draft_spec_config.max_draft_len = 0
 
             draft_model_engine = PyTorchModelEngine(
                 model_path=spec_config.speculative_model_dir,

tensorrt_llm/_torch/speculative/eagle3.py

Lines changed: 84 additions & 1 deletion
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Any, Dict
 
 import torch
 from torch import nn
@@ -493,3 +493,86 @@ def prepare_1st_drafter_inputs(
         "attn_metadata": attn_metadata,
         "spec_metadata": spec_metadata,
     }
+
+
+class ChainDrafter(torch.nn.Module):
+
+    def __init__(self, max_draft_len: int, draft_model: torch.nn.Module):
+        super().__init__()
+        self.draft_model = draft_model
+        self.config = self.draft_model.config
+        self.model_config = self.draft_model.model_config
+        self.max_draft_len = max_draft_len
+
+    def forward(self, input_ids, position_ids, attn_metadata, spec_metadata,
+                **kwargs):
+        batch_size = attn_metadata.num_seqs
+
+        logits = self.draft_model.forward(input_ids=input_ids,
+                                          position_ids=position_ids,
+                                          attn_metadata=attn_metadata,
+                                          spec_metadata=spec_metadata)
+
+        new_draft_tokens = [self.sample(logits)]
+
+        if attn_metadata.is_cuda_graph:
+            seq_len = attn_metadata._seq_lens[:batch_size].clone()
+            seq_len_cuda = attn_metadata._seq_lens_cuda[:batch_size].clone()
+
+        last_tokens_idx = torch.cumsum(
+            attn_metadata.seq_lens_cuda, dim=0, dtype=torch.long) - 1
+        new_position_ids = position_ids[0, last_tokens_idx] + 1
+
+        attn_metadata._seq_lens[:batch_size].fill_(1)
+        attn_metadata._seq_lens_cuda[:batch_size].fill_(1)
+        attn_metadata.on_update()
+        attn_metadata.kv_lens_cuda[:batch_size] += 1
+
+        attn_metadata.host_request_types[:attn_metadata.num_contexts].fill_(1)
+        attn_metadata.num_contexts = 0
+
+        spec_metadata.eagle3_resource_manager.is_first_draft = False
+        spec_metadata.is_first_draft = False
+
+        old_write_indices = spec_metadata.hidden_states_write_indices
+
+        spec_metadata.hidden_states_read_indices[:batch_size].copy_(
+            old_write_indices[last_tokens_idx])
+        spec_metadata.hidden_states_write_indices[:batch_size].copy_(
+            torch.arange(
+                batch_size,
+                dtype=spec_metadata.hidden_states_write_indices.dtype,
+                device=spec_metadata.hidden_states_write_indices.device))
+        spec_metadata.num_tokens = batch_size
+
+        for i in range(self.max_draft_len - 1):
+            logits = self.draft_model.forward(input_ids=new_draft_tokens[-1],
+                                              position_ids=new_position_ids,
+                                              attn_metadata=attn_metadata,
+                                              spec_metadata=spec_metadata)
+            new_draft_tokens.append(self.sample(logits))
+            new_position_ids += 1
+            attn_metadata.kv_lens_cuda[:batch_size] += 1
+            if i == 0:
+                spec_metadata.hidden_states_read_indices[:batch_size].copy_(
+                    spec_metadata.hidden_states_write_indices[:batch_size])
+
+        spec_metadata.is_first_draft = True
+        spec_metadata.eagle3_resource_manager.is_first_draft = True
+
+        if attn_metadata.is_cuda_graph:
+            attn_metadata._seq_lens[:batch_size].copy_(seq_len[:batch_size])
+            attn_metadata._seq_lens_cuda[:batch_size].copy_(
+                seq_len_cuda[:batch_size])
+
+        return torch.stack(new_draft_tokens)
+
+    def sample(self, logits: torch.Tensor) -> torch.Tensor:
+        tokens = torch.argmax(logits, dim=-1)
+        d2t = self.draft_model.model.d2t.data
+
+        return tokens + d2t[tokens]
+
+    def load_weights_from_target_model(self,
+                                       target_model: torch.nn.Module) -> None:
+        self.draft_model.load_weights_from_target_model(target_model)
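
ChainDrafter runs the whole multi-token draft loop inside a single forward call: it greedily samples a token, feeds it back as the next input, and finally stacks the results into a [max_draft_len, num_tokens] tensor, translating draft-vocabulary ids to target-vocabulary ids via the d2t table. Below is a minimal sketch of that chained-drafting pattern with a toy model, omitting the attention and spec-metadata bookkeeping shown above; all names in the sketch are illustrative.

# Toy illustration of the chained greedy drafting loop; not the real Eagle3
# draft model or metadata handling.
import torch


class ToyDraftModel(torch.nn.Module):

    def __init__(self, vocab_size: int = 32, hidden_size: int = 16):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.head = torch.nn.Linear(hidden_size, vocab_size)
        # d2t holds per-token offsets from draft-vocab ids to target-vocab ids;
        # all zeros here, i.e. an identity mapping.
        self.register_buffer("d2t", torch.zeros(vocab_size, dtype=torch.long))

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.head(self.embed(input_ids))


def chain_draft(model: ToyDraftModel, last_accepted: torch.Tensor,
                max_draft_len: int) -> torch.Tensor:
    """Greedily draft max_draft_len tokens per sequence, feeding each drafted
    token back in as the next input. Returns [max_draft_len, batch_size]."""

    def sample(logits: torch.Tensor) -> torch.Tensor:
        tokens = torch.argmax(logits, dim=-1)
        return tokens + model.d2t[tokens]  # draft-vocab -> target-vocab ids

    draft_tokens = [sample(model(last_accepted))]
    for _ in range(max_draft_len - 1):
        draft_tokens.append(sample(model(draft_tokens[-1])))
    return torch.stack(draft_tokens)


tokens = chain_draft(ToyDraftModel(), torch.tensor([3, 7]), max_draft_len=4)
print(tokens.shape)  # torch.Size([4, 2])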

tensorrt_llm/_torch/speculative/model_drafter.py

Lines changed: 15 additions & 2 deletions
@@ -72,6 +72,8 @@ def __init__(
         self._request_draft_logits = sampler.enable_mixed_sampler
         self.guided_decoder = guided_decoder
 
+        self.use_static_draft_loop = True
+
     def _create_draft_request(self, request: LlmRequest,
                               input_tokens: Optional[List]) -> LlmRequest:
         """Create a draft request with common parameters."""
@@ -257,8 +259,8 @@ def _forward_draft_model(
             new_tensors_device=new_tensors_device)
 
         # Handle d2t data if available
-        if hasattr(self.draft_model_engine.model.model, 'd2t'):
-            outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
+        # if hasattr(self.draft_model_engine.model.model, 'd2t'):
+        #     outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
 
         return outputs
 
@@ -377,6 +379,17 @@ def prepare_draft_tokens(
 
             # Initial forward pass
             outputs = self._forward_draft_model(draft_batch, resource_manager)
+
+            if self.use_static_draft_loop:
+                outputs_host = outputs.cpu()
+                for token_idx in range(self.max_draft_tokens):
+                    for req_idx, req in enumerate(draft_batch.all_requests()):
+                        target_req = req_id_to_old_request[req.py_request_id]
+                        target_req.py_draft_tokens.append(
+                            outputs_host[token_idx][req_idx])
+
+                return
+
             self._execute_guided_decoder(draft_batch,
                                          outputs['logits'],
                                          d2t=outputs.get('d2t'))
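
With use_static_draft_loop, the draft engine's output is consumed directly as token ids rather than logits: the code above assumes the wrapped drafter returns a [max_draft_tokens, batch_size] tensor, copies it to the host once, and appends one token per request per draft step. Below is a small self-contained sketch of that scatter step using stand-in request objects; the names are illustrative, not the real LlmRequest API.

# Hedged sketch of scattering a [max_draft_tokens, batch_size] tensor of draft
# token ids back onto the target requests; FakeRequest stands in for LlmRequest.
import torch


class FakeRequest:

    def __init__(self, request_id: int):
        self.py_request_id = request_id
        self.py_draft_tokens = []


def scatter_draft_tokens(outputs: torch.Tensor, draft_requests,
                         req_id_to_old_request) -> None:
    outputs_host = outputs.cpu()  # single device-to-host copy
    max_draft_tokens = outputs_host.shape[0]
    for token_idx in range(max_draft_tokens):
        for req_idx, req in enumerate(draft_requests):
            target_req = req_id_to_old_request[req.py_request_id]
            target_req.py_draft_tokens.append(outputs_host[token_idx][req_idx])


targets = {0: FakeRequest(0), 1: FakeRequest(1)}
drafts = [FakeRequest(0), FakeRequest(1)]
scatter_draft_tokens(torch.arange(8).reshape(4, 2), drafts, targets)
print(targets[0].py_draft_tokens)  # [tensor(0), tensor(2), tensor(4), tensor(6)]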

tensorrt_llm/llmapi/llm_args.py

Lines changed: 8 additions & 0 deletions
@@ -400,6 +400,9 @@ def spec_dec_mode(self):
         return TorchSpeculativeDecodingMode.from_string(
             self.decoding_type.upper())
 
+    def get_draft_model_wrapper(self, model):
+        return None
+
 
 class MedusaDecodingConfig(DecodingBaseConfig):
     medusa_choices: Optional[List[List[int]]] = None
@@ -443,6 +446,11 @@ def spec_dec_mode(self):
             return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
         return TorchSpeculativeDecodingMode.EAGLE3
 
+    def get_draft_model_wrapper(self, model):
+        from tensorrt_llm._torch.speculative.eagle3 import ChainDrafter
+
+        return ChainDrafter(self.max_draft_len, model)
+
 
 class UserProvidedDecodingConfig(DecodingBaseConfig):
     # Cannot use real type annotations due to circular imports
