
Commit dcbfa7e

[https://nvbugs/5252313][fix] Fix torch compile + MTP (#6554)
Signed-off-by: Jin Li <[email protected]>
1 parent 61da2da commit dcbfa7e

File tree

5 files changed: +40 -53 lines changed


tensorrt_llm/_torch/compilation/piecewise_optimizer.py

Lines changed: 4 additions & 6 deletions
@@ -10,7 +10,6 @@
 from torch.fx.passes.split_module import split_module
 
 from tensorrt_llm.llmapi.utils import enable_llm_debug
-from tensorrt_llm.logger import logger
 
 from ..utils import (get_model_extra_attrs, get_piecewise_cuda_graph_flag,
                      make_weak_ref)
@@ -169,14 +168,11 @@ def __call__(self, *args):
         if entry.cuda_graph is None:
 
             if not get_enable_piecewise_cuda_graph_capture_flag():
-                logger.warning(
-                    f"Unexpectedly capture cuda graph for {self.name} with runtime_num_of_token {runtime_num_of_token}. Will fallback to non-CUDA graph execution."
-                )
                 return entry.callable(*args)
 
-            if entry.warmup_count < 2:
+            if entry.warmup_count < 3:
                 entry.warmup_count += 1
-                return self.default_callable(*args)
+                return entry.callable(*args)
 
             entry.input_addresses = [
                 i.data_ptr() for i in args if isinstance(i, torch.Tensor)
@@ -204,6 +200,8 @@ def __call__(self, *args):
                 i.data_ptr() for i in output if isinstance(i, torch.Tensor)
             ]
 
+            entry.cuda_graph.replay()
+
             return output
 
         if enable_llm_debug():
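
Taken together, the changes above drop the now-unused logger import and its warning, raise the eager warmup count from two to three passes through entry.callable, and replay the CUDA graph right after capture so the capturing call also returns fresh outputs. Below is a minimal sketch of that warmup-then-capture-then-replay flow; it is illustrative only, uses a simplified stand-in for the real PiecewiseRunner entry, and the capture block itself is assumed rather than taken from this hunk.

import torch

def run_piecewise_entry(entry, capture_enabled, *args):
    # Illustrative sketch, not the real PiecewiseRunner.__call__.
    if entry.cuda_graph is None:
        if not capture_enabled:
            # Capture disabled: stay on the eager compiled callable.
            return entry.callable(*args)
        if entry.warmup_count < 3:
            # Three eager warmup passes (previously two, via default_callable).
            entry.warmup_count += 1
            return entry.callable(*args)
        # Capture the callable into a CUDA graph, then replay it once so this
        # first "captured" call also produces freshly computed outputs.
        entry.cuda_graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(entry.cuda_graph):
            entry.output = entry.callable(*args)
        entry.cuda_graph.replay()
        return entry.output
    # Steady state: replay the captured graph.
    entry.cuda_graph.replay()
    return entry.output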

tensorrt_llm/_torch/compilation/utils.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ def is_call_function(node: Node, target: Union[List[Callable], Callable]):
     return node.op == "call_function" and node.target == target
 
 
-_enable_piecewise_cuda_graph_capture = True
+_enable_piecewise_cuda_graph_capture = False
 
 
 def set_enable_piecewise_cuda_graph_capture_flag(enable: bool):
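
With the module-level default flipped to False, piecewise CUDA graph capture is now opt-in: callers enable it explicitly around a warmup window and restore the default afterwards, which is exactly what model_engine.py does below. A small usage sketch; it assumes the matching getter lives next to this setter in the same module, and run_warmup_step is a hypothetical placeholder.

from tensorrt_llm._torch.compilation.utils import (
    get_enable_piecewise_cuda_graph_capture_flag,
    set_enable_piecewise_cuda_graph_capture_flag)

def warmup_with_piecewise_capture(run_warmup_step, num_steps: int = 3):
    # Enable capture only for the warmup window, then restore the default.
    set_enable_piecewise_cuda_graph_capture_flag(True)
    try:
        for _ in range(num_steps):
            run_warmup_step()
    finally:
        set_enable_piecewise_cuda_graph_capture_flag(False)
    assert not get_enable_piecewise_cuda_graph_capture_flag()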

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 19 additions & 18 deletions
@@ -660,7 +660,6 @@ def disable_optimization(backend: Backend):
                     self._torch_compile_backend)
 
                 self._torch_compile_backend.enable_optimization()
-                set_enable_piecewise_cuda_graph_capture_flag(True)
 
                 # Disable cuda graph capture here so that we can properly capture it later
                 with self.no_cuda_graph():
@@ -748,26 +747,28 @@ def disable_optimization(backend: Backend):
                                  resource_manager=resource_manager)
                 torch.cuda.synchronize()
 
-                if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
-                    with self.no_cuda_graph():
-                        with release_batch(
-                                get_torch_compile_warmup_request(
-                                    1, bs)) as batch:
-                            logger.info(
-                                f"Run piecewise CUDA graph warmup for batch size={bs}"
-                            )
-
-                            for _ in range(3):
-                                self.forward(
-                                    batch,
-                                    new_tensors_device=None,
-                                    resource_manager=resource_manager)
+            if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
+                for seq_lens in cuda_graph_batch_sizes:
+                    set_enable_piecewise_cuda_graph_capture_flag(True)
+                    with self.no_cuda_graph():
+                        with release_batch(
+                                get_torch_compile_warmup_request(
+                                    1, seq_lens)) as batch:
+                            logger.info(
+                                f"Run piecewise CUDA graph warmup for seq_lens={seq_lens}"
+                            )
+                            # self.model.mtp_worker.stored_input_ids = []
+                            for _ in range(3):
                                 self.forward(batch,
                                              new_tensors_device=None,
                                              resource_manager=resource_manager)
-                torch.cuda.synchronize()
-                gc.collect()
-                torch.cuda.empty_cache()
+                            self.forward(batch,
+                                         new_tensors_device=None,
+                                         resource_manager=resource_manager)
+                    torch.cuda.synchronize()
+                    gc.collect()
+                    torch.cuda.empty_cache()
+                    set_enable_piecewise_cuda_graph_capture_flag(False)
 
         # Set the value back to the original value
         self.enable_spec_decode = self.is_spec_decode
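
Condensed, the new warmup path loops over the CUDA graph batch sizes, turns piecewise capture on, runs three eager warmup forwards plus one capturing forward, synchronizes and frees memory, and turns capture back off. The standalone paraphrase below is a sketch only: make_warmup_batch, run_forward and set_capture_flag are hypothetical placeholders for the real release_batch/get_torch_compile_warmup_request, self.forward and set_enable_piecewise_cuda_graph_capture_flag calls.

import gc
import torch

def piecewise_cuda_graph_warmup(cuda_graph_batch_sizes, make_warmup_batch,
                                run_forward, set_capture_flag):
    # Illustrative paraphrase of the warmup loop added above.
    for seq_lens in cuda_graph_batch_sizes:
        set_capture_flag(True)
        try:
            with make_warmup_batch(seq_lens) as batch:
                for _ in range(3):
                    run_forward(batch)  # eager warmup passes
                run_forward(batch)  # pass that captures the piecewise graphs
                torch.cuda.synchronize()
        finally:
            gc.collect()
            torch.cuda.empty_cache()
            set_capture_flag(False)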

tensorrt_llm/_torch/speculative/mtp.py

Lines changed: 16 additions & 11 deletions
@@ -1153,17 +1153,20 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata):
             position_ids = position_ids.squeeze(0)
             last_tokens_idx = torch.cumsum(
                 attn_metadata.seq_lens_cuda, dim=0, dtype=torch.long) - 1
-            return position_ids, last_tokens_idx
+            last_tokens_idx_host = torch.cumsum(
+                attn_metadata.seq_lens, dim=0, dtype=torch.long) - 1
+            return position_ids, last_tokens_idx, last_tokens_idx_host
 
-        position_ids, last_tokens_idx = prepare_position_ids_and_last_tokens(
+        position_ids, last_tokens_idx, last_tokens_idx_host = prepare_position_ids_and_last_tokens(
             position_ids, attn_metadata)
-        inputs = self.prepare_drafter_inputs(input_ids=input_ids,
-                                             position_ids=position_ids,
-                                             last_tokens_idx=last_tokens_idx,
-                                             hidden_states=hidden_states,
-                                             accepted_tokens=accepted_tokens,
-                                             attn_metadata=attn_metadata,
-                                             spec_metadata=spec_metadata)
+        inputs = self.prepare_drafter_inputs(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            last_tokens_idx_host=last_tokens_idx_host,
+            hidden_states=hidden_states,
+            accepted_tokens=accepted_tokens,
+            attn_metadata=attn_metadata,
+            spec_metadata=spec_metadata)
 
         # Predict draft tokens
         next_draft_tokens = []
@@ -1277,7 +1280,7 @@ def prepare_drafter_inputs(
         self,
         input_ids: torch.IntTensor,
         position_ids: torch.IntTensor,
-        last_tokens_idx: torch.LongTensor,
+        last_tokens_idx_host: torch.LongTensor,
         hidden_states: torch.Tensor,
         accepted_tokens: torch.Tensor,
         attn_metadata: AttentionMetadata,
@@ -1292,7 +1295,9 @@ def prepare_drafter_inputs(
                                     device="cuda")
         input_ids_ctx[:-1].copy_(input_prompt_ids[1:])
         input_ids_ctx[
-            last_tokens_idx[:num_contexts]] = accepted_tokens[:num_contexts, 0]
+            last_tokens_idx_host[:
+                                 num_contexts]] = accepted_tokens[:num_contexts,
+                                                                  0]
 
         # generation
         input_ids_gen = accepted_tokens[num_contexts:, :].flatten()
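
The drafter-input change computes the last-token indices twice, once on device from attn_metadata.seq_lens_cuda as before and once on host from attn_metadata.seq_lens, and uses the host copy when indexing into input_ids_ctx. A small worked example of the host-side computation; the sequence lengths are made up.

import torch

# Per-request sequence lengths on the host, e.g. three context requests.
seq_lens = torch.tensor([4, 3, 5])
last_tokens_idx_host = torch.cumsum(seq_lens, dim=0, dtype=torch.long) - 1
print(last_tokens_idx_host)  # tensor([3, 6, 11])
# Each index is a request's final token position, where the first accepted
# token is written into input_ids_ctx.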

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 0 additions & 17 deletions
@@ -874,9 +874,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                           [0, pytest.param(2, marks=skip_pre_hopper)])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
-
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -913,8 +910,6 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
                             torch_compile):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
@@ -1004,8 +999,6 @@ def test_cute_dsl_fp8_block_scales(
         overlap_scheduler,
         torch_compile,
     ):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
             pytest.skip("https://nvbugs/5252559")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
@@ -1105,8 +1098,6 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
@@ -1166,10 +1157,6 @@ def test_cute_dsl_fp8_block_scales_4gpus(
         overlap_scheduler,
         torch_compile,
     ):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
-        if torch_compile and attention_dp:
-            pytest.skip("https://nvbugs/5252559")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
@@ -1298,8 +1285,6 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
     @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                    torch_compile, mtp_nextn, moe_backend):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1345,8 +1330,6 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, tp_size, pp_size, ep_size,
                          torch_compile, mtp_nextn, moe_backend):
-        if torch_compile and mtp_nextn > 0:
-            pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
         if moe_backend == "TRTLLM" and get_sm_version() == 120:
