diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
index 971a6c992b1..92869ca401b 100644
--- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
+++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -1,11 +1,15 @@
 import bisect
 import contextlib
-import weakref
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Optional, Tuple
 
 import torch
 
+from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
+from tensorrt_llm.mapping import Mapping
+
 from ...inputs.multimodal import MultimodalParams
+from ..distributed import MPIDist
 from ..expert_statistic import ExpertStatistic
 from ..memory_buffer_utils import get_memory_buffers
 from ..modules.multi_stream_utils import with_multi_stream
@@ -15,13 +19,60 @@
                                ResourceManagerType)
 from .scheduler import ScheduledRequests
 
-if TYPE_CHECKING:
-    from .model_engine import PyTorchModelEngine
-
 # A large prime number used for dummy request IDs to avoid collisions
 CUDA_GRAPH_DUMMY_REQUEST_ID = (1 << 64) - 1
 
 
+@dataclass
+class CUDAGraphRunnerConfig:
+    """Configuration for the CUDAGraphRunner, passed from the ModelEngine."""
+    use_cuda_graph: bool
+    """
+    Master switch controlling the model's execution path.
+
+    This flag determines one of three distinct execution paths for the
+    model engine:
+
+    1. **`False` (Pure Eager Path):**
+        * Forces all execution to be in eager mode.
+        * The `CUDAGraphRunner` instance is mostly dormant.
+        * Methods like `maybe_get_cuda_graph` and `pad_batch`
+          will return immediately, signaling the model engine to
+          run in eager mode.
+
+    2. **`True` (Eager Fallback Path):**
+        * The runner is active and checks for graph eligibility.
+        * If a batch is ineligible (e.g., it's a prefill batch,
+          stats collection is on, or it's an unsupported batch size),
+          the runner signals a fallback to eager mode for that batch.
+
+    3. **`True` (CUDA Graph Path):**
+        * The runner finds an eligible batch and a matching graph.
+        * The graph is then captured (if new) or replayed.
+
+    Note: As of this implementation, the model engine *always* calls
+    `cuda_graph_runner.pad_batch` and `cuda_graph_runner.maybe_get_cuda_graph`
+    even when this is `False`. This could be refactored in the future
+    so that the engine bypasses the `CUDAGraphRunner` entirely in Case 1.
+    """
+    cuda_graph_padding_enabled: bool
+    cuda_graph_batch_sizes: list[int]
+    max_cuda_graph_batch_size: int
+    max_beam_width: int
+    max_num_tokens: int
+    spec_config: Optional[DecodingBaseConfig]
+    cuda_graph_mem_pool: Any
+    use_mrope: bool
+    original_max_draft_len: int
+    original_max_total_draft_tokens: int
+    is_draft_model: bool
+    enable_attention_dp: bool
+    batch_size: int
+    mapping: Optional[Mapping]
+    dist: Optional[MPIDist]
+    kv_cache_manager_key: Any
+
+
 class CUDAGraphRunner:
     """
     Manages the lifecycle and execution of CUDA graphs for the model engine.
@@ -32,22 +83,22 @@ class CUDAGraphRunner:
     """
     WARMUP_STEPS = 2
 
-    def __init__(self, engine: "PyTorchModelEngine"):
-        self.engine_ref = weakref.ref(engine)
+    def __init__(self, config: CUDAGraphRunnerConfig):
+        self.config = config
 
-        # High-level configuration
-        self.enabled = engine.llm_args.cuda_graph_config is not None
-        self.padding_enabled = engine._cuda_graph_padding_enabled
-        self.supported_batch_sizes = engine._cuda_graph_batch_sizes
-        self.max_supported_batch_size = engine._max_cuda_graph_batch_size
-        self.max_beam_width = engine.max_beam_width
-        self.spec_config = engine.spec_config
+        # High-level configuration from the config object
+        self.enabled = config.use_cuda_graph
+        self.padding_enabled = config.cuda_graph_padding_enabled
+        self.supported_batch_sizes = config.cuda_graph_batch_sizes
+        self.max_supported_batch_size = config.max_cuda_graph_batch_size
+        self.max_beam_width = config.max_beam_width
+        self.spec_config = config.spec_config
 
         self.graphs: Dict[Tuple[int, int, int], torch.cuda.CUDAGraph] = {}
         self.graph_outputs: Dict[Tuple[int, int, int],
                                  Callable[[], Optional[torch.Tensor]]] = {}
         self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {}
-        self.memory_pool = engine._cuda_graph_mem_pool
+        self.memory_pool = config.cuda_graph_mem_pool
         self.padding_dummy_request: Optional["Request"] = None
 
         self.shared_static_tensors: Dict[str, torch.Tensor] = {}
@@ -57,12 +108,11 @@ def __init__(self, engine: "PyTorchModelEngine"):
 
     def _create_shared_static_tensors(self):
         """Allocates static tensors sized for the largest possible batch."""
-        engine = self._get_engine()
-
-        token_per_request = self.max_possible_draft_len + 1
+        max_draft_len = self.config.original_max_total_draft_tokens if self.config.spec_config is not None else 0
+        token_per_request = max_draft_len + 1
         max_total_tokens = (self.max_supported_batch_size *
                             self.max_beam_width * token_per_request)
-        max_total_tokens = min(max_total_tokens, engine.max_num_tokens)
+        max_total_tokens = min(max_total_tokens, self.config.max_num_tokens)
 
         self.shared_static_tensors = {
             "input_ids":
@@ -71,7 +121,7 @@ def _create_shared_static_tensors(self):
             torch.zeros((1, max_total_tokens), device="cuda",
                         dtype=torch.int32),
         }
-        if engine.use_mrope:
+        if self.config.use_mrope:
             self.shared_static_tensors["position_ids"] = torch.zeros(
                 (3, 1, max_total_tokens), device="cuda", dtype=torch.int32)
             self.shared_static_tensors["multimodal_params"] = [
@@ -85,28 +135,17 @@ def _create_shared_static_tensors(self):
                 }) for _ in range(max_total_tokens)
             ]
 
-    @property
-    def enable_spec_decode(self):
-        return self._get_engine().enable_spec_decode
-
-    @property
-    def max_possible_draft_len(self):
-        engine = self._get_engine()
-        return (engine.original_max_total_draft_tokens
-                if self.enable_spec_decode else 0)
-
     def get_graph_key(
             self,
             batch: ScheduledRequests,
             spec_resource_manager: Optional[BaseResourceManager] = None):
-        engine = self._get_engine()
-        if engine.is_draft_model and spec_resource_manager is not None and isinstance(
+        batch_size = batch.batch_size
+        if self.config.is_draft_model and spec_resource_manager is not None and isinstance(
                 spec_resource_manager, Eagle3ResourceManager):
             # If 'is_first_draft' is True, even with tree decoding, the length of draft_len will only be 'max_draft_len', not 'max_total_draft_token'.
             # Because we will pad the input to 'max_draft_len' length for the first draft layer.
-            draft_len = engine.original_max_draft_len if spec_resource_manager.is_first_draft else 0
-            key = (batch.batch_size, draft_len,
-                   spec_resource_manager.is_first_draft)
+            draft_len = self.config.original_max_draft_len if spec_resource_manager.is_first_draft else 0
+            key = (batch_size, draft_len, spec_resource_manager.is_first_draft)
         else:
             # With dynamic spec decode, the draft length maybe zero even when enable_spec_decode is True,
             # so we need to get the draft length from the batch instead of using enable_spec_decode.
@@ -116,56 +155,38 @@ def get_graph_key(
                 draft_len = max(draft_len_list)
                 assert len(
                     set(draft_len_list)) == 1, "All draft lengths must be the same"
-            key = (batch.batch_size, draft_len, False)
+            key = (batch_size, draft_len, False)
         return key
 
-    @property
-    def spec_metadata(self):
-        return self._get_engine().spec_metadata
-
-    @property
-    def draft_tokens_cuda(self):
-        return self._get_engine().draft_tokens_cuda
-
-    @property
-    def attn_metadata(self):
-        return self._get_engine().attn_metadata
-
     def __del__(self):
         self.clear()
 
-    def _get_engine(self) -> "PyTorchModelEngine":
-        """Safely dereferences the weak reference to the engine."""
-        engine = self.engine_ref()
-        if engine is None:
-            raise RuntimeError(
-                "The parent PyTorchModelEngine has been garbage collected.")
-        return engine
-
     def maybe_get_cuda_graph(
-            self,
-            batch: ScheduledRequests,
-            spec_resource_manager: Optional[BaseResourceManager] = None):
+        self,
+        batch: ScheduledRequests,
+        iter_counter: int,
+        enable_spec_decode: bool,
+        attn_metadata: Any,
+        spec_metadata: Optional[Any] = None,
+        draft_tokens_cuda: Optional[torch.Tensor] = None,
+        spec_resource_manager: Optional[BaseResourceManager] = None,
+    ) -> Tuple[Optional[Any], Optional[Any], Optional[Tuple[int, int, bool]]]:
        """
        Determines if the current batch can be run with a CUDA graph.

        Returns a tuple containing:
-        - A boolean indicating if a graph can be used.
        - The attn_metadata for the graph, if applicable.
        - The spec_metadata for the graph, if applicable.
-        - The key for the graph.
+        - The key for the graph, if applicable.
""" - engine = self._get_engine() - # disable when doing statistic - if hasattr(engine, 'iter_counter') and ExpertStatistic.set_iter( - engine.iter_counter): - return False, None, None, None + if ExpertStatistic.set_iter(iter_counter): + return None, None, None can_run_cuda_graph = batch.can_run_cuda_graph batch_size = batch.batch_size - if self.enabled and engine.enable_attention_dp and engine.mapping.tp_size > 1: - all_can_graph_batch = engine.dist.tp_allgather( + if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1: + all_can_graph_batch = self.config.dist.tp_allgather( [can_run_cuda_graph, batch_size]) is_all_gen_only = all(all_can_graph[0] for all_can_graph in all_can_graph_batch) @@ -174,34 +195,33 @@ def maybe_get_cuda_graph( for all_gen_only in all_can_graph_batch) if not is_all_gen_only or not all_batch_size_equal: - return False, None, None, None + return None, None, None if not self.enabled or not can_run_cuda_graph: - return False, None, None, None + return None, None, None key = self.get_graph_key(batch, spec_resource_manager) if key in self.graphs: - return True, self.graph_metadata[key][ + return self.graph_metadata[key][ "attn_metadata"], self.graph_metadata[key]["spec_metadata"], key if batch_size not in self.supported_batch_sizes: - return False, None, None, None + return None, None, None num_sequences_in_batch = batch_size * self.max_beam_width - attn_metadata = self.attn_metadata.create_cuda_graph_metadata( + graph_attn_metadata = attn_metadata.create_cuda_graph_metadata( num_sequences_in_batch, False, key[1], self.cuda_graph_meta_buffers) - assert attn_metadata.is_cuda_graph + assert graph_attn_metadata.is_cuda_graph - if self.enable_spec_decode: - spec_metadata = self.spec_metadata.create_cuda_graph_metadata( + if enable_spec_decode: + graph_spec_metadata = spec_metadata.create_cuda_graph_metadata( num_sequences_in_batch) - spec_metadata.draft_tokens = self.draft_tokens_cuda + graph_spec_metadata.draft_tokens = draft_tokens_cuda else: - spec_metadata = None - return True, attn_metadata, spec_metadata, key + graph_spec_metadata = None + return graph_attn_metadata, graph_spec_metadata, key def needs_capture(self, key: Tuple[int, int, int]): - return key not in self.graph_outputs def get_graph_pool(self): @@ -217,9 +237,9 @@ def capture(self, key: Tuple[int, int, int], forward_fn: Callable, initial_inputs: Dict[str, Any], + enable_spec_decode: bool = False, postprocess_fn: Optional[Callable] = None): """Captures the forward pass for a given batch size.""" - engine = self._get_engine() batch_size = key[0] # [CUDA graph spec decode padding] # We pad input IDs/position IDs to the maximum draft length (token per request). @@ -236,7 +256,7 @@ def capture(self, self.shared_static_tensors["position_ids"] [:, :num_tokens_for_capture], } - if engine.use_mrope: + if self.config.use_mrope: sliced_static_tensors["position_ids"] = self.shared_static_tensors[ "position_ids"][:, :, :num_tokens_for_capture], sliced_static_tensors[ @@ -254,12 +274,10 @@ def capture(self, def _setup_spec_decoding_and_forward(key: Tuple[int, int, int], forward_fn: Callable, capture_inputs: Dict[str, Any]): - engine = self._get_engine() - # for the first inference of draft model, we need to set the use_spec_decoding to True when capture the graph for multiple runs. 
            is_first_draft = key[2]
-            needs_kv_cache_recompute = True if engine.enable_spec_decode and engine.spec_config.spec_dec_mode.needs_kv_cache_recompute(
+            needs_kv_cache_recompute = True if enable_spec_decode and self.config.spec_config.spec_dec_mode.needs_kv_cache_recompute(
             ) else False
-            if is_first_draft and engine.is_draft_model and needs_kv_cache_recompute:
+            if is_first_draft and self.config.is_draft_model and needs_kv_cache_recompute:
                 capture_inputs['attn_metadata'].use_spec_decoding = True
             return forward_fn(capture_inputs)
@@ -288,7 +306,6 @@ def _setup_spec_decoding_and_forward(key: Tuple[int, int, int],
     def replay(self, key: Tuple[int, int, int],
                current_inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
         """Replays a previously captured graph."""
-        engine = self._get_engine()
         stored_meta = self.graph_metadata[key]
         assert current_inputs["attn_metadata"] is stored_meta["attn_metadata"]
         if stored_meta["spec_metadata"] is not None:
@@ -302,7 +319,7 @@ def replay(self, key: Tuple[int, int, int],
         static_tensors["input_ids"][:seqlen].copy_(input_ids)
 
         position_ids = current_inputs["position_ids"]
-        if engine.use_mrope and current_inputs.get(
+        if self.config.use_mrope and current_inputs.get(
                 'multimodal_params') is not None:
             static_tensors["position_ids"][:, :, :seqlen].copy_(position_ids)
             for i, multimodal_param in enumerate(
@@ -322,16 +339,16 @@ def replay(self, key: Tuple[int, int, int],
         return output_ref
 
     def _get_padded_batch(self, batch: ScheduledRequests,
-                          resource_manager: ResourceManager) -> int:
-        engine = self._get_engine()
+                          resource_manager: ResourceManager,
+                          runtime_draft_len: int) -> int:
         kv_cache_manager = resource_manager.get_resource_manager(
-            engine.kv_cache_manager_key)
+            self.config.kv_cache_manager_key)
         can_run_cuda_graph = batch.can_run_cuda_graph
         batch_size = batch.batch_size
         new_batch_size = batch_size
 
-        if self.enabled and engine.enable_attention_dp and engine.mapping.tp_size > 1:
-            graph_batch_size = engine.dist.tp_allgather(
+        if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
+            graph_batch_size = self.config.dist.tp_allgather(
                 [can_run_cuda_graph, batch_size])
             all_can_graph = all(graph_batch[0]
                                 for graph_batch in graph_batch_size)
@@ -349,7 +366,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             return 0
 
         padding_size = padded_batch_size - batch_size
-        if padding_size + batch.batch_size > engine.batch_size:
+        if padding_size + batch.batch_size > self.config.batch_size:
             return 0
 
         # No padding if it would create too many concurrent requests.
@@ -364,9 +381,9 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
                 [CUDA_GRAPH_DUMMY_REQUEST_ID],
                 is_gen=True,
-                max_num_draft_tokens=engine.runtime_draft_len,
-                use_mrope=engine.use_mrope,
-                max_beam_width=engine.max_beam_width)[0]
+                max_num_draft_tokens=runtime_draft_len,
+                use_mrope=self.config.use_mrope,
+                max_beam_width=self.config.max_beam_width)[0]
             self.padding_dummy_request.is_cuda_graph_dummy = True
             spec_res_mgr = resource_manager.get_resource_manager(
                 ResourceManagerType.SPEC_RESOURCE_MANAGER)
@@ -387,12 +404,14 @@ def _round_up_batch_size(self, batch_size: int) -> int:
         return self.supported_batch_sizes[idx]
 
     @contextlib.contextmanager
-    def pad_batch(self, scheduled_requests: ScheduledRequests,
-                  resource_manager: ResourceManager):
+    def pad_batch(self,
+                  scheduled_requests: ScheduledRequests,
+                  resource_manager: ResourceManager,
+                  runtime_draft_len: int = 0):
         """Context manager to pad a batch to a graph-compatible size."""
-        padding_size = self._get_padded_batch(scheduled_requests,
-                                              resource_manager)
+        padding_size = self._get_padded_batch(scheduled_requests,
+                                              resource_manager,
+                                              runtime_draft_len)
         try:
             yield scheduled_requests
         finally:
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index e3c12e36b49..5c2b9ac9f6a 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -55,7 +55,7 @@
                          set_per_request_piecewise_cuda_graph_flag,
                          set_torch_compiling, with_model_extra_attrs)
 from .config_utils import is_mla
-from .cuda_graph_runner import CUDAGraphRunner
+from .cuda_graph_runner import CUDAGraphRunner, CUDAGraphRunnerConfig
 from .guided_decoder import CapturableGuidedDecoder
 from .layerwise_nvtx_marker import LayerwiseNvtxMarker
 from .llm_request import get_draft_token_length
@@ -370,9 +370,31 @@ def __init__(
         # We look up this key in resource_manager during forward to find the
         # kv cache manager. Can be changed to support multiple model engines
         # with different KV cache managers.
-        self.kv_cache_manager_key = ResourceManagerType.KV_CACHE_MANAGER
+        self.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER if is_draft_model else ResourceManagerType.KV_CACHE_MANAGER
         self.lora_model_config: Optional[LoraModelConfig] = None
-        self.cuda_graph_runner = CUDAGraphRunner(self)
+
+        # Create config and runner
+        cuda_graph_runner_config = CUDAGraphRunnerConfig(
+            use_cuda_graph=self.cuda_graph_config is not None,
+            cuda_graph_padding_enabled=self._cuda_graph_padding_enabled,
+            cuda_graph_batch_sizes=self._cuda_graph_batch_sizes,
+            max_cuda_graph_batch_size=self._max_cuda_graph_batch_size,
+            max_beam_width=self.max_beam_width,
+            spec_config=self.spec_config,
+            cuda_graph_mem_pool=self._cuda_graph_mem_pool,
+            max_num_tokens=self.max_num_tokens,
+            use_mrope=self.use_mrope,
+            original_max_draft_len=self.original_max_draft_len,
+            original_max_total_draft_tokens=self.
+            original_max_total_draft_tokens,
+            is_draft_model=self.is_draft_model,
+            enable_attention_dp=self.enable_attention_dp,
+            batch_size=self.batch_size,
+            mapping=self.mapping,
+            dist=self.dist,
+            kv_cache_manager_key=self.kv_cache_manager_key,
+        )
+        self.cuda_graph_runner = CUDAGraphRunner(cuda_graph_runner_config)
 
         # Setup the local cache indirection buffer only once and reuse it.
         # This way it can also be used for CUDA graphs.
@@ -2319,11 +2341,21 @@ def forward(
             return self._forward_step(inputs, gather_ids,
                                       gather_context_logits)
         with self.cuda_graph_runner.pad_batch(
-                scheduled_requests, resource_manager) as padded_requests:
-
-            maybe_graph, maybe_attn_metadata, maybe_spec_metadata, key = self.cuda_graph_runner.maybe_get_cuda_graph(
-                padded_requests, spec_resource_manager)
-            if maybe_graph:
+                scheduled_requests, resource_manager,
+                self.runtime_draft_len) as padded_requests:
+
+            maybe_attn_metadata, maybe_spec_metadata, key = self.cuda_graph_runner.maybe_get_cuda_graph(
+                padded_requests,
+                iter_counter=self.iter_counter,
+                enable_spec_decode=self.enable_spec_decode,
+                attn_metadata=attn_metadata,
+                spec_metadata=spec_metadata,
+                draft_tokens_cuda=self.draft_tokens_cuda
+                if self.is_spec_decode else None,
+                spec_resource_manager=spec_resource_manager,
+            )
+            can_run_graph = key is not None
+            if can_run_graph:
                 attn_metadata = maybe_attn_metadata
                 spec_metadata = maybe_spec_metadata
             else:
@@ -2339,7 +2371,7 @@ def forward(
         self.iter_counter += 1
 
         with with_shared_pool(self.cuda_graph_runner.get_graph_pool()):
-            if not maybe_graph:
+            if not can_run_graph:
                 # Fallback to eager execution if graph was not used
                 with MoeLoadBalancerIterContext(moe_load_balancer):
                     outputs = self._forward_step(inputs, gather_ids,
@@ -2357,9 +2389,12 @@ def capture_forward_fn(inputs: Dict[str, Any]):
                     def capture_postprocess_fn(inputs: Dict[str, Any]):
                         self._postprocess_inputs(inputs)
 
-                    self.cuda_graph_runner.capture(key, capture_forward_fn,
-                                                   inputs,
-                                                   capture_postprocess_fn)
+                    self.cuda_graph_runner.capture(
+                        key,
+                        capture_forward_fn,
+                        inputs,
+                        enable_spec_decode=self.enable_spec_decode,
+                        postprocess_fn=capture_postprocess_fn)
 
                     # here we don't need to use context since cuda graph capture didn't run kernel.
                     # maybe we need a cleaner way to do this.
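
Reviewer note: the forward() hunks above are easier to read outside diff form. The sketch below restates the new control flow under stated assumptions: the wrapper function, its parameter names, and the forward_fn callable are invented for this note and do not appear in model_engine.py; MoE load-balancer contexts, logits gathering, and the shared-pool context are omitted.

from typing import Any, Callable, Dict, Optional

from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner


def run_decode_step(runner: CUDAGraphRunner,
                    scheduled_requests: Any,
                    resource_manager: Any,
                    runtime_draft_len: int,
                    iter_counter: int,
                    enable_spec_decode: bool,
                    attn_metadata: Any,
                    spec_metadata: Optional[Any],
                    draft_tokens_cuda: Optional[Any],
                    spec_resource_manager: Optional[Any],
                    forward_fn: Callable[[Dict[str, Any]], Any],
                    inputs: Dict[str, Any]) -> Any:
    """Hypothetical wrapper mirroring how PyTorchModelEngine.forward() now drives the runner."""
    with runner.pad_batch(scheduled_requests, resource_manager,
                          runtime_draft_len) as padded_requests:
        # The runner returns (attn_metadata, spec_metadata, key); a non-None key
        # replaces the old boolean "maybe_graph" flag.
        graph_attn, graph_spec, key = runner.maybe_get_cuda_graph(
            padded_requests,
            iter_counter=iter_counter,
            enable_spec_decode=enable_spec_decode,
            attn_metadata=attn_metadata,
            spec_metadata=spec_metadata,
            draft_tokens_cuda=draft_tokens_cuda,
            spec_resource_manager=spec_resource_manager)
        if key is None:
            # Eager fallback: runner disabled, ineligible batch, or stats collection.
            return forward_fn(inputs)
        if runner.needs_capture(key):
            # enable_spec_decode is now passed in explicitly instead of read off the engine.
            runner.capture(key, forward_fn, inputs,
                           enable_spec_decode=enable_spec_decode)
        return runner.replay(key, inputs)
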
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 813585950c9..3f7658354ab 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -384,7 +384,6 @@ def drafting_loop_wrapper(model):
             # For DeepseekV3 MTP, we need to set the num_hidden_layers to 1 for the draft model
             if spec_config.spec_dec_mode.is_mtp_eagle():
                 draft_model_engine.model.model_config.pretrained_config.num_hidden_layers = 1
-            draft_model_engine.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER
             draft_model_engine.load_weights_from_target_model(
                 model_engine.model)
         else:
diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py
index a915956f0a9..163309cb1f4 100644
--- a/tests/unittest/_torch/helpers.py
+++ b/tests/unittest/_torch/helpers.py
@@ -3,7 +3,10 @@
 import torch
 import torch.nn.functional as F
 
-from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
+from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import (
+    CUDAGraphRunner, CUDAGraphRunnerConfig)
+from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
+from tensorrt_llm.mapping import Mapping
 
 
 def ceil_div(x: int, y: int) -> int:
@@ -166,42 +169,23 @@ def block_scale_gemm(mat_a: torch.Tensor, mat_scale_a: torch.Tensor,
     return results.view_as(x)
 
 
-class MockPytorchBackendConfig:
-
-    def __init__(self, use_cuda_graph, cuda_graph_padding_enabled):
-        self.use_cuda_graph = use_cuda_graph
-        self.cuda_graph_padding_enabled = cuda_graph_padding_enabled
-
-
-class MockEngine:
-    """A replacement for SimpleNamespace that supports weak references."""
-
-    def __init__(self, **kwargs):
-        self.__dict__.update(kwargs)
-
-
-def create_mock_engine(batch_size: int):
-
-    class MockSpecConfig:
-
-        class SpecDecMode:
-
-            def needs_kv_cache_recompute(self):
-                return False
-
-        spec_dec_mode = SpecDecMode()
-
-    return MockEngine(
-        llm_args=TorchLlmArgs(model="dummy"),
-        _cuda_graph_padding_enabled=True,
-        _cuda_graph_batch_sizes=[batch_size],
-        _max_cuda_graph_batch_size=batch_size,
+def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False):
+    config = CUDAGraphRunnerConfig(
+        use_cuda_graph=True,
+        cuda_graph_padding_enabled=False,
+        cuda_graph_batch_sizes=[batch_size],
+        max_cuda_graph_batch_size=batch_size,
+        batch_size=batch_size,
         max_beam_width=1,
-        max_num_tokens=8192,
-        is_spec_decode=False,
-        enable_spec_decode=False,
-        spec_config=MockSpecConfig(),
+        max_num_tokens=1,
+        use_mrope=use_mrope,
+        spec_config=None,
+        cuda_graph_mem_pool=None,
+        enable_attention_dp=False,
+        original_max_draft_len=0,
+        original_max_total_draft_tokens=0,
         is_draft_model=False,
-        _cuda_graph_mem_pool=None,
-        use_mrope=False,
-    )
+        mapping=Mapping(),
+        dist=None,
+        kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER)
+    return CUDAGraphRunner(config)
diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py
index 28a35323b6e..a224cea1186 100644
--- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py
+++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py
@@ -22,7 +22,7 @@ class Exaone4Config(PretrainedConfig):
 # TODO: Remove this once we have a proper config for Exaone4
 SKIP_EXAONE4_HF_ACCURACY_TEST = True
 
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers.cache_utils import HybridCache
 from utils.util import getSMVersion
@@ -31,7 +31,6 @@ class Exaone4Config(PretrainedConfig):
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_exaone4 import Exaone4ForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -338,10 +337,8 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_llama.py b/tests/unittest/_torch/modeling/test_modeling_llama.py
index 0fdfa7ff0fa..9b9c9e53874 100644
--- a/tests/unittest/_torch/modeling/test_modeling_llama.py
+++ b/tests/unittest/_torch/modeling/test_modeling_llama.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import LlamaConfig
 from transformers import LlamaForCausalLM as HFLlamaForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_llama import LlamaForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests
@@ -331,10 +330,8 @@ def test_llama_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
index 367dee787a0..941b15890e3 100644
--- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
+++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
@@ -4,7 +4,7 @@
 
 import torch
 import transformers
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import Llama4Config
 from transformers import \
@@ -20,7 +20,6 @@
     Llama4HfWeightMapper
 from tensorrt_llm._torch.models.modeling_llama import \
     Llama4ForConditionalGeneration
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -406,10 +405,8 @@ def test_llama_allclose_to_hf(self, scenario: AllCloseScenario) -> None:
                 input_ids.size(-1) + gen_input_ids.size(-1))
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mistral.py b/tests/unittest/_torch/modeling/test_modeling_mistral.py
index a79e9415bdb..2be7f4acdbf 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mistral.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mistral.py
@@ -8,7 +8,7 @@
 import torch
 import transformers
 import transformers.models.mistral3
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from PIL import Image
 from utils.util import getSMVersion
 
@@ -19,7 +19,6 @@
 from tensorrt_llm._torch.attention_backend import utils as attention_utils
 from tensorrt_llm._torch.models import modeling_mistral
 from tensorrt_llm._torch.pyexecutor import resource_manager
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm.bindings import executor as executor_lib
 from tensorrt_llm.models import modeling_utils
 
@@ -404,10 +403,8 @@ def test_mistral_3_vlm_allclose_to_hf(mistral_small_3_1_24b_config, backend, use
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-    graph_runner = None
-    if use_cuda_graph:
-        mock_engine = create_mock_engine(1)
-        graph_runner = CUDAGraphRunner(mock_engine)
+    graph_runner = create_mock_cuda_graph_runner(1) if use_cuda_graph else None
 
     def run_forward(input_ids, position_ids, attn_metadata):
         attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mixtral.py b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
index b8beecaa772..7071a440ff5 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mixtral.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import MixtralConfig
 from transformers import MixtralForCausalLM as HFMixtralForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.mixtral_weight_mapper import \
     MixtralHfWeightMapper
 from tensorrt_llm._torch.models.modeling_mixtral import MixtralForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -310,10 +309,8 @@ def test_mixtral_allclose_to_hf(self, scenario: Scenario):
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mllama.py b/tests/unittest/_torch/modeling/test_modeling_mllama.py
index 597c084b41d..a9423b86d35 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mllama.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mllama.py
@@ -4,7 +4,7 @@
 
 import pytest
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from test_modeling_llama import Scenario, reduce_llama_config
 from transformers import MllamaConfig
@@ -17,7 +17,6 @@
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_mllama import \
     MllamaForConditionalGeneration
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -420,10 +419,8 @@ def test_mllama_allclose_to_hf_text_only(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
index 00817ae062f..c22ac3e308e 100644
--- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py
+++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
@@ -8,7 +8,7 @@
 from typing import Dict, List, Optional, Tuple, Type
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
 from utils.llm_data import llm_models_root
 
@@ -17,7 +17,6 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -425,8 +424,7 @@ def run_trtllm_forward(self, trtllm_inputs, use_cuda_graph: bool = False):
             trtllm_inputs["attn_metadata"].prepare()
             return self.trtllm_model.forward(**trtllm_inputs)
         else:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+            graph_runner = create_mock_cuda_graph_runner(1)
             trtllm_inputs["attn_metadata"] = trtllm_inputs[
                 "attn_metadata"
             ].create_cuda_graph_metadata(1)
diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron.py b/tests/unittest/_torch/modeling/test_modeling_nemotron.py
index d06a6bc6b81..2dcac56ea55 100644
--- a/tests/unittest/_torch/modeling/test_modeling_nemotron.py
+++ b/tests/unittest/_torch/modeling/test_modeling_nemotron.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import NemotronConfig
 from transformers import NemotronForCausalLM as HFNemotronForCausalLM
@@ -15,7 +15,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_nemotron import NemotronForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -318,10 +317,8 @@ def test_nemotron_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_phi3.py b/tests/unittest/_torch/modeling/test_modeling_phi3.py
index 1a50b874ae5..1f7f0316611 100644
--- a/tests/unittest/_torch/modeling/test_modeling_phi3.py
+++ b/tests/unittest/_torch/modeling/test_modeling_phi3.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers import Phi3Config
 from transformers import Phi3ForCausalLM as HFPhi3ForCausalLM
 from utils.util import default_dtype
@@ -14,7 +14,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_phi3 import Phi3ForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -310,10 +309,8 @@ def test_phi3_allclose_to_hf(self) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen.py b/tests/unittest/_torch/modeling/test_modeling_qwen.py
index a35dc9131f6..d2f9cdaac73 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen.py
@@ -17,12 +17,11 @@
 from tensorrt_llm._torch.models.modeling_qwen import (
     Qwen2ForCausalLM, Qwen2ForProcessRewardModel)
 # yapf: enable
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from utils.llm_data import llm_models_root
 from utils.util import getSMVersion
 
@@ -265,10 +264,8 @@ def test_qwen_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
index 8d6c8649412..56f71d2bad0 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
@@ -3,7 +3,7 @@
 from typing import List
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal
 from transformers import Qwen2_5_VLConfig
 from transformers import \
@@ -13,7 +13,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.qwen2vl_weight_mapper import \
     Qwen2VLHfWeightMapper
 from tensorrt_llm._torch.models.modeling_qwen2vl import Qwen2_5_VLModel
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 
 QWEN2_5_VL_7B_CONFIG = {
     "architectures": ["Qwen2_5_VLForConditionalGeneration"],
@@ -187,10 +186,8 @@ def run_trtllm_forward(self, trtllm_inputs, use_cuda_graph: bool = False):
             trtllm_inputs["attn_metadata"].prepare()
             return self.trtllm_model.forward(**trtllm_inputs)
         else:
-            mock_engine = create_mock_engine(1)
             # NOTE: Qwen2.5-VL model uses mrope
-            mock_engine.use_mrope = True
-            graph_runner = CUDAGraphRunner(mock_engine)
+            graph_runner = create_mock_cuda_graph_runner(1, True)
             trtllm_inputs["attn_metadata"] = trtllm_inputs[
                 "attn_metadata"].create_cuda_graph_metadata(1)
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
index b8db3be83d6..39cbf33b823 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import Qwen2MoeConfig
 from transformers import Qwen2MoeForCausalLM as HFQwen2MoeForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.qwen2_moe_weight_mapper import \
     Qwen2MoeHfWeightMapper
 from tensorrt_llm._torch.models.modeling_qwen_moe import Qwen2MoeForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -315,10 +314,8 @@ def test_qwen_moe_allclose_to_hf(self, scenario: Scenario):
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
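
End-of-patch note: a minimal sketch of how a test can drive the runner through the new create_mock_cuda_graph_runner helper. It assumes a model and an attn_metadata prepared the way the modeling tests above prepare them; the wrapper function, the fixed graph key, and the forward_fn closure are illustrative and not copied from any test file.

from typing import Any, Dict

from _torch.helpers import create_mock_cuda_graph_runner


def run_with_mock_runner(model: Any, inputs: Dict[str, Any]) -> Any:
    """Hypothetical test helper: capture a batch-size-1 graph once, then replay it."""
    # Build a runner around a CUDAGraphRunnerConfig for batch size 1
    # (pass use_mrope=True for mrope models such as Qwen2.5-VL).
    graph_runner = create_mock_cuda_graph_runner(1)

    # Graphs are keyed by (batch_size, draft_len, is_first_draft); with no
    # speculative decoding the key for a single-request batch is (1, 0, False).
    key = (1, 0, False)
    inputs["attn_metadata"] = inputs["attn_metadata"].create_cuda_graph_metadata(1)

    def forward_fn(capture_inputs: Dict[str, Any]) -> Any:
        return model.forward(**capture_inputs)

    if graph_runner.needs_capture(key):
        graph_runner.capture(key, forward_fn, inputs)
    return graph_runner.replay(key, inputs)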