diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
index 971a6c992b1..92869ca401b 100644
--- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
+++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -1,11 +1,15 @@
 import bisect
 import contextlib
-import weakref
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Optional, Tuple
 
 import torch
 
+from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
+from tensorrt_llm.mapping import Mapping
+
 from ...inputs.multimodal import MultimodalParams
+from ..distributed import MPIDist
 from ..expert_statistic import ExpertStatistic
 from ..memory_buffer_utils import get_memory_buffers
 from ..modules.multi_stream_utils import with_multi_stream
@@ -15,13 +19,60 @@
                                ResourceManagerType)
 from .scheduler import ScheduledRequests
 
-if TYPE_CHECKING:
-    from .model_engine import PyTorchModelEngine
-
 # A large prime number used for dummy request IDs to avoid collisions
 CUDA_GRAPH_DUMMY_REQUEST_ID = (1 << 64) - 1
 
 
+@dataclass
+class CUDAGraphRunnerConfig:
+    """Configuration for the CUDAGraphRunner, passed from the ModelEngine."""
+    use_cuda_graph: bool
+    """
+    Master switch controlling the model's execution path.
+
+    This flag determines one of three distinct execution paths for the
+    model engine:
+
+    1. **`False` (Pure Eager Path):**
+        * Forces all execution to be in eager mode.
+        * The `CUDAGraphRunner` instance is mostly dormant.
+        * Methods like `maybe_get_cuda_graph` and `pad_batch`
+          will return immediately, signaling the model engine to
+          run in eager mode.
+
+    2. **`True` (Eager Fallback Path):**
+        * The runner is active and checks for graph eligibility.
+        * If a batch is ineligible (e.g., it's a prefill batch,
+          stats collection is on, or it's an unsupported batch size),
+          the runner signals a fallback to eager mode for that batch.
+
+    3. **`True` (CUDA Graph Path):**
+        * The runner finds an eligible batch and a matching graph.
+        * The graph is then captured (if new) or replayed.
+
+    Note: As of this implementation, the model engine *always* calls
+    `cuda_graph_runner.pad_batch` and `cuda_graph_runner.maybe_get_cuda_graph`
+    even when this is `False`. This could be refactored in the future
+    so that the engine bypasses the `CUDAGraphRunner` entirely in Case 1.
+    """
+    cuda_graph_padding_enabled: bool
+    cuda_graph_batch_sizes: list[int]
+    max_cuda_graph_batch_size: int
+    max_beam_width: int
+    max_num_tokens: int
+    spec_config: Optional[DecodingBaseConfig]
+    cuda_graph_mem_pool: Any
+    use_mrope: bool
+    original_max_draft_len: int
+    original_max_total_draft_tokens: int
+    is_draft_model: bool
+    enable_attention_dp: bool
+    batch_size: int
+    mapping: Optional[Mapping]
+    dist: Optional[MPIDist]
+    kv_cache_manager_key: Any
+
+
 class CUDAGraphRunner:
     """
     Manages the lifecycle and execution of CUDA graphs for the model engine.
@@ -32,22 +83,22 @@ class CUDAGraphRunner:
     """
     WARMUP_STEPS = 2
 
-    def __init__(self, engine: "PyTorchModelEngine"):
-        self.engine_ref = weakref.ref(engine)
+    def __init__(self, config: CUDAGraphRunnerConfig):
+        self.config = config
 
-        # High-level configuration
-        self.enabled = engine.llm_args.cuda_graph_config is not None
-        self.padding_enabled = engine._cuda_graph_padding_enabled
-        self.supported_batch_sizes = engine._cuda_graph_batch_sizes
-        self.max_supported_batch_size = engine._max_cuda_graph_batch_size
-        self.max_beam_width = engine.max_beam_width
-        self.spec_config = engine.spec_config
+        # High-level configuration from the config object
+        self.enabled = config.use_cuda_graph
+        self.padding_enabled = config.cuda_graph_padding_enabled
+        self.supported_batch_sizes = config.cuda_graph_batch_sizes
+        self.max_supported_batch_size = config.max_cuda_graph_batch_size
+        self.max_beam_width = config.max_beam_width
+        self.spec_config = config.spec_config
 
         self.graphs: Dict[Tuple[int, int, int], torch.cuda.CUDAGraph] = {}
         self.graph_outputs: Dict[Tuple[int, int, int],
                                  Callable[[], Optional[torch.Tensor]]] = {}
         self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {}
-        self.memory_pool = engine._cuda_graph_mem_pool
+        self.memory_pool = config.cuda_graph_mem_pool
         self.padding_dummy_request: Optional["Request"] = None
 
         self.shared_static_tensors: Dict[str, torch.Tensor] = {}
@@ -57,12 +108,11 @@ def __init__(self, engine: "PyTorchModelEngine"):
 
     def _create_shared_static_tensors(self):
         """Allocates static tensors sized for the largest possible batch."""
-        engine = self._get_engine()
-
-        token_per_request = self.max_possible_draft_len + 1
+        max_draft_len = self.config.original_max_total_draft_tokens if self.config.spec_config is not None else 0
+        token_per_request = max_draft_len + 1
         max_total_tokens = (self.max_supported_batch_size *
                             self.max_beam_width * token_per_request)
-        max_total_tokens = min(max_total_tokens, engine.max_num_tokens)
+        max_total_tokens = min(max_total_tokens, self.config.max_num_tokens)
 
         self.shared_static_tensors = {
             "input_ids":
@@ -71,7 +121,7 @@ def _create_shared_static_tensors(self):
             torch.zeros((1, max_total_tokens), device="cuda",
                         dtype=torch.int32),
         }
-        if engine.use_mrope:
+        if self.config.use_mrope:
             self.shared_static_tensors["position_ids"] = torch.zeros(
                 (3, 1, max_total_tokens), device="cuda", dtype=torch.int32)
             self.shared_static_tensors["multimodal_params"] = [
@@ -85,28 +135,17 @@ def _create_shared_static_tensors(self):
                 }) for _ in range(max_total_tokens)
             ]
 
-    @property
-    def enable_spec_decode(self):
-        return self._get_engine().enable_spec_decode
-
-    @property
-    def max_possible_draft_len(self):
-        engine = self._get_engine()
-        return (engine.original_max_total_draft_tokens
-                if self.enable_spec_decode else 0)
-
     def get_graph_key(
             self,
             batch: ScheduledRequests,
             spec_resource_manager: Optional[BaseResourceManager] = None):
-        engine = self._get_engine()
-        if engine.is_draft_model and spec_resource_manager is not None and isinstance(
+        batch_size = batch.batch_size
+        if self.config.is_draft_model and spec_resource_manager is not None and isinstance(
                 spec_resource_manager, Eagle3ResourceManager):
             # If 'is_first_draft' is True, even with tree decoding, the length of draft_len will only be 'max_draft_len', not 'max_total_draft_token'.
             # Because we will pad the input to 'max_draft_len' length for the first draft layer.
-            draft_len = engine.original_max_draft_len if spec_resource_manager.is_first_draft else 0
-            key = (batch.batch_size, draft_len,
-                   spec_resource_manager.is_first_draft)
+            draft_len = self.config.original_max_draft_len if spec_resource_manager.is_first_draft else 0
+            key = (batch_size, draft_len, spec_resource_manager.is_first_draft)
         else:
             # With dynamic spec decode, the draft length maybe zero even when enable_spec_decode is True,
             # so we need to get the draft length from the batch instead of using enable_spec_decode.
@@ -116,56 +155,38 @@ def get_graph_key(
                 draft_len = max(draft_len_list)
                 assert len(
                     set(draft_len_list)) == 1, "All draft lengths must be the same"
-            key = (batch.batch_size, draft_len, False)
+            key = (batch_size, draft_len, False)
         return key
 
-    @property
-    def spec_metadata(self):
-        return self._get_engine().spec_metadata
-
-    @property
-    def draft_tokens_cuda(self):
-        return self._get_engine().draft_tokens_cuda
-
-    @property
-    def attn_metadata(self):
-        return self._get_engine().attn_metadata
-
     def __del__(self):
         self.clear()
 
-    def _get_engine(self) -> "PyTorchModelEngine":
-        """Safely dereferences the weak reference to the engine."""
-        engine = self.engine_ref()
-        if engine is None:
-            raise RuntimeError(
-                "The parent PyTorchModelEngine has been garbage collected.")
-        return engine
-
     def maybe_get_cuda_graph(
-            self,
-            batch: ScheduledRequests,
-            spec_resource_manager: Optional[BaseResourceManager] = None):
+        self,
+        batch: ScheduledRequests,
+        iter_counter: int,
+        enable_spec_decode: bool,
+        attn_metadata: Any,
+        spec_metadata: Optional[Any] = None,
+        draft_tokens_cuda: Optional[torch.Tensor] = None,
+        spec_resource_manager: Optional[BaseResourceManager] = None,
+    ) -> Tuple[Optional[Any], Optional[Any], Optional[Tuple[int, int, bool]]]:
        """
        Determines if the current batch can be run with a CUDA graph.

        Returns a tuple containing:
-        - A boolean indicating if a graph can be used.
        - The attn_metadata for the graph, if applicable.
        - The spec_metadata for the graph, if applicable.
-        - The key for the graph.
+        - The key for the graph, if applicable.
""" - engine = self._get_engine() - # disable when doing statistic - if hasattr(engine, 'iter_counter') and ExpertStatistic.set_iter( - engine.iter_counter): - return False, None, None, None + if ExpertStatistic.set_iter(iter_counter): + return None, None, None can_run_cuda_graph = batch.can_run_cuda_graph batch_size = batch.batch_size - if self.enabled and engine.enable_attention_dp and engine.mapping.tp_size > 1: - all_can_graph_batch = engine.dist.tp_allgather( + if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1: + all_can_graph_batch = self.config.dist.tp_allgather( [can_run_cuda_graph, batch_size]) is_all_gen_only = all(all_can_graph[0] for all_can_graph in all_can_graph_batch) @@ -174,34 +195,33 @@ def maybe_get_cuda_graph( for all_gen_only in all_can_graph_batch) if not is_all_gen_only or not all_batch_size_equal: - return False, None, None, None + return None, None, None if not self.enabled or not can_run_cuda_graph: - return False, None, None, None + return None, None, None key = self.get_graph_key(batch, spec_resource_manager) if key in self.graphs: - return True, self.graph_metadata[key][ + return self.graph_metadata[key][ "attn_metadata"], self.graph_metadata[key]["spec_metadata"], key if batch_size not in self.supported_batch_sizes: - return False, None, None, None + return None, None, None num_sequences_in_batch = batch_size * self.max_beam_width - attn_metadata = self.attn_metadata.create_cuda_graph_metadata( + graph_attn_metadata = attn_metadata.create_cuda_graph_metadata( num_sequences_in_batch, False, key[1], self.cuda_graph_meta_buffers) - assert attn_metadata.is_cuda_graph + assert graph_attn_metadata.is_cuda_graph - if self.enable_spec_decode: - spec_metadata = self.spec_metadata.create_cuda_graph_metadata( + if enable_spec_decode: + graph_spec_metadata = spec_metadata.create_cuda_graph_metadata( num_sequences_in_batch) - spec_metadata.draft_tokens = self.draft_tokens_cuda + graph_spec_metadata.draft_tokens = draft_tokens_cuda else: - spec_metadata = None - return True, attn_metadata, spec_metadata, key + graph_spec_metadata = None + return graph_attn_metadata, graph_spec_metadata, key def needs_capture(self, key: Tuple[int, int, int]): - return key not in self.graph_outputs def get_graph_pool(self): @@ -217,9 +237,9 @@ def capture(self, key: Tuple[int, int, int], forward_fn: Callable, initial_inputs: Dict[str, Any], + enable_spec_decode: bool = False, postprocess_fn: Optional[Callable] = None): """Captures the forward pass for a given batch size.""" - engine = self._get_engine() batch_size = key[0] # [CUDA graph spec decode padding] # We pad input IDs/position IDs to the maximum draft length (token per request). @@ -236,7 +256,7 @@ def capture(self, self.shared_static_tensors["position_ids"] [:, :num_tokens_for_capture], } - if engine.use_mrope: + if self.config.use_mrope: sliced_static_tensors["position_ids"] = self.shared_static_tensors[ "position_ids"][:, :, :num_tokens_for_capture], sliced_static_tensors[ @@ -254,12 +274,10 @@ def capture(self, def _setup_spec_decoding_and_forward(key: Tuple[int, int, int], forward_fn: Callable, capture_inputs: Dict[str, Any]): - engine = self._get_engine() - # for the first inference of draft model, we need to set the use_spec_decoding to True when capture the graph for multiple runs. 
            is_first_draft = key[2]
-            needs_kv_cache_recompute = True if engine.enable_spec_decode and engine.spec_config.spec_dec_mode.needs_kv_cache_recompute(
+            needs_kv_cache_recompute = True if enable_spec_decode and self.config.spec_config.spec_dec_mode.needs_kv_cache_recompute(
             ) else False
-            if is_first_draft and engine.is_draft_model and needs_kv_cache_recompute:
+            if is_first_draft and self.config.is_draft_model and needs_kv_cache_recompute:
                 capture_inputs['attn_metadata'].use_spec_decoding = True
             return forward_fn(capture_inputs)
@@ -288,7 +306,6 @@ def _setup_spec_decoding_and_forward(key: Tuple[int, int, int],
     def replay(self, key: Tuple[int, int, int],
                current_inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
         """Replays a previously captured graph."""
-        engine = self._get_engine()
         stored_meta = self.graph_metadata[key]
         assert current_inputs["attn_metadata"] is stored_meta["attn_metadata"]
         if stored_meta["spec_metadata"] is not None:
@@ -302,7 +319,7 @@ def replay(self, key: Tuple[int, int, int],
         static_tensors["input_ids"][:seqlen].copy_(input_ids)
 
         position_ids = current_inputs["position_ids"]
-        if engine.use_mrope and current_inputs.get(
+        if self.config.use_mrope and current_inputs.get(
                 'multimodal_params') is not None:
             static_tensors["position_ids"][:, :, :seqlen].copy_(position_ids)
             for i, multimodal_param in enumerate(
@@ -322,16 +339,16 @@ def replay(self, key: Tuple[int, int, int],
         return output_ref
 
     def _get_padded_batch(self, batch: ScheduledRequests,
-                          resource_manager: ResourceManager) -> int:
-        engine = self._get_engine()
+                          resource_manager: ResourceManager,
+                          runtime_draft_len: int) -> int:
         kv_cache_manager = resource_manager.get_resource_manager(
-            engine.kv_cache_manager_key)
+            self.config.kv_cache_manager_key)
         can_run_cuda_graph = batch.can_run_cuda_graph
         batch_size = batch.batch_size
         new_batch_size = batch_size
 
-        if self.enabled and engine.enable_attention_dp and engine.mapping.tp_size > 1:
-            graph_batch_size = engine.dist.tp_allgather(
+        if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
+            graph_batch_size = self.config.dist.tp_allgather(
                 [can_run_cuda_graph, batch_size])
             all_can_graph = all(graph_batch[0]
                                 for graph_batch in graph_batch_size)
@@ -349,7 +366,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             return 0
 
         padding_size = padded_batch_size - batch_size
-        if padding_size + batch.batch_size > engine.batch_size:
+        if padding_size + batch.batch_size > self.config.batch_size:
             return 0
 
         # No padding if it would create too many concurrent requests.
@@ -364,9 +381,9 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
                 [CUDA_GRAPH_DUMMY_REQUEST_ID],
                 is_gen=True,
-                max_num_draft_tokens=engine.runtime_draft_len,
-                use_mrope=engine.use_mrope,
-                max_beam_width=engine.max_beam_width)[0]
+                max_num_draft_tokens=runtime_draft_len,
+                use_mrope=self.config.use_mrope,
+                max_beam_width=self.config.max_beam_width)[0]
             self.padding_dummy_request.is_cuda_graph_dummy = True
             spec_res_mgr = resource_manager.get_resource_manager(
                 ResourceManagerType.SPEC_RESOURCE_MANAGER)
@@ -387,12 +404,14 @@ def _round_up_batch_size(self, batch_size: int) -> int:
         return self.supported_batch_sizes[idx]
 
     @contextlib.contextmanager
-    def pad_batch(self, scheduled_requests: ScheduledRequests,
-                  resource_manager: ResourceManager):
+    def pad_batch(self,
+                  scheduled_requests: ScheduledRequests,
+                  resource_manager: ResourceManager,
+                  runtime_draft_len: int = 0):
         """Context manager to pad a batch to a graph-compatible size."""
-        padding_size = self._get_padded_batch(scheduled_requests,
-                                              resource_manager)
+        padding_size = self._get_padded_batch(scheduled_requests,
+                                              resource_manager,
+                                              runtime_draft_len)
         try:
             yield scheduled_requests
         finally:
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index e3c12e36b49..5c2b9ac9f6a 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -55,7 +55,7 @@
                          set_per_request_piecewise_cuda_graph_flag,
                          set_torch_compiling, with_model_extra_attrs)
 from .config_utils import is_mla
-from .cuda_graph_runner import CUDAGraphRunner
+from .cuda_graph_runner import CUDAGraphRunner, CUDAGraphRunnerConfig
 from .guided_decoder import CapturableGuidedDecoder
 from .layerwise_nvtx_marker import LayerwiseNvtxMarker
 from .llm_request import get_draft_token_length
@@ -370,9 +370,31 @@ def __init__(
         # We look up this key in resource_manager during forward to find the
         # kv cache manager. Can be changed to support multiple model engines
         # with different KV cache managers.
-        self.kv_cache_manager_key = ResourceManagerType.KV_CACHE_MANAGER
+        self.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER if is_draft_model else ResourceManagerType.KV_CACHE_MANAGER
         self.lora_model_config: Optional[LoraModelConfig] = None
-        self.cuda_graph_runner = CUDAGraphRunner(self)
+
+        # Create config and runner
+        cuda_graph_runner_config = CUDAGraphRunnerConfig(
+            use_cuda_graph=self.cuda_graph_config is not None,
+            cuda_graph_padding_enabled=self._cuda_graph_padding_enabled,
+            cuda_graph_batch_sizes=self._cuda_graph_batch_sizes,
+            max_cuda_graph_batch_size=self._max_cuda_graph_batch_size,
+            max_beam_width=self.max_beam_width,
+            spec_config=self.spec_config,
+            cuda_graph_mem_pool=self._cuda_graph_mem_pool,
+            max_num_tokens=self.max_num_tokens,
+            use_mrope=self.use_mrope,
+            original_max_draft_len=self.original_max_draft_len,
+            original_max_total_draft_tokens=self.
+            original_max_total_draft_tokens,
+            is_draft_model=self.is_draft_model,
+            enable_attention_dp=self.enable_attention_dp,
+            batch_size=self.batch_size,
+            mapping=self.mapping,
+            dist=self.dist,
+            kv_cache_manager_key=self.kv_cache_manager_key,
+        )
+        self.cuda_graph_runner = CUDAGraphRunner(cuda_graph_runner_config)
 
         # Setup the local cache indirection buffer only once and reuse it.
         # This way it can also be used for CUDA graphs.
@@ -2319,11 +2341,21 @@ def forward(
             return self._forward_step(inputs, gather_ids,
                                       gather_context_logits)
         with self.cuda_graph_runner.pad_batch(
-                scheduled_requests, resource_manager) as padded_requests:
-
-            maybe_graph, maybe_attn_metadata, maybe_spec_metadata, key = self.cuda_graph_runner.maybe_get_cuda_graph(
-                padded_requests, spec_resource_manager)
-            if maybe_graph:
+                scheduled_requests, resource_manager,
+                self.runtime_draft_len) as padded_requests:
+
+            maybe_attn_metadata, maybe_spec_metadata, key = self.cuda_graph_runner.maybe_get_cuda_graph(
+                padded_requests,
+                iter_counter=self.iter_counter,
+                enable_spec_decode=self.enable_spec_decode,
+                attn_metadata=attn_metadata,
+                spec_metadata=spec_metadata,
+                draft_tokens_cuda=self.draft_tokens_cuda
+                if self.is_spec_decode else None,
+                spec_resource_manager=spec_resource_manager,
+            )
+            can_run_graph = key is not None
+            if can_run_graph:
                 attn_metadata = maybe_attn_metadata
                 spec_metadata = maybe_spec_metadata
             else:
@@ -2339,7 +2371,7 @@ def forward(
         self.iter_counter += 1
 
         with with_shared_pool(self.cuda_graph_runner.get_graph_pool()):
-            if not maybe_graph:
+            if not can_run_graph:
                 # Fallback to eager execution if graph was not used
                 with MoeLoadBalancerIterContext(moe_load_balancer):
                     outputs = self._forward_step(inputs, gather_ids,
@@ -2357,9 +2389,12 @@ def capture_forward_fn(inputs: Dict[str, Any]):
                     def capture_postprocess_fn(inputs: Dict[str, Any]):
                         self._postprocess_inputs(inputs)
 
-                    self.cuda_graph_runner.capture(key, capture_forward_fn,
-                                                   inputs,
-                                                   capture_postprocess_fn)
+                    self.cuda_graph_runner.capture(
+                        key,
+                        capture_forward_fn,
+                        inputs,
+                        enable_spec_decode=self.enable_spec_decode,
+                        postprocess_fn=capture_postprocess_fn)
 
                     # here we don't need to use context since cuda graph capture didn't run kernel.
                     # maybe we need a cleaner way to do this.
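
Reviewer note: the forward() hunks above are easier to read outside diff form. The sketch below restates the new control flow under stated assumptions: the wrapper function, its parameter names, and the forward_fn callable are invented for this note and do not appear in model_engine.py; MoE load-balancer contexts, logits gathering, and the shared-pool context are omitted.

from typing import Any, Callable, Dict, Optional

from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner


def run_decode_step(runner: CUDAGraphRunner,
                    scheduled_requests: Any,
                    resource_manager: Any,
                    runtime_draft_len: int,
                    iter_counter: int,
                    enable_spec_decode: bool,
                    attn_metadata: Any,
                    spec_metadata: Optional[Any],
                    draft_tokens_cuda: Optional[Any],
                    spec_resource_manager: Optional[Any],
                    forward_fn: Callable[[Dict[str, Any]], Any],
                    inputs: Dict[str, Any]) -> Any:
    """Hypothetical wrapper mirroring how PyTorchModelEngine.forward() now drives the runner."""
    with runner.pad_batch(scheduled_requests, resource_manager,
                          runtime_draft_len) as padded_requests:
        # The runner returns (attn_metadata, spec_metadata, key); a non-None key
        # replaces the old boolean "maybe_graph" flag.
        graph_attn, graph_spec, key = runner.maybe_get_cuda_graph(
            padded_requests,
            iter_counter=iter_counter,
            enable_spec_decode=enable_spec_decode,
            attn_metadata=attn_metadata,
            spec_metadata=spec_metadata,
            draft_tokens_cuda=draft_tokens_cuda,
            spec_resource_manager=spec_resource_manager)
        if key is None:
            # Eager fallback: runner disabled, ineligible batch, or stats collection.
            return forward_fn(inputs)
        if runner.needs_capture(key):
            # enable_spec_decode is now passed in explicitly instead of read off the engine.
            runner.capture(key, forward_fn, inputs,
                           enable_spec_decode=enable_spec_decode)
        return runner.replay(key, inputs)
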
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 813585950c9..3f7658354ab 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -384,7 +384,6 @@ def drafting_loop_wrapper(model):
             # For DeepseekV3 MTP, we need to set the num_hidden_layers to 1 for the draft model
             if spec_config.spec_dec_mode.is_mtp_eagle():
                 draft_model_engine.model.model_config.pretrained_config.num_hidden_layers = 1
-            draft_model_engine.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER
             draft_model_engine.load_weights_from_target_model(
                 model_engine.model)
         else:
diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py
index a915956f0a9..163309cb1f4 100644
--- a/tests/unittest/_torch/helpers.py
+++ b/tests/unittest/_torch/helpers.py
@@ -3,7 +3,10 @@
 import torch
 import torch.nn.functional as F
 
-from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
+from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import (
+    CUDAGraphRunner, CUDAGraphRunnerConfig)
+from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
+from tensorrt_llm.mapping import Mapping
 
 
 def ceil_div(x: int, y: int) -> int:
@@ -166,42 +169,23 @@ def block_scale_gemm(mat_a: torch.Tensor, mat_scale_a: torch.Tensor,
     return results.view_as(x)
 
 
-class MockPytorchBackendConfig:
-
-    def __init__(self, use_cuda_graph, cuda_graph_padding_enabled):
-        self.use_cuda_graph = use_cuda_graph
-        self.cuda_graph_padding_enabled = cuda_graph_padding_enabled
-
-
-class MockEngine:
-    """A replacement for SimpleNamespace that supports weak references."""
-
-    def __init__(self, **kwargs):
-        self.__dict__.update(kwargs)
-
-
-def create_mock_engine(batch_size: int):
-
-    class MockSpecConfig:
-
-        class SpecDecMode:
-
-            def needs_kv_cache_recompute(self):
-                return False
-
-        spec_dec_mode = SpecDecMode()
-
-    return MockEngine(
-        llm_args=TorchLlmArgs(model="dummy"),
-        _cuda_graph_padding_enabled=True,
-        _cuda_graph_batch_sizes=[batch_size],
-        _max_cuda_graph_batch_size=batch_size,
+def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False):
+    config = CUDAGraphRunnerConfig(
+        use_cuda_graph=True,
+        cuda_graph_padding_enabled=False,
+        cuda_graph_batch_sizes=[batch_size],
+        max_cuda_graph_batch_size=batch_size,
+        batch_size=batch_size,
         max_beam_width=1,
-        max_num_tokens=8192,
-        is_spec_decode=False,
-        enable_spec_decode=False,
-        spec_config=MockSpecConfig(),
+        max_num_tokens=1,
+        use_mrope=use_mrope,
+        spec_config=None,
+        cuda_graph_mem_pool=None,
+        enable_attention_dp=False,
+        original_max_draft_len=0,
+        original_max_total_draft_tokens=0,
         is_draft_model=False,
-        _cuda_graph_mem_pool=None,
-        use_mrope=False,
-    )
+        mapping=Mapping(),
+        dist=None,
+        kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER)
+    return CUDAGraphRunner(config)
diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py
index 28a35323b6e..a224cea1186 100644
--- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py
+++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py
@@ -22,7 +22,7 @@ class Exaone4Config(PretrainedConfig):
 # TODO: Remove this once we have a proper config for Exaone4
 SKIP_EXAONE4_HF_ACCURACY_TEST = True
 
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers.cache_utils import HybridCache
 from utils.util import getSMVersion
@@ -31,7 +31,6 @@ class Exaone4Config(PretrainedConfig):
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_exaone4 import Exaone4ForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -338,10 +337,8 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_llama.py b/tests/unittest/_torch/modeling/test_modeling_llama.py
index 0fdfa7ff0fa..9b9c9e53874 100644
--- a/tests/unittest/_torch/modeling/test_modeling_llama.py
+++ b/tests/unittest/_torch/modeling/test_modeling_llama.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import LlamaConfig
 from transformers import LlamaForCausalLM as HFLlamaForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_llama import LlamaForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests
@@ -331,10 +330,8 @@ def test_llama_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
index 367dee787a0..941b15890e3 100644
--- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
+++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
@@ -4,7 +4,7 @@
 
 import torch
 import transformers
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import Llama4Config
 from transformers import \
@@ -20,7 +20,6 @@
     Llama4HfWeightMapper
 from tensorrt_llm._torch.models.modeling_llama import \
     Llama4ForConditionalGeneration
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -406,10 +405,8 @@ def test_llama_allclose_to_hf(self, scenario: AllCloseScenario) -> None:
                 input_ids.size(-1) + gen_input_ids.size(-1))
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mistral.py b/tests/unittest/_torch/modeling/test_modeling_mistral.py
index a79e9415bdb..2be7f4acdbf 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mistral.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mistral.py
@@ -8,7 +8,7 @@
 import torch
 import transformers
 import transformers.models.mistral3
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from PIL import Image
 from utils.util import getSMVersion
 
@@ -19,7 +19,6 @@
 from tensorrt_llm._torch.attention_backend import utils as attention_utils
 from tensorrt_llm._torch.models import modeling_mistral
 from tensorrt_llm._torch.pyexecutor import resource_manager
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm.bindings import executor as executor_lib
 from tensorrt_llm.models import modeling_utils
 
@@ -404,10 +403,8 @@ def test_mistral_3_vlm_allclose_to_hf(mistral_small_3_1_24b_config, backend, use
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-    graph_runner = None
-    if use_cuda_graph:
-        mock_engine = create_mock_engine(1)
-        graph_runner = CUDAGraphRunner(mock_engine)
+    graph_runner = create_mock_cuda_graph_runner(1) if use_cuda_graph else None
 
     def run_forward(input_ids, position_ids, attn_metadata):
         attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mixtral.py b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
index b8beecaa772..7071a440ff5 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mixtral.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import MixtralConfig
 from transformers import MixtralForCausalLM as HFMixtralForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.mixtral_weight_mapper import \
     MixtralHfWeightMapper
 from tensorrt_llm._torch.models.modeling_mixtral import MixtralForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -310,10 +309,8 @@ def test_mixtral_allclose_to_hf(self, scenario: Scenario):
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_mllama.py b/tests/unittest/_torch/modeling/test_modeling_mllama.py
index 597c084b41d..a9423b86d35 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mllama.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mllama.py
@@ -4,7 +4,7 @@
 
 import pytest
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from test_modeling_llama import Scenario, reduce_llama_config
 from transformers import MllamaConfig
@@ -17,7 +17,6 @@
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_mllama import \
     MllamaForConditionalGeneration
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -420,10 +419,8 @@ def test_mllama_allclose_to_hf_text_only(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_multimodal.py b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
index 00817ae062f..c22ac3e308e 100644
--- a/tests/unittest/_torch/modeling/test_modeling_multimodal.py
+++ b/tests/unittest/_torch/modeling/test_modeling_multimodal.py
@@ -8,7 +8,7 @@
 from typing import Dict, List, Optional, Tuple, Type
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
 from utils.llm_data import llm_models_root
 
@@ -17,7 +17,6 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm._utils import str_dtype_to_torch
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -425,8 +424,7 @@ def run_trtllm_forward(self, trtllm_inputs, use_cuda_graph: bool = False):
             trtllm_inputs["attn_metadata"].prepare()
             return self.trtllm_model.forward(**trtllm_inputs)
         else:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+            graph_runner = create_mock_cuda_graph_runner(1)
             trtllm_inputs["attn_metadata"] = trtllm_inputs[
                 "attn_metadata"
             ].create_cuda_graph_metadata(1)
diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron.py b/tests/unittest/_torch/modeling/test_modeling_nemotron.py
index d06a6bc6b81..2dcac56ea55 100644
--- a/tests/unittest/_torch/modeling/test_modeling_nemotron.py
+++ b/tests/unittest/_torch/modeling/test_modeling_nemotron.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import NemotronConfig
 from transformers import NemotronForCausalLM as HFNemotronForCausalLM
@@ -15,7 +15,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_nemotron import NemotronForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -318,10 +317,8 @@ def test_nemotron_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_phi3.py b/tests/unittest/_torch/modeling/test_modeling_phi3.py
index 1a50b874ae5..1f7f0316611 100644
--- a/tests/unittest/_torch/modeling/test_modeling_phi3.py
+++ b/tests/unittest/_torch/modeling/test_modeling_phi3.py
@@ -4,7 +4,7 @@
 from typing import Any
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from transformers import Phi3Config
 from transformers import Phi3ForCausalLM as HFPhi3ForCausalLM
 from utils.util import default_dtype
@@ -14,7 +14,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_phi3 import Phi3ForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -310,10 +309,8 @@ def test_phi3_allclose_to_hf(self) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen.py b/tests/unittest/_torch/modeling/test_modeling_qwen.py
index a35dc9131f6..d2f9cdaac73 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen.py
@@ -17,12 +17,11 @@
 from tensorrt_llm._torch.models.modeling_qwen import (
     Qwen2ForCausalLM, Qwen2ForProcessRewardModel)
 # yapf: enable
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from utils.llm_data import llm_models_root
 from utils.util import getSMVersion
 
@@ -265,10 +264,8 @@ def test_qwen_allclose_to_hf(self, scenario: Scenario) -> None:
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
index 8d6c8649412..56f71d2bad0 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
@@ -3,7 +3,7 @@
 from typing import List
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from test_modeling_multimodal import MultimodalScenario, TestModelingMultimodal
 from transformers import Qwen2_5_VLConfig
 from transformers import \
@@ -13,7 +13,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.qwen2vl_weight_mapper import \
     Qwen2VLHfWeightMapper
 from tensorrt_llm._torch.models.modeling_qwen2vl import Qwen2_5_VLModel
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 
 QWEN2_5_VL_7B_CONFIG = {
     "architectures": ["Qwen2_5_VLForConditionalGeneration"],
@@ -187,10 +186,8 @@ def run_trtllm_forward(self, trtllm_inputs, use_cuda_graph: bool = False):
             trtllm_inputs["attn_metadata"].prepare()
             return self.trtllm_model.forward(**trtllm_inputs)
         else:
-            mock_engine = create_mock_engine(1)
             # NOTE: Qwen2.5-VL model uses mrope
-            mock_engine.use_mrope = True
-            graph_runner = CUDAGraphRunner(mock_engine)
+            graph_runner = create_mock_cuda_graph_runner(1, True)
             trtllm_inputs["attn_metadata"] = trtllm_inputs[
                 "attn_metadata"].create_cuda_graph_metadata(1)
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
index b8db3be83d6..39cbf33b823 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 
 import torch
-from _torch.helpers import create_mock_engine
+from _torch.helpers import create_mock_cuda_graph_runner
 from parameterized import parameterized
 from transformers import Qwen2MoeConfig
 from transformers import Qwen2MoeForCausalLM as HFQwen2MoeForCausalLM
@@ -16,7 +16,6 @@
 from tensorrt_llm._torch.models.checkpoints.hf.qwen2_moe_weight_mapper import \
     Qwen2MoeHfWeightMapper
 from tensorrt_llm._torch.models.modeling_qwen_moe import Qwen2MoeForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -315,10 +314,8 @@ def test_qwen_moe_allclose_to_hf(self, scenario: Scenario):
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
-        graph_runner = None
-        if scenario.use_cuda_graph:
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+        graph_runner = create_mock_cuda_graph_runner(
+            1) if scenario.use_cuda_graph else None
 
         def run_forward(input_ids, position_ids, attn_metadata):
             attn_metadata.prepare()
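
End-of-patch note: a minimal sketch of how a test can drive the runner through the new create_mock_cuda_graph_runner helper. It assumes a model and an attn_metadata prepared the way the modeling tests above prepare them; the wrapper function, the fixed graph key, and the forward_fn closure are illustrative and not copied from any test file.

from typing import Any, Dict

from _torch.helpers import create_mock_cuda_graph_runner


def run_with_mock_runner(model: Any, inputs: Dict[str, Any]) -> Any:
    """Hypothetical test helper: capture a batch-size-1 graph once, then replay it."""
    # Build a runner around a CUDAGraphRunnerConfig for batch size 1
    # (pass use_mrope=True for mrope models such as Qwen2.5-VL).
    graph_runner = create_mock_cuda_graph_runner(1)

    # Graphs are keyed by (batch_size, draft_len, is_first_draft); with no
    # speculative decoding the key for a single-request batch is (1, 0, False).
    key = (1, 0, False)
    inputs["attn_metadata"] = inputs["attn_metadata"].create_cuda_graph_metadata(1)

    def forward_fn(capture_inputs: Dict[str, Any]) -> Any:
        return model.forward(**capture_inputs)

    if graph_runner.needs_capture(key):
        graph_runner.capture(key, forward_fn, inputs)
    return graph_runner.replay(key, inputs)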