diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py
index 3e60d0f48cb..d104c5c27be 100644
--- a/examples/llm-api/quickstart_advanced.py
+++ b/examples/llm-api/quickstart_advanced.py
@@ -84,7 +84,6 @@ def add_llm_args(parser):
     parser.add_argument('--disable_kv_cache_reuse',
                         default=False,
                         action='store_true')
-    parser.add_argument("--kv_cache_fraction", type=float, default=None)
 
     # Runtime
     parser.add_argument('--disable_overlap_scheduler',
@@ -170,6 +169,7 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="LLM models with the PyTorch workflow.")
     parser = add_llm_args(parser)
+    parser.add_argument("--kv_cache_fraction", type=float, default=0.9)
     args = parser.parse_args()
     return args
 
diff --git a/examples/llm-api/quickstart_multimodal.py b/examples/llm-api/quickstart_multimodal.py
index 526bcf3ab06..66721a2526d 100644
--- a/examples/llm-api/quickstart_multimodal.py
+++ b/examples/llm-api/quickstart_multimodal.py
@@ -150,13 +150,10 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="Multimodal models with the PyTorch workflow.")
     parser = add_llm_args(parser)
+    parser.add_argument("--kv_cache_fraction", type=float, default=0.6)
     parser = add_multimodal_args(parser)
     parser = add_lora_args(parser)
     args = parser.parse_args()
-
-    if args.kv_cache_fraction is None:
-        args.kv_cache_fraction = 0.6  # lower the default kv cache fraction for multimodal
-
     return args
 
 
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 4354495ef19..c2dec024cd1 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -101,12 +101,6 @@ def __init__(
         self._kv_cache_manager_cls = get_kv_cache_manager_cls(
             model_engine.model.model_config)
 
-    def _get_free_gpu_memory_fraction(self) -> float:
-        fraction = self._kv_cache_config.free_gpu_memory_fraction
-        if fraction is None:
-            fraction = 0.9
-        return fraction
-
     def _get_kv_size_per_token(self):
         model_config = self._model_engine.model.model_config
         mapping = self._mapping
@@ -299,7 +293,7 @@ def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
         # TODO: support CP by generating dummy requests for it.
         assert 'cp_type' not in mapping.cp_config
 
-        fraction = self._get_free_gpu_memory_fraction()
+        fraction = self._kv_cache_config.free_gpu_memory_fraction
 
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index 1a75af598f1..324258dd74a 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -84,7 +84,7 @@ def get_llm_args(model: str,
                  pipeline_parallel_size: int = 1,
                  moe_expert_parallel_size: Optional[int] = None,
                  gpus_per_node: Optional[int] = None,
-                 free_gpu_memory_fraction: Optional[float] = None,
+                 free_gpu_memory_fraction: float = 0.9,
                  num_postprocess_workers: int = 0,
                  trust_remote_code: bool = False,
                  reasoning_parser: Optional[str] = None,
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 30f091a3cf5..8f6d5f24049 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1241,7 +1241,7 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
         description=
         "Number of sink tokens (tokens to always keep in attention window).")
     free_gpu_memory_fraction: Optional[float] = Field(
-        default=None,
+        default=0.9,
         description=
         "The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
     )
@@ -1323,6 +1323,16 @@ def _to_pybind(self):
             attention_dp_events_gather_period_ms,
             max_gpu_total_bytes=self.max_gpu_total_bytes)
 
+    @field_validator('free_gpu_memory_fraction')
+    @classmethod
+    def validate_free_gpu_memory_fraction(cls, v: float):
+        """Validates that the fraction is between 0.0 and 1.0."""
+        if not 0 <= v <= 1:
+            raise ValueError(
+                "kv_cache_config.free_gpu_memory_fraction must be a float between 0 and 1"
+            )
+        return v
+
     @field_validator('max_gpu_total_bytes')
     @classmethod
     def validate_max_gpu_total_bytes(cls, v: int):
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index b4579e346c0..cd646c33781 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -180,10 +180,6 @@ def test_KvCacheConfig_declaration():
     assert pybind_config.attention_dp_events_gather_period_ms == 10
 
 
-def test_KvCacheConfig_default_values():
-    check_defaults(KvCacheConfig, tle.KvCacheConfig)
-
-
 def test_CapacitySchedulerPolicy():
     val = CapacitySchedulerPolicy.MAX_UTILIZATION
     assert PybindMirror.maybe_to_pybind(
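
Usage note (not part of the patch): the sketch below illustrates the behavior this diff introduces, namely the explicit 0.9 default and the new range validator on `KvCacheConfig.free_gpu_memory_fraction`. It assumes `KvCacheConfig` is importable from `tensorrt_llm.llmapi`, as in current TensorRT-LLM releases; adjust the import if the package layout differs.

    # Minimal sketch of the new KvCacheConfig default and validator.
    from tensorrt_llm.llmapi import KvCacheConfig

    # The field now carries an explicit default instead of None, so callers
    # no longer need to resolve a missing fraction themselves.
    cfg = KvCacheConfig()
    assert cfg.free_gpu_memory_fraction == 0.9

    # Values outside [0, 1] are rejected at construction time. Pydantic
    # surfaces the validator's ValueError as a ValidationError, which is
    # itself a ValueError subclass, so a plain `except ValueError` catches it.
    try:
        KvCacheConfig(free_gpu_memory_fraction=1.5)
    except ValueError as err:
        print(err)  # message includes the "between 0 and 1" requirement

Moving the default onto the Pydantic field, instead of resolving `None` separately at each call site, keeps the fallback in one place; that is what lets the patch delete `_get_free_gpu_memory_fraction` in `_util.py`, the `None` check in `quickstart_multimodal.py`, and the now-redundant `test_KvCacheConfig_default_values` test.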