2 changes: 1 addition & 1 deletion examples/llm-api/quickstart_advanced.py
@@ -84,7 +84,6 @@ def add_llm_args(parser):
    parser.add_argument('--disable_kv_cache_reuse',
                        default=False,
                        action='store_true')
-    parser.add_argument("--kv_cache_fraction", type=float, default=None)

    # Runtime
    parser.add_argument('--disable_overlap_scheduler',
@@ -170,6 +169,7 @@ def parse_arguments():
    parser = argparse.ArgumentParser(
        description="LLM models with the PyTorch workflow.")
    parser = add_llm_args(parser)
+    parser.add_argument("--kv_cache_fraction", type=float, default=0.9)
    args = parser.parse_args()
    return args
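Net effect for the text-only quickstart: `--kv_cache_fraction` moves out of the shared `add_llm_args` helper into the script's own `parse_arguments`, with an explicit default of 0.9 instead of `None`. A minimal sketch of how the flag plausibly reaches the LLM API — `build_llm` and its wiring are illustrative assumptions, not code from this PR:

```python
# Illustrative sketch only: build_llm and its body are assumed, not from this PR.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig


def build_llm(model: str, kv_cache_fraction: float = 0.9) -> LLM:
    # With this change the CLI default (0.9) matches KvCacheConfig's new
    # field default, so forwarding the flag is a no-op unless overridden.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=kv_cache_fraction)
    return LLM(model=model, kv_cache_config=kv_cache_config)
```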
5 changes: 1 addition & 4 deletions examples/llm-api/quickstart_multimodal.py
@@ -150,13 +150,10 @@ def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Multimodal models with the PyTorch workflow.")
    parser = add_llm_args(parser)
+    parser.add_argument("--kv_cache_fraction", type=float, default=0.6)
    parser = add_multimodal_args(parser)
    parser = add_lora_args(parser)
    args = parser.parse_args()
-
-    if args.kv_cache_fraction is None:
-        args.kv_cache_fraction = 0.6  # lower the default kv cache fraction for multimodal
-
    return args
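The multimodal script keeps its lower 0.6 default (vision encoders and image features claim extra GPU memory, so less is reserved for KV cache), but now states it on the argparse argument instead of patching `args` after parsing. A self-contained sketch of the resulting behavior:

```python
import argparse

parser = argparse.ArgumentParser()
# New style: the lower multimodal default lives on the argument itself,
# so no post-parse None check is needed.
parser.add_argument("--kv_cache_fraction", type=float, default=0.6)

print(parser.parse_args([]).kv_cache_fraction)                              # 0.6
print(parser.parse_args(["--kv_cache_fraction", "0.4"]).kv_cache_fraction)  # 0.4
```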
8 changes: 1 addition & 7 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -101,12 +101,6 @@ def __init__(
        self._kv_cache_manager_cls = get_kv_cache_manager_cls(
            model_engine.model.model_config)

-    def _get_free_gpu_memory_fraction(self) -> float:
-        fraction = self._kv_cache_config.free_gpu_memory_fraction
-        if fraction is None:
-            fraction = 0.9
-        return fraction
-
    def _get_kv_size_per_token(self):
        model_config = self._model_engine.model.model_config
        mapping = self._mapping
@@ -299,7 +293,7 @@ def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
        # TODO: support CP by generating dummy requests for it.
        assert 'cp_type' not in mapping.cp_config

-        fraction = self._get_free_gpu_memory_fraction()
+        fraction = self._kv_cache_config.free_gpu_memory_fraction

        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
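With the default now set once on the Pydantic field, the late `None`-coalescing helper in the executor is dead code, and the capacity planner reads the config value directly. The simplification in miniature — `KvConfig` below is a stand-in, not the real `KvCacheConfig` or executor code:

```python
# Standalone sketch; KvConfig is a stand-in for the real config class.
from dataclasses import dataclass
from typing import Optional


@dataclass
class KvConfig:
    free_gpu_memory_fraction: Optional[float] = None


def fraction_before(cfg: KvConfig) -> float:
    # Old behavior: the 0.9 default was applied late, inside the executor.
    f = cfg.free_gpu_memory_fraction
    return 0.9 if f is None else f


def fraction_after(cfg: KvConfig) -> float:
    # New behavior: the field default (0.9) is applied at construction,
    # so a direct read suffices.
    assert cfg.free_gpu_memory_fraction is not None
    return cfg.free_gpu_memory_fraction
```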
2 changes: 1 addition & 1 deletion tensorrt_llm/commands/serve.py
@@ -84,7 +84,7 @@ def get_llm_args(model: str,
                 pipeline_parallel_size: int = 1,
                 moe_expert_parallel_size: Optional[int] = None,
                 gpus_per_node: Optional[int] = None,
-                free_gpu_memory_fraction: Optional[float] = None,
+                free_gpu_memory_fraction: float = 0.9,
                 num_postprocess_workers: int = 0,
                 trust_remote_code: bool = False,
                 reasoning_parser: Optional[str] = None,
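`trtllm-serve` follows the same pattern: the default is stated on the signature rather than threading `None` through to `KvCacheConfig`. Hypothetical calls (the import path matches the file in this diff; the model name is a placeholder):

```python
# Hypothetical usage; the model name is a placeholder.
from tensorrt_llm.commands.serve import get_llm_args

args = get_llm_args(model="meta-llama/Llama-3.1-8B")  # fraction defaults to 0.9
args = get_llm_args(model="meta-llama/Llama-3.1-8B",
                    free_gpu_memory_fraction=0.5)      # explicit override
```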
12 changes: 11 additions & 1 deletion tensorrt_llm/llmapi/llm_args.py
@@ -1241,7 +1241,7 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
        description=
        "Number of sink tokens (tokens to always keep in attention window).")
    free_gpu_memory_fraction: Optional[float] = Field(
-        default=None,
+        default=0.9,
        description=
        "The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
    )
@@ -1323,6 +1323,16 @@ def _to_pybind(self):
            attention_dp_events_gather_period_ms,
            max_gpu_total_bytes=self.max_gpu_total_bytes)

+    @field_validator('free_gpu_memory_fraction')
+    @classmethod
+    def validate_free_gpu_memory_fraction(cls, v: float):
+        """Validates that the fraction is between 0.0 and 1.0."""
+        if not 0 <= v <= 1:
+            raise ValueError(
+                "kv_cache_config.free_gpu_memory_fraction must be a float between 0 and 1"
+            )
+        return v
+
    @field_validator('max_gpu_total_bytes')
    @classmethod
    def validate_max_gpu_total_bytes(cls, v: int):
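The new validator makes bad fractions fail fast at construction. Since `KvCacheConfig` is a Pydantic model and Pydantic v2's `ValidationError` subclasses `ValueError`, the failure is catchable as a plain `ValueError` — a small sketch of the behavior:

```python
from tensorrt_llm.llmapi import KvCacheConfig

print(KvCacheConfig().free_gpu_memory_fraction)  # 0.9, the new default
KvCacheConfig(free_gpu_memory_fraction=0.5)      # accepted

try:
    KvCacheConfig(free_gpu_memory_fraction=1.5)  # rejected by the validator
except ValueError as err:
    print(err)  # "...free_gpu_memory_fraction must be a float between 0 and 1"
```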
4 changes: 0 additions & 4 deletions tests/unittest/llmapi/test_llm_args.py
@@ -180,10 +180,6 @@ def test_KvCacheConfig_declaration():
    assert pybind_config.attention_dp_events_gather_period_ms == 10


-def test_KvCacheConfig_default_values():
-    check_defaults(KvCacheConfig, tle.KvCacheConfig)
-
-
def test_CapacitySchedulerPolicy():
    val = CapacitySchedulerPolicy.MAX_UTILIZATION
    assert PybindMirror.maybe_to_pybind(
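A plausible reading of the test removal: `check_defaults` asserts that Python-side defaults mirror the C++ pybind defaults, and `free_gpu_memory_fraction` now intentionally diverges (0.9 in Python vs. unset in the bindings), so the blanket check no longer holds by design. Roughly the logic that would now fail — the body of `check_defaults` is assumed here, not taken from the test suite:

```python
# Assumed shape of the removed check; not the actual helper's code.
def check_defaults(py_cls, pybind_cls):
    cpp = pybind_cls()  # pybind object carrying the C++ defaults
    for name, field in py_cls.model_fields.items():
        if hasattr(cpp, name):
            # With this PR, free_gpu_memory_fraction would fail here:
            # 0.9 on the Python side vs. the C++ default.
            assert field.default == getattr(cpp, name)
```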