Skip to content

Commit 0aa1356

Browse files
author
Minsung-commit
committed
[Core] Add token-level KV cache metrics to V1 engine
Add token-level KV cache metrics (total, used, free) to complement existing percentage-based metrics in the V1 engine. ## Motivation Current V1 engine only provides kv_cache_usage as percentage (0.0-1.0). Absolute token counts are critical for: - Capacity planning: "28k tokens left" vs "35% free" - Cost accounting: Token-based billing - Monitoring: Prometheus/Grafana dashboards - Debugging: Understanding exact cache state ## Changes 1. **vllm/v1/metrics/stats.py**: Add fields to SchedulerStats - kv_cache_total_tokens: Total capacity - kv_cache_used_tokens: Currently occupied - kv_cache_free_tokens: Available space 2. **vllm/v1/core/block_pool.py**: Add get_num_total_blocks() - Returns total GPU blocks (excludes 1 reserved block) 3. **vllm/v1/core/kv_cache_manager.py**: Add properties - total_tokens, free_tokens, used_tokens - Derives block_size from coordinator (handles DCP/PCP scaling) 4. **vllm/v1/core/sched/scheduler.py**: Populate metrics in make_stats() ## Example Output Before: kv_cache_usage: 0.65 After: kv_cache_usage: 0.65 kv_cache_total_tokens: 82448 kv_cache_used_tokens: 53591 kv_cache_free_tokens: 28857 Addresses #12283, #26850 Signed-off-by: Minsung-commit <[email protected]>
1 parent 6fc5841 commit 0aa1356

File tree

4 files changed

+57
-1
lines changed

4 files changed

+57
-1
lines changed

vllm/v1/core/block_pool.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,18 @@ def get_num_free_blocks(self) -> int:
440440
"""
441441
return self.free_block_queue.num_free_blocks
442442

443+
def get_num_total_blocks(self) -> int:
444+
"""Get the total number of blocks in the pool.
445+
446+
Returns:
447+
The total number of GPU blocks available for allocation.
448+
449+
Note:
450+
Excludes 1 block reserved for system use to match
451+
internal allocation behavior.
452+
"""
453+
return self.num_gpu_blocks - 1
454+
443455
def get_usage(self) -> float:
444456
"""Get the KV cache usage.
445457

vllm/v1/core/kv_cache_manager.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@ def __init__(
106106
metrics_collector: KVCacheMetricsCollector | None = None,
107107
) -> None:
108108
self.max_model_len = max_model_len
109-
110109
self.enable_caching = enable_caching
111110
self.use_eagle = use_eagle
112111
self.log_stats = log_stats
@@ -130,6 +129,11 @@ def __init__(
130129
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
131130
self.block_pool = self.coordinator.block_pool
132131
self.kv_cache_config = kv_cache_config
132+
133+
# Get block_size from coordinator (includes DCP/PCP scaling)
134+
self.block_size = (
135+
self.coordinator.block_size if enable_caching else None
136+
)
133137

134138
# Pre-constructed KVCacheBlocks with no blocks, callers should use this
135139
# via create_kv_cache_blocks instead of creating new ones to avoid GC
@@ -149,6 +153,40 @@ def usage(self) -> float:
149153
"""
150154
return self.block_pool.get_usage()
151155

156+
@property
157+
def total_tokens(self) -> int:
158+
"""Get the total KV cache capacity in tokens.
159+
160+
Returns:
161+
Total number of tokens that can be stored in the KV cache.
162+
Calculated as: num_total_blocks × block_size
163+
"""
164+
return self.block_pool.get_num_total_blocks() * self.block_size
165+
166+
@property
167+
def free_tokens(self) -> int:
168+
"""Get the number of available tokens in the KV cache.
169+
170+
Returns:
171+
Number of free tokens available for allocation.
172+
Calculated as: num_free_blocks × block_size
173+
"""
174+
return self.block_pool.get_num_free_blocks() * self.block_size
175+
176+
@property
177+
def used_tokens(self) -> int:
178+
"""Get the number of currently used tokens in the KV cache.
179+
180+
Returns:
181+
Number of tokens currently occupied in the KV cache.
182+
Calculated as: total_tokens - free_tokens
183+
184+
Note:
185+
This is a derived metric. The actual allocation is tracked
186+
at the block level by BlockPool.
187+
"""
188+
return self.total_tokens - self.free_tokens
189+
152190
def make_prefix_cache_stats(self) -> PrefixCacheStats | None:
153191
"""Get (and reset) the prefix cache stats.
154192

vllm/v1/core/sched/scheduler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,6 +1439,9 @@ def make_stats(
14391439
num_running_reqs=len(self.running),
14401440
num_waiting_reqs=len(self.waiting),
14411441
kv_cache_usage=self.kv_cache_manager.usage,
1442+
kv_cache_total_tokens=self.kv_cache_manager.total_tokens,
1443+
kv_cache_used_tokens=self.kv_cache_manager.used_tokens,
1444+
kv_cache_free_tokens=self.kv_cache_manager.free_tokens,
14421445
prefix_cache_stats=prefix_cache_stats,
14431446
connector_prefix_cache_stats=connector_prefix_cache_stats,
14441447
kv_cache_eviction_events=eviction_events,

vllm/v1/metrics/stats.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ class SchedulerStats:
171171
current_wave: int = 0
172172

173173
kv_cache_usage: float = 0.0
174+
kv_cache_total_tokens: int = 0
175+
kv_cache_used_tokens: int = 0
176+
kv_cache_free_tokens: int = 0
174177

175178
prefix_cache_stats: PrefixCacheStats = field(default_factory=PrefixCacheStats)
176179
connector_prefix_cache_stats: PrefixCacheStats | None = None

0 commit comments

Comments
 (0)