[Core] Add token-level KV cache metrics to V1 engine

Minsung-commit · Minsung-commit · commit 0aa135659761 · 2025-12-05T08:53:29.000+09:00
Add token-level KV cache metrics (total, used, free) to complement the existing percentage-based metrics in the V1 engine.

## Motivation

The V1 engine currently reports KV cache occupancy only as `kv_cache_usage`, a fraction in the range 0.0-1.0. Absolute token counts are critical for:

- Capacity planning: "28k tokens left" vs "35% free"
- Cost accounting: token-based billing
- Monitoring: Prometheus/Grafana dashboards
- Debugging: understanding the exact cache state

## Changes

1. **vllm/v1/metrics/stats.py**: Add fields to SchedulerStats
   - kv_cache_total_tokens: Total capacity
   - kv_cache_used_tokens: Currently occupied
   - kv_cache_free_tokens: Available space
2. **vllm/v1/core/block_pool.py**: Add get_num_total_blocks()
   - Returns total GPU blocks (excludes 1 reserved block)
3. **vllm/v1/core/kv_cache_manager.py**: Add properties
   - total_tokens, free_tokens, used_tokens
   - Derives block_size from coordinator (handles DCP/PCP scaling)
4. **vllm/v1/core/sched/scheduler.py**: Populate metrics in make_stats()

## Example Output

Before:

    kv_cache_usage: 0.65

After:

    kv_cache_usage: 0.65
    kv_cache_total_tokens: 82448
    kv_cache_used_tokens: 53591
    kv_cache_free_tokens: 28857

Addresses #12283, #26850

Signed-off-by: Minsung-commit <dialstjd931203@gmail.com>
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
@@ -440,6 +440,18 @@ def get_num_free_blocks(self) -> int:
         """
         return self.free_block_queue.num_free_blocks
 
+    def get_num_total_blocks(self) -> int:
+        """Get the total number of blocks in the pool.
+
+        Returns:
+            num_gpu_blocks - 1, i.e. the number of allocatable GPU blocks.
+
+        Note:
+            One block is subtracted as reserved; presumably this is the
+            pool's placeholder (null) block -- TODO confirm.
+        """
+        return self.num_gpu_blocks - 1
+
     def get_usage(self) -> float:
         """Get the KV cache usage.
 
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
@@ -106,7 +106,6 @@ def __init__(
         metrics_collector: KVCacheMetricsCollector | None = None,
     ) -> None:
         self.max_model_len = max_model_len
-
         self.enable_caching = enable_caching
         self.use_eagle = use_eagle
         self.log_stats = log_stats
@@ -130,6 +129,11 @@ def __init__(
         self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
         self.block_pool = self.coordinator.block_pool
         self.kv_cache_config = kv_cache_config
+        
+        # Get block_size from coordinator (includes DCP/PCP scaling)
+        self.block_size = (
+            self.coordinator.block_size if enable_caching else None
+        )
 
         # Pre-constructed KVCacheBlocks with no blocks, callers should use this
         # via create_kv_cache_blocks instead of creating new ones to avoid GC
@@ -149,6 +153,40 @@ def usage(self) -> float:
         """
         return self.block_pool.get_usage()
 
+    @property
+    def total_tokens(self) -> int:
+        """Get the total KV cache capacity in tokens.
+
+        Returns:
+            num_total_blocks * block_size, or 0 when block_size is None
+            (enable_caching is False) so that stats collection never raises.
+        """
+        return self.block_pool.get_num_total_blocks() * (self.block_size or 0)
+
+    @property
+    def free_tokens(self) -> int:
+        """Get the number of available tokens in the KV cache.
+
+        Returns:
+            num_free_blocks * block_size, or 0 when block_size is None
+            (enable_caching is False) so that stats collection never raises.
+        """
+        return self.block_pool.get_num_free_blocks() * (self.block_size or 0)
+
+    @property
+    def used_tokens(self) -> int:
+        """Get the number of token slots in currently non-free blocks.
+
+        Returns:
+            total_tokens - free_tokens.
+
+        Note:
+            This is a derived, block-granular metric: every non-free block
+            counts as block_size tokens even if only partially filled.
+            The actual allocation is tracked at block level by BlockPool.
+        """
+        return self.total_tokens - self.free_tokens
+
     def make_prefix_cache_stats(self) -> PrefixCacheStats | None:
         """Get (and reset) the prefix cache stats.
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -1439,6 +1439,9 @@ def make_stats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             kv_cache_usage=self.kv_cache_manager.usage,
+            kv_cache_total_tokens=self.kv_cache_manager.total_tokens,
+            kv_cache_used_tokens=self.kv_cache_manager.used_tokens,
+            kv_cache_free_tokens=self.kv_cache_manager.free_tokens,
             prefix_cache_stats=prefix_cache_stats,
             connector_prefix_cache_stats=connector_prefix_cache_stats,
             kv_cache_eviction_events=eviction_events,
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
@@ -171,6 +171,9 @@ class SchedulerStats:
     current_wave: int = 0
 
     kv_cache_usage: float = 0.0
+    kv_cache_total_tokens: int = 0
+    kv_cache_used_tokens: int = 0
+    kv_cache_free_tokens: int = 0
 
     prefix_cache_stats: PrefixCacheStats = field(default_factory=PrefixCacheStats)
     connector_prefix_cache_stats: PrefixCacheStats | None = None