Commit d1c997c

feat: DIS-373 dynamo KVBM connector API integration with TRTLLM (#2544)

richardhuo-nv authored and nnshah1 committed

Signed-off-by: richardhuo-nv <[email protected]>
Signed-off-by: nnshah1 <[email protected]>

1 parent c7669c8 · commit d1c997c

29 files changed: +2656 −277 lines changed

.github/workflows/container-validation-dynamo.yml

Lines changed: 2 additions & 2 deletions

```diff
@@ -66,7 +66,7 @@ jobs:
           docker run -v ${{ github.workspace }}:/workspace -w /workspace \
             --name ${{ env.CONTAINER_ID }}_pytest \
             ${{ steps.define_image_tag.outputs.image_tag }} \
-            bash -c "pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
+            bash -c "pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\" --ignore /workspace/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector "
       - name: Copy test report from test Container
         if: always()
         run: |
@@ -89,4 +89,4 @@ jobs:
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
         with:
           name: Event File
-          path: ${{ github.event_path }}
+          path: ${{ github.event_path }}
```
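For local runs outside the workflow container, the changed line can be reproduced roughly as sketched below; `PYTEST_MARKS` and the JUnit XML path are CI-provided values, so substitute your own:

```bash
# Sketch: mirror the CI pytest call from the repo root, skipping the
# TRT-LLM connector tests that the workflow now ignores.
pytest --basetemp=/tmp --junitxml=report.xml -m "<your marks>" \
  --ignore lib/bindings/python/src/dynamo/llm/trtllm_integration/connector
```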

.github/workflows/docs-link-check.yml

Lines changed: 5 additions & 0 deletions

```diff
@@ -15,6 +15,11 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
+        with:
+          # For pull_request events, use the PR head (commit from the contributor's branch/repo)
+          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          fetch-depth: 0

       # Cache lychee results (e.g. to avoid hitting rate limits)
       # https://lychee.cli.rs/github_action_recipes/caching/
```

docs/guides/run_kvbm_in_trtllm.md

Lines changed: 101 additions & 0 deletions (new file)

<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Running KVBM in TensorRT-LLM

This guide explains how to leverage KVBM (KV Block Manager) to manage the KV cache and perform KV offloading in TensorRT-LLM (trtllm).

To learn what KVBM is, see the [KVBM introduction](https://docs.nvidia.com/dynamo/latest/architecture/kvbm_intro.html).

> [!Note]
> - Ensure that `etcd` and `nats` are running before starting.
> - KVBM does not currently support CUDA graphs in TensorRT-LLM.
> - KVBM only supports TensorRT-LLM's PyTorch backend.
> - To enable disk cache offloading, you must first enable CPU memory cache offloading.
> - Disable partial reuse (`enable_partial_reuse: false`) in the LLM API config's `kv_cache_config` to increase offloading cache hits.
> - KVBM requires TensorRT-LLM at commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer.
> - Enabling KVBM metrics with TensorRT-LLM is still a work in progress.

## Quick Start

To use KVBM in TensorRT-LLM, follow the steps below:

```bash
# start up etcd for KVBM leader/worker registration and discovery
docker compose -f deploy/docker-compose.yml up -d

# Build a container that includes TensorRT-LLM and KVBM.
# Note: KVBM integration is only available in TensorRT-LLM commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer.
./container/build.sh --framework trtllm --tensorrtllm-commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 --enable-kvbm

# launch the container
./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds

# enable KV offloading to CPU memory
# 60 means 60GB of pinned CPU memory will be used
export DYN_KVBM_CPU_CACHE_GB=60

# enable KV offloading to disk.
# Note: to enable disk cache offloading, you must first enable CPU memory cache offloading.
# 20 means 20GB of disk will be used
export DYN_KVBM_DISK_CACHE_GB=20

# Allocating memory and disk storage can take some time.
# We recommend setting a higher timeout for leader–worker initialization.
# 1200 means a 1200-second timeout
export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200
```

```bash
# write an example LLM API config
# Note: disable partial reuse ("enable_partial_reuse: false") in the LLM API config's "kv_cache_config" to increase offloading cache hits.
cat > "/tmp/kvbm_llm_api_config.yaml" <<EOF
backend: pytorch
cuda_graph_config: null
kv_cache_config:
  enable_partial_reuse: false
  free_gpu_memory_fraction: 0.80
kv_connector_config:
  connector_module: dynamo.llm.trtllm_integration.connector
  connector_scheduler_class: DynamoKVBMConnectorLeader
  connector_worker_class: DynamoKVBMConnectorWorker
EOF

# start the dynamo frontend
python3 -m dynamo.frontend --http-port 8000 &

# serve an LLM model with dynamo
python3 -m dynamo.trtllm \
  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
  --extra-engine-args /tmp/kvbm_llm_api_config.yaml &

# make a call to the LLM
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "messages": [
    {
        "role": "user",
        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
    }
    ],
    "stream":false,
    "max_tokens": 30
}'

# Optionally, serve an LLM with trtllm-serve to use the KVBM feature.
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8001 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
```
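As a quick sanity check of the `trtllm-serve` path above, a minimal request sketch (it assumes trtllm-serve exposes its OpenAI-compatible `/v1/chat/completions` endpoint on the configured port 8001):

```bash
# Sketch: query the trtllm-serve instance started above on port 8001.
curl localhost:8001/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
  "messages": [{"role": "user", "content": "Summarize KV cache offloading in one sentence."}],
  "stream": false,
  "max_tokens": 30
}'
```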

lib/bindings/python/rust/llm/block_manager.rs

Lines changed: 111 additions & 0 deletions

```diff
@@ -14,6 +14,7 @@
 // limitations under the License.

 use super::*;
+use anyhow::Result;
 use dynamo_llm::block_manager::block::{
     data::logical::distributed_leader_worker::DistributedLeaderWorkerResources, locality::Logical,
 };
@@ -220,3 +221,113 @@ impl BlockManager {
         &self.inner
     }
 }
+
+#[derive(Default)]
+pub struct BlockManagerBuilder {
+    worker_id: u64,
+    leader: Option<distributed::KvbmLeader>,
+    page_size: usize,
+    disable_device_pool: bool,
+}
+
+impl BlockManagerBuilder {
+    pub fn new() -> Self {
+        Self {
+            page_size: 32, // default consistent with BlockManager::new
+            ..Default::default()
+        }
+    }
+
+    pub fn worker_id(mut self, id: u64) -> Self {
+        self.worker_id = id;
+        self
+    }
+    pub fn page_size(mut self, ps: usize) -> Self {
+        self.page_size = ps;
+        self
+    }
+    pub fn leader(mut self, l: distributed::KvbmLeader) -> Self {
+        self.leader = Some(l);
+        self
+    }
+    pub fn disable_device_pool(mut self, yes: bool) -> Self {
+        self.disable_device_pool = yes;
+        self
+    }
+
+    /// Async build (call from an async context).
+    pub async fn build(self) -> Result<BlockManager> {
+        let worker_id = self.worker_id;
+        let leader = self.leader.ok_or_else(|| {
+            anyhow::anyhow!("leader is required (runtime is always taken from leader)")
+        })?;
+
+        // Get (inner leader handle, runtime) from the provided leader.
+        let (leader_inner, drt) = leader.dissolve();
+
+        let cancel_token = CancellationToken::new();
+
+        // Runtime & model config
+        let runtime_config = dynamo_llm::block_manager::KvManagerRuntimeConfig::builder()
+            .worker_id(worker_id)
+            .cancellation_token(cancel_token.clone())
+            .build()?;
+
+        let mut config =
+            dynamo_llm::block_manager::KvBlockManagerConfig::builder().runtime(runtime_config);
+
+        let model_config = dynamo_llm::block_manager::KvManagerModelConfig::builder()
+            .num_layers(1)
+            .outer_dim(1)
+            .page_size(self.page_size)
+            .inner_dim(1)
+            .build()?;
+
+        config = config.model(model_config);
+
+        // Layouts derived from leader’s counts
+        if !self.disable_device_pool {
+            config = config.device_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_device_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        if leader_inner.num_host_blocks() > 0 {
+            config = config.host_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_host_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        if leader_inner.num_disk_blocks() > 0 {
+            config = config.disk_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_disk_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        let config = config.build()?;
+
+        let resources =
+            DistributedLeaderWorkerResources::new(Some(leader_inner), cancel_token.child_token())?;
+
+        let inner = dynamo_llm::block_manager::KvBlockManager::<
+            Logical<DistributedLeaderWorkerResources>,
+            BasicMetadata,
+        >::new(config, resources)
+        .await?;
+
+        Ok(BlockManager {
+            inner,
+            drt,
+            _controller: None,
+        })
+    }
+}
```

lib/bindings/python/rust/llm/block_manager/distributed.rs

Lines changed: 1 addition & 1 deletion

```diff
@@ -8,5 +8,5 @@ mod utils;
 mod worker;

 pub use leader::KvbmLeader;
-pub use utils::get_barrier_id;
+pub use utils::get_barrier_id_prefix;
 pub use worker::{KvbmWorker, VllmTensor};
```

lib/bindings/python/rust/llm/block_manager/distributed/leader.rs

Lines changed: 36 additions & 20 deletions

```diff
@@ -2,10 +2,12 @@
 // SPDX-License-Identifier: Apache-2.0

 use super::*;
-use utils::get_barrier_id;
+use utils::get_barrier_id_prefix;

 use derive_getters::Dissolve;
-use llm_rs::block_manager::distributed::{KvbmLeader as KvbmLeaderImpl, KvbmLeaderConfig};
+use llm_rs::block_manager::distributed::{
+    KvbmLeader as KvbmLeaderImpl, KvbmLeaderConfig, KvbmLeaderNumBlocksConfig,
+};

 const CPU_CACHE: &str = "DYN_KVBM_CPU_CACHE_GB";
 const CPU_CACHE_OVERRIDE: &str = "DYN_KVBM_CPU_CACHE_OVERRIDE_NUM_BLOCKS";
@@ -16,15 +18,32 @@ const DISK_CACHE_OVERRIDE: &str = "DYN_KVBM_DISK_CACHE_OVERRIDE_NUM_BLOCKS";
 const LEADER_WORKER_INIT_TIMEOUT_SECS: &str = "DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS";
 const DEFAULT_INIT_TIMEOUT_SECS: u64 = 120;

-fn compute_num_blocks(cache_size_key: &str, override_key: &str, bytes_per_block: usize) -> usize {
-    if let Ok(override_num_blocks) = std::env::var(override_key) {
-        override_num_blocks.parse::<usize>().unwrap_or(0)
-    } else {
-        let cache_size_gb = std::env::var(cache_size_key)
-            .unwrap_or_default()
-            .parse::<f64>()
-            .unwrap_or(0.0);
-        ((cache_size_gb * 1_000_000_000.0) / bytes_per_block as f64) as usize
+fn read_env_usize(key: &str) -> Option<usize> {
+    std::env::var(key).ok()?.trim().parse::<usize>().ok()
+}
+
+fn read_cache_size_float(key: &str) -> f64 {
+    std::env::var(key)
+        .unwrap_or_default()
+        .parse::<f64>()
+        .unwrap_or(0.0)
+}
+
+fn get_blocks_config(cache_size_key: &str, override_key: &str) -> KvbmLeaderNumBlocksConfig {
+    if let Some(nblocks) = read_env_usize(override_key) {
+        // Optional: still read cache size for observability, but override takes precedence.
+        let cache_gb: f64 = read_cache_size_float(cache_size_key);
+        return KvbmLeaderNumBlocksConfig {
+            cache_size_in_gb: cache_gb,
+            num_blocks_overriden: nblocks,
+        };
+    }
+
+    // No override -> compute from cache size (in GB)
+    let cache_gb: f64 = read_cache_size_float(cache_size_key);
+    KvbmLeaderNumBlocksConfig {
+        cache_size_in_gb: cache_gb,
+        num_blocks_overriden: 0,
     }
 }

@@ -51,22 +70,19 @@ impl KvbmLeader {
 #[pymethods]
 impl KvbmLeader {
     #[new]
-    #[pyo3(signature = (bytes_per_block, world_size, drt))]
-    fn new(bytes_per_block: usize, world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {
-        let num_host_blocks = compute_num_blocks(CPU_CACHE, CPU_CACHE_OVERRIDE, bytes_per_block);
-        let num_disk_blocks = compute_num_blocks(DISK_CACHE, DISK_CACHE_OVERRIDE, bytes_per_block);
-
-        let barrier_id = get_barrier_id();
+    #[pyo3(signature = (world_size, drt))]
+    fn new(world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {
+        let barrier_id_prefix = get_barrier_id_prefix();
         let leader_init_timeout_sec: u64 =
             get_leader_init_timeout_secs(LEADER_WORKER_INIT_TIMEOUT_SECS);

         let config = KvbmLeaderConfig::builder()
-            .barrier_id(barrier_id)
-            .num_host_blocks(num_host_blocks)
-            .num_disk_blocks(num_disk_blocks)
+            .barrier_id_prefix(barrier_id_prefix)
             .world_size(world_size)
             .leader_init_timeout_secs(leader_init_timeout_sec)
             .drt(drt.inner().clone())
+            .host_blocks_config(get_blocks_config(CPU_CACHE, CPU_CACHE_OVERRIDE))
+            .disk_blocks_config(get_blocks_config(DISK_CACHE, DISK_CACHE_OVERRIDE))
             .build()
             .map_err(to_pyerr)?;

```
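The new `get_blocks_config` helper makes the `*_OVERRIDE_NUM_BLOCKS` variables take precedence over the `*_CACHE_GB` sizes when both are set. A minimal sketch of the two ways to size the pools (block counts are illustrative):

```bash
# Size the pinned CPU pool by capacity: the leader derives the block count from 60 GB.
export DYN_KVBM_CPU_CACHE_GB=60
# ...or pin an exact block count; when set, this override takes precedence.
export DYN_KVBM_CPU_CACHE_OVERRIDE_NUM_BLOCKS=4096

# The disk pool follows the same pattern.
export DYN_KVBM_DISK_CACHE_GB=20
# export DYN_KVBM_DISK_CACHE_OVERRIDE_NUM_BLOCKS=8192
```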
lib/bindings/python/rust/llm/block_manager/distributed/utils.rs

Lines changed: 5 additions & 2 deletions

```diff
@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-pub fn get_barrier_id() -> String {
-    std::env::var("DYN_KVBM_BARRIER_ID").unwrap_or("kvbm".to_string())
+pub fn get_barrier_id_prefix() -> String {
+    std::env::var("DYN_KVBM_BARRIER_ID_PREFIX")
+        .ok()
+        .filter(|s| !s.trim().is_empty())
+        .unwrap_or_else(|| "kvbm".to_string())
 }
```
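With the rename, the barrier id is now built from a configurable prefix; an unset or whitespace-only `DYN_KVBM_BARRIER_ID_PREFIX` falls back to `kvbm`. An illustrative setting:

```bash
# Give this deployment its own barrier namespace (illustrative value).
export DYN_KVBM_BARRIER_ID_PREFIX=my-kvbm-deployment

# Unset (or set to whitespace) to fall back to the default prefix "kvbm".
unset DYN_KVBM_BARRIER_ID_PREFIX
```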
