
Commit 3dfd2eb

add example
1 parent 11b0348 commit 3dfd2eb

1 file changed: 174 additions, 0 deletions
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Example deployment configuration with cache initialization mode enabled.
# This configuration demonstrates the cache initialization strategy, where:
# 1. The planner starts with 1 replica for each worker type
# 2. Workers initialize the cache collaboratively with file-based locking
# 3. The planner scales each worker type to its target replica count once the cache is ready

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-planner-cache-init
  annotations:
    nvidia.com/enable-grove: "false" # temporarily disable Grove because the current k8s connector does not work with it
spec:
  envs:
    - name: DYNAMO_SERVICE_CONFIG
      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:8000"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend-cache-init:8000"]}]}]}}'
    - name: DYNAMO_NAMESPACE
      value: "vllm-disagg-planner-cache-init"
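  # The graph wires together five services: Frontend (HTTP entry point),
  # Planner (SLA-based autoscaler), Prometheus (metrics collection), and
  # the decode/prefill vLLM workers, which share a vLLM cache PVC.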
  services:
    Frontend:
      dynamoNamespace: vllm-disagg-planner-cache-init
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
    Planner:
      dynamoNamespace: vllm-disagg-planner-cache-init
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      pvc:
        create: false
        name: dynamo-pvc # Must be pre-created before deployment, and the SLA profiler must have been run (see the example PVC manifests after this file)
        mountPoint: /data/profiling_results
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1 # This should be updated to the latest RC version
          workingDir: /workspace/components/planner/src/dynamo/planner
          ports:
            - name: metrics
              containerPort: 9085
          command:
            - /bin/sh
            - -c
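          # Cache initialization mode: the planner brings the vLLM cache up
          # first, then scales prefill/decode to the post-cache replica
          # counts given by the flags below.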
          args:
            - >-
              python3 -m planner_sla
              --environment=kubernetes
              --backend=vllm
              --adjustment-interval=60
              --profile-results-dir=/data/profiling_results
              --vllm-cache-initialization-mode
              --post-vllm-cache-prefill-replicas=2
              --post-vllm-cache-decode-replicas=2
              --prometheus-port=8000
    Prometheus: # NOTE: this is set on Prometheus to ensure a Service is created for the Prometheus component; it is a workaround and should be managed differently.
      dynamoNamespace: vllm-disagg-planner-cache-init
      componentType: frontend
      replicas: 1
      envs:
        - name: PYTHONPATH
          value: "/workspace/components/planner/src"
        - name: PROMETHEUS_PORT
          value: "8000"
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
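    # Decode worker: runs at 1 replica during cache initialization and mounts
    # the shared vLLM cache PVC; the planner scales it to
    # --post-vllm-cache-decode-replicas once the cache is ready.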
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner-cache-init
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "1"
      pvc:
        create: false
        name: vllm-cache-pvc # Must be created before deployment
        mountPoint: /root/.cache/vllm
      extraPodSpec:
        mainContainer:
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
          args:
            - -m
            - dynamo.vllm
            - --model
            - Qwen/Qwen3-0.6B
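    # Prefill worker: starts at 0 replicas and is scaled up by the planner
    # (--post-vllm-cache-prefill-replicas) once the cache is initialized.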
145+
VllmPrefillWorker:
146+
dynamoNamespace: vllm-disagg-planner-cache-init
147+
envFromSecret: hf-token-secret
148+
componentType: worker
149+
replicas: 0 # Start with 0 replica, will be scaled to TARGET_PREFILL_REPLICAS after cache init
150+
resources:
151+
limits:
152+
gpu: "1"
153+
pvc:
154+
create: false
155+
name: vllm-cache-pvc # Must be created before deployment
156+
mountPoint: /root/.cache/vllm
157+
extraPodSpec:
158+
mainContainer:
159+
startupProbe:
160+
httpGet:
161+
path: /health
162+
port: 9090
163+
periodSeconds: 10
164+
failureThreshold: 60
165+
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
166+
workingDir: /workspace/components/backends/vllm
167+
command:
168+
- python3
169+
args:
170+
- -m
171+
- dynamo.vllm
172+
- --model
173+
- Qwen/Qwen3-0.6B
174+
- --is-prefill-worker
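
The manifest mounts two PVCs with create: false, so both must exist before the deployment is applied: dynamo-pvc holds the SLA profiler's results, and vllm-cache-pvc backs the /root/.cache/vllm directory shared by the decode and prefill workers. Below is a minimal sketch of what those claims might look like; the storage class, sizes, and ReadWriteMany access mode are assumptions to adapt to your cluster (the shared cache mount implies an access mode that lets multiple pods mount the volume).

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: dynamo-pvc # populated by a prior SLA profiler run
spec:
  accessModes:
    - ReadWriteMany # assumption: adjust to your cluster
  storageClassName: standard # assumption: replace with your cluster's class
  resources:
    requests:
      storage: 1Gi # assumption
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-cache-pvc # shared /root/.cache/vllm across decode and prefill workers
spec:
  accessModes:
    - ReadWriteMany # assumption: workers on different nodes share it
  storageClassName: standard # assumption
  resources:
    requests:
      storage: 10Gi # assumption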

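The Planner and both workers also reference envFromSecret: hf-token-secret, which likewise needs to exist up front. A hedged sketch, assuming the token is exposed under the conventional HF_TOKEN key (the exact key name the runtime expects is not shown in this file):

apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
stringData:
  HF_TOKEN: <your-huggingface-token> # assumption: key name expected by the runtime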