Skip to content

Commit 3e6d9c4

Browse files
committed
add sglang kv-aware-routing
Signed-off-by: Ning Wang <[email protected]>
1 parent e990869 commit 3e6d9c4

File tree

3 files changed: +515 -0 lines changed
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
---
# StormService sample: SGLang prefill/decode (PD) disaggregation fronted by the
# SGLang router using the cache_aware policy.
# Topology: 1 router, 2 prefill replicas, 2 decode replicas; each prefill/decode
# replica runs with --tp-size 2 and requests 2 GPUs and 2 RDMA devices.
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  name: sglang-router-2p2d-tp2-kv
  namespace: default
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      app: sglang-router-2p2d-tp2-kv
  template:
    metadata:
      labels:
        app: sglang-router-2p2d-tp2-kv
    spec:
      roles:
        # Router role: discovers prefill/decode pods through label selectors
        # and forwards requests to them on port 30000.
        - name: routing
          replicas: 1
          stateful: true
          template:
            spec:
              containers:
                - name: mini-lb
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang-router:v0.1.7-patch.2-20250731
                  command: ["sh", "-c"]
                  # NOTE(review): $STORM_SERVICE_NAME is expanded by the shell but is
                  # not set anywhere in this manifest; presumably injected by the
                  # StormService controller - confirm.
                  # --service-discovery-namespace is hard-coded to the same value as
                  # metadata.namespace above; keep the two in sync.
                  args:
                    - |
                      python3 -m sglang_router.launch_router \
                        --pd-disaggregation \
                        --host 0.0.0.0 \
                        --policy cache_aware \
                        --service-discovery \
                        --service-discovery-port 30000 \
                        --prefill-selector storm-service-name=$STORM_SERVICE_NAME role-name=prefill \
                        --decode-selector storm-service-name=$STORM_SERVICE_NAME role-name=decode \
                        --service-discovery-namespace default
        # Prefill role: serves Qwen3-32B in prefill mode on port 30000.
        - name: prefill
          replicas: 2
          stateful: true
          template:
            metadata:
              annotations:
                # Two rdma network entries, matching the rdma: "2" limit below.
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    },
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: prefill
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  # --enable-metrics is passed for parity with the 1p1d sample,
                  # which uses the same image and enables it on both roles.
                  # NOTE(review): the image tag mentions nixl while the transfer
                  # backend is mooncake - confirm the image ships mooncake.
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-32B \
                        --served-model-name qwen3-32b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode prefill \
                        --disaggregation-transfer-backend=mooncake \
                        --tp-size 2 \
                        --trust-remote-code \
                        --enable-metrics \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    # "0" leaves NCCL InfiniBand/RoCE transport enabled.
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    # NOTE(review): GID index 7 is cluster-specific - verify for the
                    # target nodes.
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                      readOnly: true
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 2
                      vke.volcengine.com/rdma: "2"
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                # Model weights are read from the node's local disk; the directory
                # must already exist on the host (type: Directory).
                - name: model-vol
                  hostPath:
                    path: /data01/models
                    type: Directory
                # Memory-backed emptyDir mounted at /dev/shm.
                - emptyDir:
                    medium: Memory
                  name: shared-mem
        # Decode role: same server configuration as prefill, in decode mode.
        - name: decode
          replicas: 2
          stateful: true
          template:
            metadata:
              annotations:
                # Two rdma network entries, matching the rdma: "2" limit below.
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    },
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: decode
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  # --enable-metrics is passed for parity with the 1p1d sample,
                  # which uses the same image and enables it on both roles.
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-32B \
                        --served-model-name qwen3-32b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode decode \
                        --disaggregation-transfer-backend=mooncake \
                        --tp-size 2 \
                        --trust-remote-code \
                        --enable-metrics \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    # "0" leaves NCCL InfiniBand/RoCE transport enabled.
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    # NOTE(review): GID index 7 is cluster-specific - verify for the
                    # target nodes.
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                      readOnly: true
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 2
                      vke.volcengine.com/rdma: "2"
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                # Model weights are read from the node's local disk; the directory
                # must already exist on the host (type: Directory).
                - name: model-vol
                  hostPath:
                    path: /data01/models
                    type: Directory
                # Memory-backed emptyDir mounted at /dev/shm.
                - emptyDir:
                    medium: Memory
                  name: shared-mem
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
---
# StormService sample: SGLang prefill/decode (PD) disaggregation fronted by the
# SGLang router using the cache_aware policy.
# Topology: 1 router, 1 prefill replica, 1 decode replica; each prefill/decode
# replica requests 1 GPU and 1 RDMA device.
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  name: sglang-router-1p1d-kv
  namespace: default
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      app: sglang-router-1p1d-kv
  template:
    metadata:
      labels:
        app: sglang-router-1p1d-kv
    spec:
      roles:
        # Router role: discovers prefill/decode pods through label selectors
        # and forwards requests to them on port 30000.
        - name: routing
          replicas: 1
          stateful: true
          template:
            spec:
              containers:
                - name: mini-lb
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang-router:v0.1.7-patch.2-20250731
                  command: ["sh", "-c"]
                  # NOTE(review): $STORM_SERVICE_NAME is expanded by the shell but is
                  # not set anywhere in this manifest; presumably injected by the
                  # StormService controller - confirm.
                  # --service-discovery-namespace is hard-coded to the same value as
                  # metadata.namespace above; keep the two in sync.
                  args:
                    - |
                      python3 -m sglang_router.launch_router \
                        --pd-disaggregation \
                        --host 0.0.0.0 \
                        --policy cache_aware \
                        --service-discovery \
                        --service-discovery-port 30000 \
                        --prefill-selector storm-service-name=$STORM_SERVICE_NAME role-name=prefill \
                        --decode-selector storm-service-name=$STORM_SERVICE_NAME role-name=decode \
                        --service-discovery-namespace default
        # Prefill role: serves Qwen3-8B in prefill mode on port 30000.
        - name: prefill
          replicas: 1
          stateful: true
          template:
            metadata:
              annotations:
                # One rdma network entry, matching the rdma: "1" limit below.
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: prefill
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  # NOTE(review): the image tag mentions nixl while the transfer
                  # backend is mooncake - confirm the image ships mooncake.
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-8B \
                        --served-model-name qwen3-8b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode prefill \
                        --disaggregation-transfer-backend=mooncake \
                        --trust-remote-code \
                        --enable-metrics \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    # "0" leaves NCCL InfiniBand/RoCE transport enabled.
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    # NOTE(review): GID index 7 is cluster-specific - verify for the
                    # target nodes.
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                      readOnly: true
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                      vke.volcengine.com/rdma: "1"
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                # Model weights are read from the node's local disk; the directory
                # must already exist on the host (type: Directory).
                - name: model-vol
                  hostPath:
                    path: /data01/models
                    type: Directory
                # Memory-backed emptyDir mounted at /dev/shm.
                - emptyDir:
                    medium: Memory
                  name: shared-mem
        # Decode role: same server configuration as prefill, in decode mode.
        - name: decode
          replicas: 1
          stateful: true
          template:
            metadata:
              annotations:
                # One rdma network entry, matching the rdma: "1" limit below.
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                          "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: decode
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-8B \
                        --served-model-name qwen3-8b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode decode \
                        --disaggregation-transfer-backend=mooncake \
                        --trust-remote-code \
                        --enable-metrics \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    # "0" leaves NCCL InfiniBand/RoCE transport enabled.
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    # NOTE(review): GID index 7 is cluster-specific - verify for the
                    # target nodes.
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                      readOnly: true
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                      vke.volcengine.com/rdma: "1"
                  securityContext:
                    capabilities:
                      add:
                        - IPC_LOCK
              volumes:
                # Model weights are read from the node's local disk; the directory
                # must already exist on the host (type: Directory).
                - name: model-vol
                  hostPath:
                    path: /data01/models
                    type: Directory
                # Memory-backed emptyDir mounted at /dev/shm.
                - emptyDir:
                    medium: Memory
                  name: shared-mem

0 commit comments

Comments
 (0)