add sglang kv-aware-routing

nwangfw · nwangfw · commit 3e6d9c417f63 · 2025-07-31T16:14:52.000-07:00
Signed-off-by: Ning Wang <n.wang.chn@hotmail.com>
diff --git a/test/regression/v0.4.0/sglang/qwen-32b/sglang-router-2p2d-tp2-kv.yaml b/test/regression/v0.4.0/sglang/qwen-32b/sglang-router-2p2d-tp2-kv.yaml
@@ -0,0 +1,179 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: StormService  # one StormService holding three roles: routing, prefill and decode
+metadata:
+  name: sglang-router-2p2d-tp2-kv
+  namespace: default  # the router's --service-discovery-namespace is also "default"
+spec:
+  replicas: 1
+  updateStrategy:
+    type: InPlaceUpdate
+  stateful: true
+  selector:
+    matchLabels:
+      app: sglang-router-2p2d-tp2-kv  # same value as template.metadata.labels.app
+  template:
+    metadata:
+      labels:
+        app: sglang-router-2p2d-tp2-kv  # same value as spec.selector.matchLabels.app
+    spec:
+      roles:  # role list: routing x1, prefill x2, decode x2
+        - name: routing  # single-replica role that runs the SGLang router
+          replicas: 1
+          stateful: true
+          template:
+            spec:
+              containers:
+                - name: mini-lb  # container runs sglang_router.launch_router (see args)
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang-router:v0.1.7-patch.2-20250731
+                  command: ["sh", "-c"]  # run via a shell so $STORM_SERVICE_NAME in args is expanded
+                  args:  # PD-disaggregation router, cache_aware policy; workers discovered on port 30000 by storm-service-name/role-name selectors. NOTE(review): STORM_SERVICE_NAME is not set in this manifest - presumably injected by the StormService controller; confirm
+                    - |
+                      python3 -m sglang_router.launch_router \
+                        --pd-disaggregation \
+                        --host 0.0.0.0 \
+                        --policy cache_aware \
+                        --service-discovery \
+                        --service-discovery-port 30000 \
+                        --prefill-selector storm-service-name=$STORM_SERVICE_NAME role-name=prefill \
+                        --decode-selector storm-service-name=$STORM_SERVICE_NAME role-name=decode \
+                        --service-discovery-namespace default
+        - name: prefill  # role name referenced by the router's --prefill-selector (role-name=prefill)
+          replicas: 2  # the "2p" in the manifest name
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks lists two "rdma" CNI entries; presumably one per RDMA device in resources.limits - confirm
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+            spec:
+              containers:
+                - name: prefill  # SGLang server in prefill mode serving /models/Qwen3-32B as "qwen3-32b"
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
+                  command: ["sh", "-c"]  # the block scalar in args is executed as one shell command line
+                  args:  # listens on port 30000 (the router's --service-discovery-port); mooncake transfer backend. NOTE(review): the qwen-8b manifest in this change passes --enable-metrics, this one does not - confirm intentional
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-32B \
+                        --served-model-name qwen3-32b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode prefill \
+                        --disaggregation-transfer-backend=mooncake \
+                        --tp-size 2 \
+                        --trust-remote-code \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:  # all values are quoted or plain strings, as container env values must be strings
+                    - name: GLOO_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # per NCCL docs, 0 leaves the IB/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index looks cluster-specific - confirm for the target nodes
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol  # host model directory, mounted read-only
+                      mountPath: /models
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the memory-medium emptyDir "shared-mem"
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 2  # same count as --tp-size 2
+                      vke.volcengine.com/rdma: "2"  # same count as the rdma entries in the pod-networks annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably required for RDMA memory registration - confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models  # holds the Qwen3-32B directory referenced by --model-path
+                    type: Directory  # hostPath type Directory: the path must already exist on the node
+                - emptyDir:
+                    medium: Memory  # tmpfs-backed volume, mounted at /dev/shm
+                  name: shared-mem
+        - name: decode  # role name referenced by the router's --decode-selector (role-name=decode)
+          replicas: 2  # the "2d" in the manifest name
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks lists two "rdma" CNI entries; presumably one per RDMA device in resources.limits - confirm
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+            spec:
+              containers:
+                - name: decode  # SGLang server in decode mode serving /models/Qwen3-32B as "qwen3-32b"
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
+                  command: ["sh", "-c"]  # the block scalar in args is executed as one shell command line
+                  args:  # same flags as the prefill role except --disaggregation-mode decode. NOTE(review): the qwen-8b manifest in this change passes --enable-metrics, this one does not - confirm intentional
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-32B \
+                        --served-model-name qwen3-32b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode decode \
+                        --disaggregation-transfer-backend=mooncake \
+                        --tp-size 2 \
+                        --trust-remote-code \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:  # all values are quoted or plain strings, as container env values must be strings
+                    - name: GLOO_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # per NCCL docs, 0 leaves the IB/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index looks cluster-specific - confirm for the target nodes
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol  # host model directory, mounted read-only
+                      mountPath: /models
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the memory-medium emptyDir "shared-mem"
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 2  # same count as --tp-size 2
+                      vke.volcengine.com/rdma: "2"  # same count as the rdma entries in the pod-networks annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably required for RDMA memory registration - confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models  # holds the Qwen3-32B directory referenced by --model-path
+                    type: Directory  # hostPath type Directory: the path must already exist on the node
+                - emptyDir:
+                    medium: Memory  # tmpfs-backed volume, mounted at /dev/shm
+                  name: shared-mem
diff --git a/test/regression/v0.4.0/sglang/qwen-8b/sglang-router-1p1d-kv.yaml b/test/regression/v0.4.0/sglang/qwen-8b/sglang-router-1p1d-kv.yaml
@@ -0,0 +1,169 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: StormService  # one StormService holding three roles: routing, prefill and decode
+metadata:
+  name: sglang-router-1p1d-kv
+  namespace: default  # the router's --service-discovery-namespace is also "default"
+spec:
+  replicas: 1
+  updateStrategy:
+    type: InPlaceUpdate
+  stateful: true
+  selector:
+    matchLabels:
+      app: sglang-router-1p1d-kv  # same value as template.metadata.labels.app
+  template:
+    metadata:
+      labels:
+        app: sglang-router-1p1d-kv  # same value as spec.selector.matchLabels.app
+    spec:
+      roles:  # role list: routing x1, prefill x1, decode x1
+        - name: routing  # single-replica role that runs the SGLang router
+          replicas: 1
+          stateful: true
+          template:
+            spec:
+              containers:
+                - name: mini-lb  # container runs sglang_router.launch_router (see args)
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang-router:v0.1.7-patch.2-20250731
+                  command: ["sh", "-c"]  # run via a shell so $STORM_SERVICE_NAME in args is expanded
+                  args:  # PD-disaggregation router, cache_aware policy; workers discovered on port 30000 by storm-service-name/role-name selectors. NOTE(review): STORM_SERVICE_NAME is not set in this manifest - presumably injected by the StormService controller; confirm
+                    - |
+                      python3 -m sglang_router.launch_router \
+                        --pd-disaggregation \
+                        --host 0.0.0.0 \
+                        --policy cache_aware \
+                        --service-discovery \
+                        --service-discovery-port 30000 \
+                        --prefill-selector storm-service-name=$STORM_SERVICE_NAME role-name=prefill \
+                        --decode-selector storm-service-name=$STORM_SERVICE_NAME role-name=decode \
+                        --service-discovery-namespace default
+        - name: prefill  # role name referenced by the router's --prefill-selector (role-name=prefill)
+          replicas: 1  # the "1p" in the manifest name
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks lists one "rdma" CNI entry; presumably one per RDMA device in resources.limits - confirm
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+            spec:
+              containers:
+                - name: prefill  # SGLang server in prefill mode serving /models/Qwen3-8B as "qwen3-8b"
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
+                  command: ["sh", "-c"]  # the block scalar in args is executed as one shell command line
+                  args:  # listens on port 30000 (the router's --service-discovery-port); mooncake transfer backend; metrics enabled; no --tp-size is passed
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-8B \
+                        --served-model-name qwen3-8b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode prefill \
+                        --disaggregation-transfer-backend=mooncake \
+                        --trust-remote-code \
+                        --enable-metrics \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:  # all values are quoted or plain strings, as container env values must be strings
+                    - name: GLOO_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # per NCCL docs, 0 leaves the IB/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index looks cluster-specific - confirm for the target nodes
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol  # host model directory, mounted read-only
+                      mountPath: /models
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the memory-medium emptyDir "shared-mem"
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 1  # single GPU per replica
+                      vke.volcengine.com/rdma: "1"  # same count as the rdma entries in the pod-networks annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably required for RDMA memory registration - confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models  # holds the Qwen3-8B directory referenced by --model-path
+                    type: Directory  # hostPath type Directory: the path must already exist on the node
+                - emptyDir:
+                    medium: Memory  # tmpfs-backed volume, mounted at /dev/shm
+                  name: shared-mem
+        - name: decode  # role name referenced by the router's --decode-selector (role-name=decode)
+          replicas: 1  # the "1d" in the manifest name
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks lists one "rdma" CNI entry; presumably one per RDMA device in resources.limits - confirm
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+            spec:
+              containers:
+                - name: decode  # SGLang server in decode mode serving /models/Qwen3-8B as "qwen3-8b"
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
+                  command: ["sh", "-c"]  # the block scalar in args is executed as one shell command line
+                  args:  # same flags as the prefill role except --disaggregation-mode decode
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-8B \
+                        --served-model-name qwen3-8b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode decode \
+                        --disaggregation-transfer-backend=mooncake \
+                        --trust-remote-code \
+                        --enable-metrics \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:  # all values are quoted or plain strings, as container env values must be strings
+                    - name: GLOO_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # per NCCL docs, 0 leaves the IB/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index looks cluster-specific - confirm for the target nodes
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol  # host model directory, mounted read-only
+                      mountPath: /models
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the memory-medium emptyDir "shared-mem"
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 1  # single GPU per replica
+                      vke.volcengine.com/rdma: "1"  # same count as the rdma entries in the pod-networks annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably required for RDMA memory registration - confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models  # holds the Qwen3-8B directory referenced by --model-path
+                    type: Directory  # hostPath type Directory: the path must already exist on the node
+                - emptyDir:
+                    medium: Memory  # tmpfs-backed volume, mounted at /dev/shm
+                  name: shared-mem
diff --git a/test/regression/v0.4.0/sglang/qwen-8b/sglang-router-4p3d-kv.yaml b/test/regression/v0.4.0/sglang/qwen-8b/sglang-router-4p3d-kv.yaml