add 235b model testing yamls

nwangfw · nwangfw · commit 1b8ba9b8e48e · 2025-08-01T19:05:14.000-07:00
Signed-off-by: Ning Wang <n.wang.chn@hotmail.com>
diff --git a/test/regression/v0.4.0/sglang/235b-service.yaml b/test/regression/v0.4.0/sglang/235b-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service  # NodePort Service in front of the pods labelled as model qwen3-235b
+metadata:
+  name: qwen3-235b-service
+  namespace: default
+spec:
+  selector:
+    model.aibrix.ai/name: qwen3-235b  # selects pods by their model-name label
+  ports:
+    - protocol: TCP
+      port: 8000  # port the Service listens on inside the cluster
+      targetPort: 8000  # NOTE(review): the 1p1d prefill/decode pods share this label but serve on 30000 — confirm 8000 is intended
+      nodePort: 30010  # static port opened on every node
+  type: NodePort
diff --git a/test/regression/v0.4.0/sglang/8b-service.yaml b/test/regression/v0.4.0/sglang/8b-service.yaml
@@ -12,26 +12,3 @@ spec:
       targetPort: 8000
       nodePort: 30008  
   type: NodePort
-
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama2-7b
-  namespace: default
-  labels:
-    prometheus-discovery: "true"
-  annotations:
-    prometheus.io/scrape: "true"
-    prometheus.io/path: "/metrics"
-    prometheus.io/port: "8000"
-spec:
-  selector:
-    model.aibrix.ai/name: "llama2-7b"
-  ports:
-    - protocol: TCP
-      name: metrics
-      port: 8000
-      targetPort: 8000
-      nodePort: 30081
-  type: NodePort
diff --git a/test/regression/v0.4.0/sglang/qwen-235b/aibrix-router-1p1d-tp8.yaml b/test/regression/v0.4.0/sglang/qwen-235b/aibrix-router-1p1d-tp8.yaml
@@ -0,0 +1,227 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: StormService  # AIBrix orchestration resource; this one declares a prefill role and a decode role
+metadata:
+  name: aibrix-router-1p1d-tp8
+  namespace: default
+spec:
+  replicas: 1
+  updateStrategy:
+    type: InPlaceUpdate
+  stateful: true
+  selector:
+    matchLabels:
+      app: aibrix-router-1p1d-tp8  # same value as the template's app label below
+  template:
+    metadata:
+      labels:
+        app: aibrix-router-1p1d-tp8
+    spec:
+      roles:  # one list entry per role: prefill first, then decode
+        - name: prefill  # role whose container launches SGLang with --disaggregation-mode prefill
+          replicas: 1
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks value is a JSON list of eight "rdma" CNI entries
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+              labels:
+                model.aibrix.ai/name: qwen3-235b  # same value as --served-model-name in the launch args
+                model.aibrix.ai/port: "30000"  # quoted so the label stays a string; same port as --port below
+                model.aibrix.ai/engine: sglang
+            spec:
+              containers:
+                - name: prefill
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1  # NOTE(review): tag mentions nixl but the args select the mooncake backend — confirm the image ships mooncake
+                  command: ["sh", "-c"]  # the single args entry below is executed as a shell script
+                  args:
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-235B-A22B \
+                        --served-model-name qwen3-235b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode prefill \
+                        --disaggregation-transfer-backend=mooncake \
+                        --tp-size 8 \
+                        --trust-remote-code \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:
+                    - name: GLOO_SOCKET_IFNAME  # network interface Gloo uses for its sockets
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME  # network interface NCCL uses for its sockets
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # "0" keeps NCCL's InfiniBand/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index depends on the RDMA fabric — confirm for the target cluster
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol
+                      mountPath: /models  # parent directory of the --model-path given above
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the in-memory "shared-mem" volume below
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 8  # same count as --tp-size 8
+                      vke.volcengine.com/rdma: "8"  # same count as the pod-network entries in the annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably needed to pin memory for RDMA — confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models
+                    type: Directory  # the path must already exist on the node as a directory
+                - emptyDir:
+                    medium: Memory  # memory-backed emptyDir
+                  name: shared-mem
+        - name: decode  # role whose container launches SGLang with --disaggregation-mode decode
+          replicas: 1
+          stateful: true
+          template:
+            metadata:
+              annotations:  # pod-networks value is a JSON list of eight "rdma" CNI entries
+                k8s.volcengine.com/pod-networks: |
+                  [
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    },
+                    {
+                      "cniConf":{
+                          "name":"rdma"
+                      }
+                    }
+                  ]
+              labels:
+                model.aibrix.ai/name: qwen3-235b  # same value as --served-model-name in the launch args
+                model.aibrix.ai/port: "30000"  # quoted so the label stays a string; same port as --port below
+                model.aibrix.ai/engine: sglang
+            spec:
+              containers:
+                - name: decode
+                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1  # NOTE(review): tag mentions nixl but the args select the mooncake backend — confirm the image ships mooncake
+                  command: ["sh", "-c"]  # the single args entry below is executed as a shell script
+                  args:
+                    - |
+                      python3 -m sglang.launch_server \
+                        --model-path /models/Qwen3-235B-A22B \
+                        --served-model-name qwen3-235b \
+                        --host 0.0.0.0 \
+                        --port 30000 \
+                        --disaggregation-mode decode \
+                        --disaggregation-transfer-backend=mooncake \
+                        --tp-size 8 \
+                        --trust-remote-code \
+                        --mem-fraction-static 0.8 \
+                        --log-level debug
+                  env:
+                    - name: GLOO_SOCKET_IFNAME  # network interface Gloo uses for its sockets
+                      value: eth0
+                    - name: NCCL_SOCKET_IFNAME  # network interface NCCL uses for its sockets
+                      value: eth0
+                    - name: NCCL_IB_DISABLE
+                      value: "0"  # "0" keeps NCCL's InfiniBand/RoCE transport enabled
+                    - name: NCCL_IB_GID_INDEX
+                      value: "7"  # NOTE(review): GID index depends on the RDMA fabric — confirm for the target cluster
+                    - name: NCCL_DEBUG
+                      value: "INFO"
+                  volumeMounts:
+                    - name: model-vol
+                      mountPath: /models  # parent directory of the --model-path given above
+                      readOnly: true
+                    - mountPath: /dev/shm  # backed by the in-memory "shared-mem" volume below
+                      name: shared-mem
+                  resources:
+                    limits:
+                      nvidia.com/gpu: 8  # same count as --tp-size 8
+                      vke.volcengine.com/rdma: "8"  # same count as the pod-network entries in the annotation
+                  securityContext:
+                    capabilities:
+                      add:
+                        - IPC_LOCK  # presumably needed to pin memory for RDMA — confirm
+              volumes:
+                - name: model-vol
+                  hostPath:
+                    path: /data01/models
+                    type: Directory  # the path must already exist on the node as a directory
+                - emptyDir:
+                    medium: Memory  # memory-backed emptyDir
+                  name: shared-mem
diff --git a/test/regression/v0.4.0/sglang/qwen-235b/sglang-base.yaml b/test/regression/v0.4.0/sglang/qwen-235b/sglang-base.yaml
@@ -15,6 +15,50 @@ spec:
     metadata:
       labels:
         model.aibrix.ai/name: qwen3-235b
+      annotations:
+        k8s.volcengine.com/pod-networks: |
+          [
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            },
+            {
+              "cniConf":{
+                  "name":"rdma"
+              }
+            }
+          ]
     spec:
       containers:
         - name: sglang-server
@@ -23,17 +67,30 @@ spec:
           args:
             - |
               python3 -m sglang.launch_server \
-                --model-path models/Qwen3-235B-A22B \
+                --model-path /models/Qwen3-235B-A22B \
                 --served-model-name qwen3-235b \
                 --host 0.0.0.0 \
                 --port 8000 \
+                --tp-size 8 \
                 --trust-remote-code \
                 --enable-metrics \
                 --mem-fraction-static 0.8 \
                 --log-level debug
+          env:
+            - name: GLOO_SOCKET_IFNAME
+              value: eth0
+            - name: NCCL_SOCKET_IFNAME
+              value: eth0
+            - name: NCCL_IB_DISABLE
+              value: "0"
+            - name: NCCL_IB_GID_INDEX
+              value: "7"
+            - name: NCCL_DEBUG
+              value: "INFO"
           resources:
             limits:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu: 8
+              vke.volcengine.com/rdma: "8"
           volumeMounts:
             - name: model-vol
               mountPath: /models
diff --git a/test/regression/v0.4.0/sglang/qwen-235b/sglang-router-1p1d-tp8.yaml b/test/regression/v0.4.0/sglang/qwen-235b/sglang-router-1p1d-tp8.yaml
@@ -10,11 +10,11 @@ spec:
   stateful: true
   selector:
     matchLabels:
-      app:  sglang-router-1p1d-tp8
+      app: sglang-router-1p1d-tp8
   template:
     metadata:
       labels:
-        app:  sglang-router-1p1d-tp8
+        app: sglang-router-1p1d-tp8
     spec:
       roles:
         - name: routing
@@ -94,7 +94,7 @@ spec:
                   args:
                     - |
                       python3 -m sglang.launch_server \
-                        --model-path models/Qwen3-235B-A22B \
+                        --model-path /models/Qwen3-235B-A22B \
                         --served-model-name qwen3-235b \
                         --host 0.0.0.0 \
                         --port 30000 \
@@ -194,7 +194,7 @@ spec:
                   args:
                     - |
                       python3 -m sglang.launch_server \
-                        --model-path models/Qwen3-235B-A22B \
+                        --model-path /models/Qwen3-235B-A22B \
                         --served-model-name qwen3-235b \
                         --host 0.0.0.0 \
                         --port 30000 \
diff --git a/test/regression/v0.4.0/vllm/235B-vllm-base.yaml b/test/regression/v0.4.0/vllm/235B-vllm-base.yaml
diff --git a/test/regression/v0.4.0/vllm/aibrix-router-2p2d-tp2 .yaml b/test/regression/v0.4.0/vllm/aibrix-router-2p2d-tp2 .yaml