Skip to content

Commit ffbcb54

Browse files
authored
Add initial v0.4.0 regression test yamls (#1337)
Signed-off-by: Jiaxin Shan <[email protected]>
1 parent 3491bb7 commit ffbcb54

File tree

5 files changed

+589
-0
lines changed

5 files changed

+589
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
---
# Regression test (v0.4.0): DeepSeek-R1-Distill-Llama-8B served by SGLang on a
# single GPU. The three model.aibrix.ai/* labels are read by the AiBrix router
# and must agree across metadata.labels, selector.matchLabels, and the pod
# template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
    model.aibrix.ai/engine: sglang
    model.aibrix.ai/port: "8000"
  name: deepseek-r1-distill-llama-8b
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
      model.aibrix.ai/engine: sglang
      model.aibrix.ai/port: "8000"
  template:
    metadata:
      labels:
        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
        model.aibrix.ai/engine: sglang
        model.aibrix.ai/port: "8000"
    spec:
      containers:
        - command:
            - python3
            - -m
            - sglang.launch_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --model-path
            - /root/models/DeepSeek-R1-Distill-Llama-8B
            - --served-model-name
            - DeepSeek-R1-Distill-Llama-8B
            - --attention-backend
            - flashinfer
            - --enable-metrics
            - --log-level-http
            - "warning"
          image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
          imagePullPolicy: IfNotPresent
          name: sglang
          ports:
            - containerPort: 8000
              protocol: TCP
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-vol
              mountPath: /root/models
      volumes:
        # Model weights are expected to be pre-staged on the node at
        # /root/models (hostPath, type Directory — pod fails fast if absent).
        - name: model-vol
          hostPath:
            path: /root/models
            type: Directory
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
---
# Regression test (v0.4.0): DeepSeek-R1-Distill-Llama-8B served by the vLLM
# OpenAI-compatible server on a single GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    model.aibrix.ai/name: deepseek-r1-distill-llama-8b # Note: The label value `model.aibrix.ai/name` here must match with the service name.
    model.aibrix.ai/engine: vllm
    model.aibrix.ai/port: "8000"
  name: deepseek-r1-distill-llama-8b
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
      model.aibrix.ai/engine: vllm
      model.aibrix.ai/port: "8000"
  template:
    metadata:
      labels:
        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
        model.aibrix.ai/engine: vllm
        model.aibrix.ai/port: "8000"
    spec:
      containers:
        - command:
            - python3
            - -m
            - vllm.entrypoints.openai.api_server
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
            - --uvicorn-log-level
            - warning
            # Model is pulled by its Hugging Face id but exposed under the
            # lowercase served name so it matches the routing labels above.
            - --model
            - deepseek-ai/DeepSeek-R1-Distill-Llama-8B
            - --served-model-name
            - deepseek-r1-distill-llama-8b
            - --max-model-len
            - "12288"
          image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/vllm-openai:v0.9.2-cu128-nixl-v0.4.1-lmcache-0.3.1.post1
          imagePullPolicy: IfNotPresent
          name: vllm-openai
          ports:
            - containerPort: 8000
              protocol: TCP
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-vol
              mountPath: /root/models
      volumes:
        # Node-local model cache (hostPath, type Directory).
        - name: model-vol
          hostPath:
            path: /root/models
            type: Directory
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
---
# Regression test (v0.4.0): prefill/decode disaggregated SGLang serving of
# Qwen3-8B behind the AiBrix router, as an orchestration.aibrix.ai
# StormService with 4 prefill and 3 decode replicas. KV transfer between the
# two roles uses the NIXL backend over RDMA (vke.volcengine.com/rdma resource
# plus the k8s.volcengine.com/pod-networks annotation).
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
  name: sglang-aibrix-router
spec:
  replicas: 1
  updateStrategy:
    type: InPlaceUpdate
  stateful: true
  selector:
    matchLabels:
      app: pool-xpyd
  template:
    metadata:
      labels:
        app: pool-xpyd
    spec:
      roles:
        - name: prefill
          replicas: 4
          stateful: true
          template:
            metadata:
              annotations:
                # Attach the RDMA secondary network to the pod (VKE CNI).
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                        "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: prefill
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-8B \
                        --served-model-name qwen3-8b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode prefill \
                        --disaggregation-transfer-backend=nixl \
                        --trust-remote-code \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                    # tmpfs-backed /dev/shm for NCCL/torch shared memory.
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                      vke.volcengine.com/rdma: "1"
                  securityContext:
                    capabilities:
                      add:
                        # Required so the RDMA stack can pin (lock) memory.
                        - IPC_LOCK
              volumes:
                - name: model-vol
                  hostPath:
                    path: /root/models
                    type: Directory
                - emptyDir:
                    medium: Memory
                  name: shared-mem
        - name: decode
          replicas: 3
          stateful: true
          template:
            metadata:
              annotations:
                # Attach the RDMA secondary network to the pod (VKE CNI).
                k8s.volcengine.com/pod-networks: |
                  [
                    {
                      "cniConf":{
                        "name":"rdma"
                      }
                    }
                  ]
            spec:
              containers:
                - name: decode
                  image: kvcache-container-image-hb2-cn-beijing.cr.volces.com/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
                  command: ["sh", "-c"]
                  args:
                    - |
                      python3 -m sglang.launch_server \
                        --model-path /models/Qwen3-8B \
                        --served-model-name qwen3-8b \
                        --host 0.0.0.0 \
                        --port 30000 \
                        --disaggregation-mode decode \
                        --disaggregation-transfer-backend=nixl \
                        --trust-remote-code \
                        --mem-fraction-static 0.8 \
                        --log-level debug
                  env:
                    - name: GLOO_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_SOCKET_IFNAME
                      value: eth0
                    - name: NCCL_IB_DISABLE
                      value: "0"
                    - name: NCCL_IB_GID_INDEX
                      value: "7"
                    - name: NCCL_DEBUG
                      value: "INFO"
                  volumeMounts:
                    - name: model-vol
                      mountPath: /models
                    # tmpfs-backed /dev/shm for NCCL/torch shared memory.
                    - mountPath: /dev/shm
                      name: shared-mem
                  resources:
                    limits:
                      nvidia.com/gpu: 1
                      vke.volcengine.com/rdma: "1"
                  securityContext:
                    capabilities:
                      add:
                        # Required so the RDMA stack can pin (lock) memory.
                        - IPC_LOCK
              volumes:
                - name: model-vol
                  hostPath:
                    path: /root/models
                    type: Directory
                - emptyDir:
                    medium: Memory
                  name: shared-mem

0 commit comments

Comments
 (0)