sallylxl · ziang663 · Jul 11, 2025 · Sep 8, 2025 · Sep 8, 2025 · Sep 8, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -4,3 +4,9 @@
 [submodule "thirdparty/Megatron-LM"]
 	path = thirdparty/Megatron-LM
 	url = https://github.com/NVIDIA/Megatron-LM.git
+[submodule "thirdparty/vllm"]  # vLLM sources, checked out under thirdparty/vllm
+	path = thirdparty/vllm
+	url = https://github.com/vllm-project/vllm.git
+[submodule "thirdparty/sglang"]  # SGLang sources, checked out under thirdparty/sglang
+	path = thirdparty/sglang
+	url = https://github.com/sgl-project/sglang.git
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
@@ -0,0 +1,176 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # decode side: one leader pod plus worker pods, built from leaderTemplate/workerTemplate
+metadata:
+  name: deepseek-decode  # referenced by the leaderworkerset.sigs.k8s.io/name selector of deepseek-decode-svc
+  namespace: default
+spec:
+  leaderWorkerTemplate:
+    size: 2  # pods per group; 2 pods x 8 GPUs each lines up with --tp-size 16 in the launch scripts
+    leaderTemplate:
+      metadata:
+        labels:
+          leaderworkerset.sigs.k8s.io/role: leader  # selected by deepseek-decode-svc
+          role: decode-leader
+      spec:
+        hostNetwork: true  # pod shares the node network namespace (ports 30000 and 20102 open on the node)
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution while using hostNetwork
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+        - name: sglang-leader
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:  # node count and rank come from LWS-injected env vars so they track leaderWorkerTemplate.size
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server \
+              --host "0.0.0.0" \
+              --port 30000 \
+              --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+              --chunked-prefill-size 20480 \
+              --page-size 64 \
+              --enable-deepep-moe \
+              --deepep-mode low_latency \
+              --disaggregation-mode decode \
+              --mem-fraction-static 0.85 \
+              --context-length 32768 \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+              --cuda-graph-max-bs 64 \
+              --max-running-requests 2048 \
+              --eplb-rebalance-layers-per-chunk 29 \
+              --tp-size 16 \
+              --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} \
+              --node-rank ${LWS_WORKER_INDEX} \
+              --moe-dense-tp-size 1 \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl \
+              --enable-dp-attention \
+              --enable-dp-lm-head \
+              --dp-size 8
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}  # IP of the node running this pod
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000  # HTTP port passed to --port
+            protocol: TCP
+          readinessProbe:  # ready once the server port accepts TCP connections
+            initialDelaySeconds: 100
+            timeoutSeconds: 300
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # 8 GPUs per pod; two pods give the 16 ranks of --tp-size 16
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt
+            name: mnt
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }  # RAM-backed /dev/shm
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage  # host directory that backs /mnt (and thus --model-path) in the container
+
+    workerTemplate:
+      metadata:
+        labels:
+          role: decode-worker
+      spec:
+        hostNetwork: true  # pod shares the node network namespace
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution while using hostNetwork
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+        - name: sglang-worker
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:  # same server flags, in the same order, as leaderTemplate; node count/rank come from LWS env vars
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server \
+              --host "0.0.0.0" \
+              --port 30000 \
+              --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+              --chunked-prefill-size 20480 \
+              --page-size 64 \
+              --enable-deepep-moe \
+              --deepep-mode low_latency \
+              --disaggregation-mode decode \
+              --mem-fraction-static 0.85 \
+              --context-length 32768 \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+              --cuda-graph-max-bs 64 \
+              --max-running-requests 2048 \
+              --eplb-rebalance-layers-per-chunk 29 \
+              --tp-size 16 \
+              --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} \
+              --node-rank ${LWS_WORKER_INDEX} \
+              --moe-dense-tp-size 1 \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl \
+              --enable-dp-attention \
+              --enable-dp-lm-head \
+              --dp-size 8
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}  # IP of the node running this pod
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000  # HTTP port passed to --port
+            protocol: TCP
+          readinessProbe:  # ready once the server port accepts TCP connections
+            initialDelaySeconds: 100
+            timeoutSeconds: 300
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # 8 GPUs per pod
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt
+            name: mnt
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }  # RAM-backed /dev/shm
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage  # host directory that backs /mnt (and thus --model-path) in the container
+
+  replicas: 1  # a single leader/worker group
+  rolloutStrategy:
+    type: RollingUpdate
+    rollingUpdateConfiguration:
+      maxSurge: 0  # no additional group is created during an update
+      maxUnavailable: 1  # with replicas: 1 the only group may be down while it is replaced
+  startupPolicy: LeaderCreated  # NOTE(review): presumably workers start once the leader pod is created, not once it is ready — confirm in LWS docs
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseek-lb  # mini load balancer that fronts the prefill and decode Services
+  namespace: default
+  labels: {app: deepseek, tier: lb}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: deepseek, tier: lb}
+  template:
+    metadata:
+      labels: {app: deepseek, tier: lb}  # also the selector of the deepseek-svc Service
+    spec:
+      hostNetwork: true  # port 8000 is opened directly on the node
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet  # needed so the *-svc names below resolve under hostNetwork
+      nodeSelector:
+        "roce.scitix.ai/unit": unit1
+      containers:
+      - name: sglang-minilb
+        image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+        command: ["/usr/bin/env", "bash", "-c"]
+        args:  # python3 matches the interpreter used by the prefill/decode manifests on this image
+          - |
+            exec python3 -m sglang.srt.disaggregation.mini_lb \
+              --prefill "http://deepseek-prefill-svc:30000" \
+              --decode "http://deepseek-decode-svc:30000" \
+              --host "0.0.0.0" \
+              --port 8000
+        ports:
+        - containerPort: 8000
+          name: http
+        readinessProbe:  # ready once the balancer port accepts TCP connections
+          tcpSocket: {port: 8000}
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+        volumeMounts:  # NOTE(review): the balancer command never references /mnt; this mount may be unnecessary
+        - name: mnt
+          mountPath: /mnt
+      volumes:
+      - name: mnt
+        hostPath:
+          path: /mnt/xstorage
+---
+# Entry-point Service: forwards port 8000 to the pods labelled app=deepseek, tier=lb.
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector: {app: deepseek, tier: lb}
+  ports:
+    # 8000 is the port the load balancer is started with (--port 8000).
+    - name: http
+      port: 8000
+      targetPort: 8000
+      protocol: TCP
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
@@ -0,0 +1,85 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseek-prefill  # prefill side of the disaggregated deployment (--disaggregation-mode prefill)
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseek-prefill
+  template:
+    metadata:
+      labels:
+        app: deepseek-prefill  # also the selector of deepseek-prefill-svc
+    spec:
+      hostNetwork: true  # pod shares the node network namespace
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution while using hostNetwork
+      nodeSelector:
+        "scitix.ai/gpu-type": h20xnvlink141
+        "roce.scitix.ai/unit": unit1
+      containers:
+        - name: sglang-prefill
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env", "bash", "-c"]
+          args:  # single-node launch: --tp-size 8 across the 8 GPUs requested below
+            - |
+              set -euxo pipefail
+              export POD_IP="$(hostname -i)"
+              exec python3 -m sglang.launch_server \
+                --host "0.0.0.0" \
+                --port 30000 \
+                --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+                --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+                --chunked-prefill-size 20480 \
+                --page-size 64 \
+                --disable-radix-cache \
+                --enable-deepep-moe \
+                --deepep-mode normal \
+                --disaggregation-mode prefill \
+                --mem-fraction-static 0.85 \
+                --context-length 32768 \
+                --tp-size 8 \
+                --trust-remote-code \
+                --max-running-requests 1024 \
+                --disaggregation-transfer-backend nixl
+          env:
+            - name: GLOO_SOCKET_IFNAME
+              value: bond0  # NOTE(review): assumes every selected node exposes an interface named bond0 — confirm
+            - name: NCCL_SOCKET_IFNAME
+              value: bond0
+            - name: NCCL_IB_GID_INDEX
+              value: "3"
+            - name: NCCL_IB_QPS_PER_CONNECTION
+              value: "8"
+            - name: NCCL_NET_PLUGIN
+              value: none
+            - name: NCCL_MIN_NCHANNELS
+              value: "4"
+            - name: SGLANG_SET_CPU_AFFINITY
+              value: "true"
+            - name: SGL_ENABLE_JIT_DEEPGEMM
+              value: "1"
+          ports:
+            - containerPort: 30000  # HTTP port passed to --port
+            - containerPort: 8998  # published as "nixl-boot" by deepseek-prefill-svc
+          readinessProbe:  # ready once the server port accepts TCP connections
+            tcpSocket: { port: 30000 }
+            initialDelaySeconds: 10
+            periodSeconds: 5
+            timeoutSeconds: 3
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # eight GPUs, matching --tp-size 8
+          volumeMounts:
+            - { name: dshm, mountPath: /dev/shm }
+            - { name: mnt, mountPath: /mnt }
+      volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }  # RAM-backed /dev/shm
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage  # host directory that backs /mnt (and thus --model-path) in the container
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
@@ -0,0 +1,35 @@
+# In-cluster address for the prefill pods (label app=deepseek-prefill).
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-prefill-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector: {app: deepseek-prefill}
+  ports:
+  - name: http
+    port: 30000
+    targetPort: 30000
+    protocol: TCP
+  - name: nixl-boot
+    port: 8998
+    targetPort: 8998
+    protocol: TCP
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-decode-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  ports:
+  - name: http
+    port: 30000
+    targetPort: 30000
+    protocol: TCP
+  selector:  # only the leader pod of the deepseek-decode LeaderWorkerSet is a backend
+    leaderworkerset.sigs.k8s.io/role: leader
+    leaderworkerset.sigs.k8s.io/name: deepseek-decode