Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@
# Third-party sources vendored as git submodules under thirdparty/.

# NVIDIA Megatron-LM.
[submodule "thirdparty/Megatron-LM"]
	path = thirdparty/Megatron-LM
	url = https://github.com/NVIDIA/Megatron-LM.git

# vLLM.
[submodule "thirdparty/vllm"]
	path = thirdparty/vllm
	url = https://github.com/vllm-project/vllm.git

# SGLang.
[submodule "thirdparty/sglang"]
	path = thirdparty/sglang
	url = https://github.com/sgl-project/sglang.git
176 changes: 176 additions & 0 deletions launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Decode tier of the SGLang prefill/decode-disaggregated deployment.
#
# A single LeaderWorkerSet group of `size` pods (one leader plus workers) runs
# one multi-node `sglang.launch_server` instance in decode mode. Every pod
# requests 8 GPUs, so `--tp-size 16` below matches size (2) x 8 GPUs; if `size`
# changes, `--tp-size` must be revisited as well.
#
# The leader and worker launch commands are intentionally identical: node count
# and rank come from the LWS_GROUP_SIZE / LWS_WORKER_INDEX variables (the leader
# is index 0), so both ranks always agree on the server arguments.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: deepseek-decode
  namespace: default
spec:
  replicas: 1
  startupPolicy: LeaderCreated
  rolloutStrategy:
    type: RollingUpdate
    rollingUpdateConfiguration:
      maxSurge: 0
      maxUnavailable: 1
  leaderWorkerTemplate:
    size: 2
    leaderTemplate:
      metadata:
        labels:
          # Selected by the deepseek-decode-svc Service so that only the
          # leader pod receives HTTP traffic.
          leaderworkerset.sigs.k8s.io/role: leader
          role: decode-leader
      spec:
        hostNetwork: true
        hostIPC: true
        # Keep cluster DNS resolution while running on the host network.
        dnsPolicy: ClusterFirstWithHostNet
        nodeSelector:
          scitix.ai/gpu-type: h20xnvlink141
          roce.scitix.ai/unit: unit1
        containers:
          - name: sglang-leader
            image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
            securityContext:
              privileged: true
            command: ["/usr/bin/env", "bash", "-c"]
            # The echo lines log the LWS-provided variables for debugging
            # before the process is replaced by the server.
            args:
              - |-
                echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
                echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
                echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
                exec python3 -m sglang.launch_server \
                  --host "0.0.0.0" \
                  --port 30000 \
                  --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
                  --chunked-prefill-size 20480 \
                  --page-size 64 \
                  --enable-deepep-moe \
                  --deepep-mode low_latency \
                  --disaggregation-mode decode \
                  --mem-fraction-static 0.85 \
                  --context-length 32768 \
                  --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
                  --cuda-graph-max-bs 64 \
                  --max-running-requests 2048 \
                  --eplb-rebalance-layers-per-chunk 29 \
                  --tp-size 16 \
                  --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
                  --nnodes ${LWS_GROUP_SIZE} \
                  --node-rank ${LWS_WORKER_INDEX} \
                  --moe-dense-tp-size 1 \
                  --trust-remote-code \
                  --disaggregation-transfer-backend nixl \
                  --enable-dp-attention \
                  --enable-dp-lm-head \
                  --dp-size 8
            env:
              # Both variables carry the node IP because the pod runs on the
              # host network.
              - name: SGLANG_HOST_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.hostIP
              - name: HOST_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.hostIP
              - name: MC_TE_METRIC
                value: "true"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
            ports:
              - containerPort: 30000
                protocol: TCP
            # NOTE(review): timeoutSeconds (300) exceeds periodSeconds (30);
            # confirm such a long TCP connect timeout is intended.
            readinessProbe:
              initialDelaySeconds: 100
              timeoutSeconds: 300
              periodSeconds: 30
              tcpSocket:
                port: 30000
            resources:
              limits:
                nvidia.com/gpu: "8"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /mnt
                name: mnt
        volumes:
          # Memory-backed /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
          # Host directory that contains the model weights referenced by
          # --model-path (/mnt/models/...).
          - name: mnt
            hostPath:
              path: /mnt/xstorage
    workerTemplate:
      metadata:
        labels:
          role: decode-worker
      spec:
        hostNetwork: true
        hostIPC: true
        dnsPolicy: ClusterFirstWithHostNet
        nodeSelector:
          scitix.ai/gpu-type: h20xnvlink141
          roce.scitix.ai/unit: unit1
        containers:
          - name: sglang-worker
            image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
            securityContext:
              privileged: true
            command: ["/usr/bin/env", "bash", "-c"]
            # Same argument list as the leader; only the values of the LWS
            # variables differ at runtime.
            args:
              - |-
                echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
                echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
                echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
                exec python3 -m sglang.launch_server \
                  --host "0.0.0.0" \
                  --port 30000 \
                  --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
                  --chunked-prefill-size 20480 \
                  --page-size 64 \
                  --enable-deepep-moe \
                  --deepep-mode low_latency \
                  --disaggregation-mode decode \
                  --mem-fraction-static 0.85 \
                  --context-length 32768 \
                  --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
                  --cuda-graph-max-bs 64 \
                  --max-running-requests 2048 \
                  --eplb-rebalance-layers-per-chunk 29 \
                  --tp-size 16 \
                  --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
                  --nnodes ${LWS_GROUP_SIZE} \
                  --node-rank ${LWS_WORKER_INDEX} \
                  --moe-dense-tp-size 1 \
                  --trust-remote-code \
                  --disaggregation-transfer-backend nixl \
                  --enable-dp-attention \
                  --enable-dp-lm-head \
                  --dp-size 8
            env:
              - name: SGLANG_HOST_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.hostIP
              - name: HOST_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.hostIP
              - name: MC_TE_METRIC
                value: "true"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
            ports:
              - containerPort: 30000
                protocol: TCP
            # NOTE(review): timeoutSeconds (300) exceeds periodSeconds (30);
            # confirm such a long TCP connect timeout is intended.
            readinessProbe:
              initialDelaySeconds: 100
              timeoutSeconds: 300
              periodSeconds: 30
              tcpSocket:
                port: 30000
            resources:
              limits:
                nvidia.com/gpu: "8"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /mnt
                name: mnt
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
          - name: mnt
            hostPath:
              path: /mnt/xstorage
61 changes: 61 additions & 0 deletions launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# SGLang mini load balancer for the prefill/decode-disaggregated deployment.
# It listens on port 8000 and is pointed at the prefill and decode Services by
# their cluster DNS names.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deepseek-lb
  namespace: default
  labels:
    app: deepseek
    tier: lb
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseek
      tier: lb
  template:
    metadata:
      labels:
        app: deepseek
        tier: lb
    spec:
      hostNetwork: true
      hostIPC: true
      # Keep cluster DNS resolution while on the host network so that the
      # deepseek-*-svc Service names used below resolve.
      dnsPolicy: ClusterFirstWithHostNet
      nodeSelector:
        roce.scitix.ai/unit: unit1
      containers:
        - name: sglang-minilb
          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
          command: ["/usr/bin/env", "bash", "-c"]
          # Uses `python3`, the interpreter name the prefill and decode
          # manifests invoke on this same image.
          args:
            - |
              exec python3 -m sglang.srt.disaggregation.mini_lb \
                --prefill "http://deepseek-prefill-svc:30000" \
                --decode "http://deepseek-decode-svc:30000" \
                --host "0.0.0.0" \
                --port 8000
          ports:
            - containerPort: 8000
              name: http
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
          volumeMounts:
            - name: mnt
              mountPath: /mnt
      volumes:
        - name: mnt
          hostPath:
            path: /mnt/xstorage
---
# Cluster-internal entry point: forwards port 8000 to the pods labelled
# app=deepseek, tier=lb (the deepseek-lb Deployment).
apiVersion: v1
kind: Service
metadata:
  namespace: default
  name: deepseek-svc
spec:
  type: ClusterIP
  selector:
    tier: lb
    app: deepseek
  ports:
    - name: http
      port: 8000
      targetPort: 8000
      protocol: TCP
85 changes: 85 additions & 0 deletions launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Prefill tier of the SGLang prefill/decode-disaggregated deployment:
# a single pod with 8 GPUs running `sglang.launch_server` in prefill mode.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: default
  name: deepseek-prefill
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseek-prefill
  template:
    metadata:
      labels:
        app: deepseek-prefill
    spec:
      # Host network and host IPC are shared with the node; cluster DNS is
      # still used for name resolution.
      hostNetwork: true
      hostIPC: true
      dnsPolicy: ClusterFirstWithHostNet
      nodeSelector:
        roce.scitix.ai/unit: unit1
        scitix.ai/gpu-type: h20xnvlink141
      volumes:
        # Memory-backed /dev/shm.
        - name: dshm
          emptyDir:
            medium: Memory
        # Host directory holding the model weights used by --model-path.
        - name: mnt
          hostPath:
            path: /mnt/xstorage
      containers:
        - name: sglang-prefill
          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
          securityContext:
            privileged: true
          resources:
            limits:
              nvidia.com/gpu: "8"
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: mnt
              mountPath: /mnt
          ports:
            # 30000: server port (--port below).
            - containerPort: 30000
            # 8998: published as `nixl-boot` by deepseek-prefill-svc.
            - containerPort: 8998
          readinessProbe:
            tcpSocket:
              port: 30000
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
          env:
            # Network interface used for Gloo / NCCL socket traffic.
            - name: GLOO_SOCKET_IFNAME
              value: bond0
            - name: NCCL_SOCKET_IFNAME
              value: bond0
            - name: NCCL_IB_GID_INDEX
              value: "3"
            - name: NCCL_IB_QPS_PER_CONNECTION
              value: "8"
            - name: NCCL_NET_PLUGIN
              value: none
            - name: NCCL_MIN_NCHANNELS
              value: "4"
            - name: SGLANG_SET_CPU_AFFINITY
              value: "true"
            - name: SGL_ENABLE_JIT_DEEPGEMM
              value: "1"
          command: ["/usr/bin/env", "bash", "-c"]
          # Strict-mode shell script; POD_IP is exported but not referenced
          # anywhere else in this manifest.
          args:
            - |
              set -euxo pipefail
              export POD_IP="$(hostname -i)"
              exec python3 -m sglang.launch_server \
              --host "0.0.0.0" \
              --port 30000 \
              --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
              --chunked-prefill-size 20480 \
              --page-size 64 \
              --disable-radix-cache \
              --enable-deepep-moe \
              --deepep-mode normal \
              --disaggregation-mode prefill \
              --mem-fraction-static 0.85 \
              --context-length 32768 \
              --tp-size 8 \
              --trust-remote-code \
              --max-running-requests 1024 \
              --disaggregation-transfer-backend nixl
35 changes: 35 additions & 0 deletions launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Stable in-cluster name for the prefill pod (label app=deepseek-prefill).
# Exposes the HTTP server port (30000) and port 8998, named `nixl-boot`.
apiVersion: v1
kind: Service
metadata:
  namespace: default
  name: deepseek-prefill-svc
spec:
  type: ClusterIP
  selector:
    app: deepseek-prefill
  ports:
    - name: http
      port: 30000
      targetPort: 30000
      protocol: TCP
    - name: nixl-boot
      port: 8998
      targetPort: 8998
      protocol: TCP

---
# Stable in-cluster name for the decode server. The selector targets only the
# leader pod of the deepseek-decode LeaderWorkerSet: the `role: leader` label
# is set on its leaderTemplate; the `name` label is presumably applied by the
# LWS controller (confirm against the LeaderWorkerSet documentation).
apiVersion: v1
kind: Service
metadata:
  namespace: default
  name: deepseek-decode-svc
spec:
  type: ClusterIP
  selector:
    leaderworkerset.sigs.k8s.io/role: leader
    leaderworkerset.sigs.k8s.io/name: deepseek-decode
  ports:
    - name: http
      port: 30000
      targetPort: 30000
      protocol: TCP
Loading