diff --git a/.gitmodules b/.gitmodules index 475fe99..e1dbd83 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,9 @@ [submodule "thirdparty/Megatron-LM"] path = thirdparty/Megatron-LM url = https://github.com/NVIDIA/Megatron-LM.git +[submodule "thirdparty/vllm"] + path = thirdparty/vllm + url = https://github.com/vllm-project/vllm.git +[submodule "thirdparty/sglang"] + path = thirdparty/sglang + url = https://github.com/sgl-project/sglang.git diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml new file mode 100644 index 0000000..a2d23b0 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml @@ -0,0 +1,176 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseek-decode + namespace: default +spec: + leaderWorkerTemplate: + size: 2 + leaderTemplate: + metadata: + labels: + leaderworkerset.sigs.k8s.io/role: leader + role: decode-leader + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "scitix.ai/gpu-type": h20xnvlink141 + "roce.scitix.ai/unit": unit1 + containers: + - name: sglang-leader + image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server \ + --host "0.0.0.0" \ + --port 30000 \ + --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \ + --chunked-prefill-size 20480 \ + --page-size 64 \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --disaggregation-mode decode \ + --mem-fraction-static 0.85 \ + --context-length 32768 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ + --cuda-graph-max-bs 64 \ + --max-running-requests 2048 \ + --eplb-rebalance-layers-per-chunk 29 \ + --tp-size 16 \ + --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes 2 \ + --node-rank 0 \ + --moe-dense-tp-size 1 \ + --trust-remote-code \ + --disaggregation-transfer-backend nixl \ + --enable-dp-attention \ + --enable-dp-lm-head \ + --dp-size 8 + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: MC_TE_METRIC + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 100 + timeoutSeconds: 300 + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt + name: mnt + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: mnt + hostPath: + path: /mnt/xstorage + + workerTemplate: + metadata: + labels: + role: decode-worker + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "scitix.ai/gpu-type": h20xnvlink141 + "roce.scitix.ai/unit": unit1 + containers: + - name: sglang-worker + image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server \ + --host "0.0.0.0" \ + --port 30000 \ + 
--model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \ + --chunked-prefill-size 20480 \ + --page-size 64 \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --disaggregation-mode decode \ + --mem-fraction-static 0.85 \ + --context-length 32768 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ + --cuda-graph-max-bs 64 \ + --max-running-requests 2048 \ + --tp-size 16 \ + --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes ${LWS_GROUP_SIZE} \ + --node-rank ${LWS_WORKER_INDEX} \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --disaggregation-transfer-backend nixl \ + --enable-dp-attention \ + --enable-dp-lm-head \ + --dp-size 8 + + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: MC_TE_METRIC + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 100 + timeoutSeconds: 300 + periodSeconds: 30 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt + name: mnt + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: mnt + hostPath: + path: /mnt/xstorage + + replicas: 1 + rolloutStrategy: + type: RollingUpdate + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + startupPolicy: LeaderCreated \ No newline at end of file diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml new file mode 100644 index 0000000..5cc16ce --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseek-lb + namespace: default + labels: {app: deepseek, tier: lb} +spec: + replicas: 1 + selector: + matchLabels: {app: deepseek, tier: lb} + template: + metadata: + labels: {app: deepseek, tier: lb} + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "roce.scitix.ai/unit": unit1 + containers: + - name: sglang-minilb + image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126 + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + exec python -m sglang.srt.disaggregation.mini_lb \ + --prefill "http://deepseek-prefill-svc:30000" \ + --decode "http://deepseek-decode-svc:30000" \ + --host "0.0.0.0" \ + --port 8000 + ports: + - containerPort: 8000 + name: http + readinessProbe: + tcpSocket: {port: 8000} + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + volumeMounts: + - name: mnt + mountPath: /mnt + volumes: + - name: mnt + hostPath: + path: /mnt/xstorage +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseek-svc + namespace: default +spec: + type: ClusterIP + selector: + app: deepseek + tier: lb + ports: + - name: http + protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml new file mode 100644 index 0000000..4d47e37 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml @@ -0,0 +1,85 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseek-prefill + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: deepseek-prefill + template: + metadata: + labels: + app: deepseek-prefill + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: 
ClusterFirstWithHostNet + nodeSelector: + "scitix.ai/gpu-type": h20xnvlink141 + "roce.scitix.ai/unit": unit1 + containers: + - name: sglang-prefill + image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + set -euxo pipefail + export POD_IP="$(hostname -i)" + exec python3 -m sglang.launch_server \ + --host "0.0.0.0" \ + --port 30000 \ + --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ + --chunked-prefill-size 20480 \ + --page-size 64 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode normal \ + --disaggregation-mode prefill \ + --mem-fraction-static 0.85 \ + --context-length 32768 \ + --tp-size 8 \ + --trust-remote-code \ + --max-running-requests 1024 \ + --disaggregation-transfer-backend nixl + env: + - name: GLOO_SOCKET_IFNAME + value: bond0 + - name: NCCL_SOCKET_IFNAME + value: bond0 + - name: NCCL_IB_GID_INDEX + value: "3" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + - containerPort: 8998 + readinessProbe: + tcpSocket: { port: 30000 } + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - { name: dshm, mountPath: /dev/shm } + - { name: mnt, mountPath: /mnt } + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: mnt + hostPath: + path: /mnt/xstorage \ No newline at end of file diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml new file mode 100644 index 0000000..e1db8e3 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: Service +metadata: + name: deepseek-prefill-svc + namespace: default +spec: + type: ClusterIP + selector: + app: deepseek-prefill + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 + - name: nixl-boot + protocol: TCP + port: 8998 + targetPort: 8998 + +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseek-decode-svc + namespace: default +spec: + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: deepseek-decode + leaderworkerset.sigs.k8s.io/role: leader + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 \ No newline at end of file diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml new file mode 100644 index 0000000..858a3a4 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml @@ -0,0 +1,187 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: alan-deepseek-decode + namespace: t-ai-infra-qqxu03 +spec: + leaderWorkerTemplate: + size: 2 + leaderTemplate: + metadata: + labels: + leaderworkerset.sigs.k8s.io/role: leader + role: decode-leader + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference + containers: + - name: sglang-leader + image: lmsysorg/sglang:v0.5.1.post2-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo 
"LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /data/DeepSeek-R1 \ + --chunked-prefill-size "262144" --page-size "64" \ + --enable-dp-attention --enable-dp-lm-head --dp-size "16" \ + --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --eplb-rebalance-layers-per-chunk "29" \ + --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes 2 --node-rank 0 --moe-dense-tp-size "1" \ + --trust-remote-code --disaggregation-transfer-backend nixl + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + # - name: NCCL_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + # value: "1" + # - name: NVSHMEM_HCA_PE_MAPPING + # value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + # - name: NCCL_IB_QPS_PER_CONNECTION + # value: "8" + # - name: NCCL_IB_SPLIT_DATA_ON_QPS + # value: "1" + # - name: CUDA_LAUNCH_BLOCKING + # value: "0" + # - name: NCCL_NET_PLUGIN + # value: "none" + # - name: NCCL_MIN_NCHANNELS + # value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 300 + timeoutSeconds: 300 + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /data + name: model + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + persistentVolumeClaim: + claimName: siflow-models + + workerTemplate: + metadata: + labels: + role: decode-worker + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference + containers: + - name: sglang-worker + image: lmsysorg/sglang:v0.5.1.post2-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /data/DeepSeek-R1 \ + --chunked-prefill-size "262144" --page-size "64" \ + --enable-dp-attention --enable-dp-lm-head --dp-size "16" \ + --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \ + --trust-remote-code --moe-dense-tp-size "1" --disaggregation-transfer-backend nixl + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + # - name: NVSHMEM_IB_TRAFFIC_CLASS + # value: "16" + # - name: NVSHMEM_IB_GID_INDEX + # value: "3" + # - 
name: NCCL_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + # value: "1" + # - name: NVSHMEM_HCA_PE_MAPPING + # value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + # - name: NCCL_IB_QPS_PER_CONNECTION + # value: "8" + # - name: NCCL_IB_SPLIT_DATA_ON_QPS + # value: "1" + # - name: NCCL_NET_PLUGIN + # value: "none" + # - name: NCCL_MIN_NCHANNELS + # value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 300 + timeoutSeconds: 300 + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /data + name: model + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + persistentVolumeClaim: + claimName: siflow-models + + replicas: 1 + rolloutStrategy: + type: RollingUpdate + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + startupPolicy: LeaderCreated diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml new file mode 100644 index 0000000..9614da9 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alan-deepseek-lb + namespace: t-ai-infra-qqxu03 + labels: {app: alan-deepseek} +spec: + replicas: 1 + selector: + matchLabels: {app: alan-deepseek, tier: lb} + template: + metadata: + labels: {app: alan-deepseek, tier: lb} + spec: + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference + containers: + - name: sgl-minilb + image: lmsysorg/sglang:v0.5.1.post2-cu126 + command: ["python","-m","sglang.srt.disaggregation.mini_lb", + "--prefill","http://alan-deepseek-prefill-svc:30000", + "--prefill-bootstrap-ports","8998", + "--decode","http://alan-deepseek-decode-svc:30000", + "--host","0.0.0.0","--port","8000"] + ports: + - containerPort: 8000 + readinessProbe: + tcpSocket: {port: 8000} + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + volumeMounts: + - name: model + mountPath: /data + volumes: + - name: model + persistentVolumeClaim: + claimName: siflow-models + +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-svc + namespace: t-ai-infra-qqxu03 +spec: + type: ClusterIP + selector: + app: alan-deepseek + tier: lb + ports: + - name: http + protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml new file mode 100644 index 0000000..18b2831 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml @@ -0,0 +1,99 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: alan-deepseek-prefill + namespace: t-ai-infra-qqxu03 +spec: + leaderWorkerTemplate: + size: 1 + leaderTemplate: + metadata: + labels: + app: alan-deepseek + role: prefill-leader + leaderworkerset.sigs.k8s.io/role: leader + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference + containers: + - name: sglang-leader + image: lmsysorg/sglang:v0.5.1.post2-cu126 + command: ["/usr/bin/env","bash","-c"] + args: + - | + set -euxo pipefail + export POD_IP="$(hostname -i)" + exec python3 -m sglang.launch_server \ + --port 30000 \ + --host
"0.0.0.0" \ + --model-path /data/DeepSeek-R1 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 \ + --chunked-prefill-size 524288 \ + --page-size 64 \ + --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal \ + --disaggregation-mode prefill \ + --mem-fraction-static 0.85 --context-length 32768 \ + --tp 8 \ + --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \ + --trust-remote-code --max-running-requests 1024 \ + --disaggregation-transfer-backend nixl + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: GLOO_SOCKET_IFNAME + value: bond0 + - name: NCCL_SOCKET_IFNAME + value: bond0 + - name: NCCL_IB_GID_INDEX + value: "3" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + - containerPort: 8998 # NIXL bootstrap + readinessProbe: + tcpSocket: { port: 30000 } + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - { name: dshm, mountPath: /dev/shm } + - { name: model, mountPath: /data } + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + persistentVolumeClaim: + claimName: siflow-models + workerTemplate: + metadata: + labels: + app: alan-deepseek + role: prefill-worker + leaderworkerset.sigs.k8s.io/role: worker + spec: + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference + containers: + - name: noop + image: busybox + command: ["sh","-c","sleep 3600000"] + diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml new file mode 100644 index 0000000..8223063 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml @@ -0,0 +1,212 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-2decode + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \ + --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_IB_GID_INDEX 
+ value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \ + --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" + env: + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + 
limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2decode-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-2decode + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml new file mode 100644 index 0000000..bf5d960 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-2p2d-lb + namespace: t-hisys-xlliu + labels: + app: deepseekr10528-2p2d-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-2p2d-lb + template: + metadata: + labels: + app: deepseekr10528-2p2d-lb + spec: + containers: + - name: sgl-minilb + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-2prefill-svc:30000 + - --decode + - http://deepseekr10528-2decode-svc:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2p2d-lb-svc + namespace: t-hisys-xlliu +spec: + type: ClusterIP # ClusterIP for in-cluster access; switch to NodePort for quick external testing + selector: + app: deepseekr10528-2p2d-lb + ports: + - protocol: TCP + port: 8000 # service port (in-cluster) + targetPort: 8000 # exposed container port diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml new file mode 100644 index 0000000..d479bd3 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml @@ -0,0 +1,211 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-2prefill + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --chunked-prefill-size "524288" --max-prefill-tokens "32768" \ + --page-size "64" --ep-dispatch-algorithm
dynamic --eplb-algorithm deepseek \ + --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \ + --mem-fraction-static "0.7" --context-length "32768" \ + --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024" + env: + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --chunked-prefill-size "524288" --max-prefill-tokens "32768" \ + --page-size "64" --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek \ + --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \ + --mem-fraction-static "0.7" --context-length "32768" \ + --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024" + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + 
value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2prefill-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-2prefill + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml new file mode 100644 index 0000000..0750ccd --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-prefill-svc + namespace: t-ai-infra-qqxu03 +spec: + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: alan-deepseek-prefill + leaderworkerset.sigs.k8s.io/role: leader + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 + - name: nixl-boot + protocol: TCP + port: 8998 + targetPort: 8998 + +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-decode-svc + namespace: t-ai-infra-qqxu03 +spec: + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: alan-deepseek-decode + leaderworkerset.sigs.k8s.io/role: leader + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml new file mode 100644 index 0000000..684e202 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml @@ -0,0 +1,246 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-decode + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true 
SGLANG_TBO_DEBUG=1 \ + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode low_latency \ + --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \ + --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log + # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256 + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + # - name: NCCL_IB_SL + # value: "5" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + restartPolicy: RecreateGroupOnPodRestart + size: 9 + workerTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 \ + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode low_latency \ + --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \ + --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log + # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256 + env: + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-decode-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml new file mode 100644 index 0000000..2d3267a --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-lb-main + namespace: t-hisys-xlliu + labels: + app: deepseekr10528-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-lb + template: + metadata: + labels: + app: deepseekr10528-lb + spec: + # nodeSelector: + # bo: "yes" + # tolerations: + # - key: bopd + # operator: Exists + # - key: node-role + # operator: Exists + containers: + - name: sgl-minilb + image:
registry-cn-shanghai.siflow.cn/hisys/sglang:latest + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-prefill-svc:30000 + - --decode + - http://deepseekr10528-decode-svc:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-lb-svc + namespace: t-hisys-xlliu +spec: + type: ClusterIP # ClusterIP for in-cluster access; switch to NodePort for quick external testing + selector: + app: deepseekr10528-lb + ports: + - protocol: TCP + port: 8000 # service port (in-cluster) + targetPort: 8000 # exposed container port + # nodePort: 30800 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml new file mode 100644 index 0000000..887676e --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml @@ -0,0 +1,251 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-prefill + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --disaggregation-mode prefill --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --tp-size 32 --dp-size 32 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode normal \ + --mem-fraction-static 0.85 --chunked-prefill-size 524288 \ + --max-running-requests 8192 --max-total-tokens 131072 \ + --context-length 8192 --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log + # --context-length 8192 --init-expert-location YOUR_PATH \ + # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH + env: + - name: NVSHMEM_HCA_PE_MAPPING + # adjust to match your RDMA environment + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_DEBUG + value: INFO + # - name: NCCL_IB_TC + # value: "160" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + # - name: NCCL_IB_SL + # value: "5" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + restartPolicy: RecreateGroupOnPodRestart + size: 4 + workerTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --disaggregation-mode prefill --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --tp-size 32 --dp-size 32 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode normal \ + --mem-fraction-static 0.85 --chunked-prefill-size 524288 \ + --max-running-requests 8192 --max-total-tokens 131072 \ + --context-length 8192 --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log + # --context-length 8192 --init-expert-location YOUR_PATH \ + # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: NVSHMEM_HCA_PE_MAPPING + # adjust to match your RDMA environment + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30000 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-prefill-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml new file mode 100644 index 0000000..9b97c37 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml @@ -0,0 +1,89 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sglang-server + namespace: t-hisys-xlliu +spec: + replicas: 1 + selector: + matchLabels: + app: sglang + template: + metadata: + labels: + app: sglang + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: unit + operator: In + values: + - "2" + restartPolicy: Always + containers: + - name: sglang + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + env: + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: ["python3"] + args: + - "-m" + - "sglang.launch_server" + - "--model" + - "/root/.cache/huggingface/deepseek-ai/DeepSeek-V3-0324" + - "--tp" + - "8" + - "--trust-remote-code" + - "--port" + - "30000" + ports: + - containerPort: 30000 + resources: + limits: + nvidia.com/gpu: 8 + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: huggingface-cache + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + volumes: + - name: huggingface-cache + hostPath: + path: /mnt/xstorage/model + - name: shm + emptyDir: + medium: Memory + sizeLimit: 32Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: sglang-service + namespace: t-hisys-xlliu +spec: + type: NodePort + selector: + app: sglang +
ports: + - port: 30000 + targetPort: 30000 + nodePort: 30000 diff --git a/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml new file mode 100644 index 0000000..1ccddd6 --- /dev/null +++ b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml @@ -0,0 +1,381 @@ +# vllm-2p2d-all.yaml + +--- +# Prefill A +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-prefill-a + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-prefill-a + template: + metadata: + labels: + app: vllm-prefill-a + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: prefill-a + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + export VLLM_IS_PREFILL=1 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + export NCCL_DEBUG=INFO + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8100 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Prefill B +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-prefill-b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-prefill-b + template: + metadata: + labels: + app: vllm-prefill-b + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: prefill-b + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + export VLLM_IS_PREFILL=1 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5558 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8101 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + 
--block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Decode A +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-decode-a + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-decode-a + template: + metadata: + labels: + app: vllm-decode-a + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: decode-a + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + export VLLM_IS_PREFILL=0 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5559 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8200 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Decode B +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-decode-b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-decode-b + template: + metadata: + labels: + app: vllm-decode-b + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, 
gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: decode-b + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + export VLLM_IS_PREFILL=0 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5560 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8201 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Proxy +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-proxy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-proxy + template: + metadata: + labels: + app: vllm-proxy + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - name: proxy + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + python3 /data/vllm/toy_proxy_server.py \ + --port 8192 \ + --prefiller-port 8100 8101 \ + --decoder-port 8200 8201 & + sleep inf + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm diff --git a/thirdparty/sglang b/thirdparty/sglang new file mode 160000 index 0000000..8604471 --- /dev/null +++ b/thirdparty/sglang @@ -0,0 +1 @@ +Subproject commit 86044712c6492df3ceb5a5cf025a575ab3989061 diff --git a/thirdparty/vllm b/thirdparty/vllm new file mode 160000 index 0000000..8020e98 --- /dev/null +++ b/thirdparty/vllm @@ -0,0 +1 @@ +Subproject commit 8020e98c9f033e76c97eb8261f772d59eba49c9a