conformance/resources/base.yaml: 205 additions & 2 deletions
@@ -200,7 +200,7 @@ spec:
terminationGracePeriodSeconds: 130
containers:
- name: epp
- image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v1.0.0
+ image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v20251023-d788a2c
imagePullPolicy: Always
args:
- --pool-name
@@ -298,7 +298,7 @@ spec:
terminationGracePeriodSeconds: 130
containers:
- name: epp
- image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v1.0.0
+ image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v20251023-d788a2c
imagePullPolicy: Always
args:
- --pool-name
@@ -340,6 +340,209 @@ spec:
configMap:
name: plugins-config
---
# --- Data Parallelism (DP) backend deployment: 3 pods, each listening on three ports to simulate ranks ---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dp-inference-model-server-deployment
namespace: inference-conformance-app-backend
labels:
app: dp-inference-model-server
spec:
replicas: 3
selector:
matchLabels:
app: dp-inference-model-server
template:
metadata:
labels:
app: dp-inference-model-server
spec:
containers:
- name: echoserver-3000
image: gcr.io/k8s-staging-gateway-api/echo-basic:v20240412-v1.0.0-394-g40c666fd
Contributor comment: Other than your use of non-contiguous ports here, why not use llm-d-inference-sim, which supports --data-parallel-size=N? (A sketch of that alternative follows this Deployment.)

ports:
- containerPort: 3000
readinessProbe:
httpGet:
path: /
port: 3000
initialDelaySeconds: 3
periodSeconds: 5
failureThreshold: 2
env:
- name: HTTP_PORT # Default port for HTTP echo server
value: "3000"
- name: H2C_PORT # Default port for H2C echo server
value: "3001"
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: echoserver-3002
image: gcr.io/k8s-staging-gateway-api/echo-basic:v20240412-v1.0.0-394-g40c666fd
ports:
- containerPort: 3002
readinessProbe:
httpGet:
path: /
port: 3002
initialDelaySeconds: 3
periodSeconds: 5
failureThreshold: 2
env:
- name: HTTP_PORT
value: "3002"
- name: H2C_PORT
value: "3003"
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: echoserver-3004
image: gcr.io/k8s-staging-gateway-api/echo-basic:v20240412-v1.0.0-394-g40c666fd
ports:
- containerPort: 3004
readinessProbe:
httpGet:
path: /
port: 3004
initialDelaySeconds: 3
periodSeconds: 5
failureThreshold: 2
env:
- name: HTTP_PORT
value: "3004"
- name: H2C_PORT
value: "3005"
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
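As an illustration of the reviewer's llm-d-inference-sim suggestion above, a single-container variant might look roughly like the sketch below. This is not taken from the PR: the image reference and tag, the --port and --model flags, and the model name are all assumptions; only --data-parallel-size is named in the review comment.

containers:
- name: llm-d-inference-sim
  image: ghcr.io/llm-d/llm-d-inference-sim:latest  # assumed image location and tag
  args:
  - --port
  - "8000"  # assumed base serving port
  - --model
  - test-model  # hypothetical model name
  - --data-parallel-size
  - "3"  # simulate three DP ranks, analogous to the three echo containers above
  ports:
  - containerPort: 8000

The manifest above instead keeps the explicit three-echo-container layout, presumably so the conformance suite can exercise non-contiguous ports, which the simulator's contiguous rank layout would not produce.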
---
# --- Data Parallelism (DP) InferencePool Definition ---
apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
name: dp-inference-pool
namespace: inference-conformance-app-backend
spec:
selector:
matchLabels:
app: dp-inference-model-server
targetPorts:
- number: 3000
- number: 3002
- number: 3004
Contributor comment: While this is an interesting configuration, I don't think you could do this with a real vLLM server. (A single-port contrast sketch follows this InferencePool.)

endpointPickerRef:
name: dp-endpoint-picker-svc
port:
number: 9002
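For contrast with the multi-port pool above, a pool backed by a real vLLM server would typically target a single serving port per pod. A minimal sketch, with hypothetical pool and label names; 8000 is vLLM's default OpenAI-compatible serving port:

apiVersion: inference.networking.k8s.io/v1
kind: InferencePool
metadata:
  name: vllm-inference-pool  # hypothetical pool name
  namespace: inference-conformance-app-backend
spec:
  selector:
    matchLabels:
      app: vllm-model-server  # hypothetical backend label
  targetPorts:
  - number: 8000  # single serving port per pod
  endpointPickerRef:
    name: dp-endpoint-picker-svc
    port:
      number: 9002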
---
# --- Data Parallelism (DP) Conformance EPP service Definition ---
apiVersion: v1
kind: Service
metadata:
name: dp-endpoint-picker-svc
namespace: inference-conformance-app-backend
spec:
selector:
app: dp-app-backend-epp
ports:
- protocol: TCP
port: 9002
targetPort: 9002
appProtocol: http2
type: ClusterIP
---
# --- Data Parallelism (DP) Conformance EPP Deployment ---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dp-app-endpoint-picker
namespace: inference-conformance-app-backend
labels:
app: dp-app-backend-epp
spec:
replicas: 1
selector:
matchLabels:
app: dp-app-backend-epp
template:
metadata:
labels:
app: dp-app-backend-epp
spec:
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
terminationGracePeriodSeconds: 130
containers:
- name: epp
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v20251023-d788a2c
imagePullPolicy: Always
args:
- --pool-name
- "dp-inference-pool"
- --pool-namespace
- "inference-conformance-app-backend"
- --v
- "4"
- --zap-encoder
- "json"
- --grpc-port
- "9002"
- --grpc-health-port
- "9003"
- "--config-file"
- "/config/conformance-plugins.yaml"
ports:
- containerPort: 9002
- containerPort: 9003
- name: metrics
containerPort: 9090
livenessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
volumeMounts:
- name: plugins-config-volume
mountPath: "/config"
volumes:
- name: plugins-config-volume
configMap:
name: plugins-config
---
apiVersion: v1
kind: ConfigMap
metadata: