@@ -177,7 +177,6 @@ spec:
177177 terminationGracePeriodSeconds : 130
178178 nodeSelector :
179179 cloud.google.com/gke-accelerator : " nvidia-h100-80gb"
180-
181180 volumes :
182181 - name : data
183182 emptyDir : {}
@@ -250,40 +249,133 @@ spec:
250249 spec :
251250 terminationGracePeriodSeconds : 130
252251 containers :
253- - name : epp
254- image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
255- imagePullPolicy : Always
256- args :
257- - -poolName
258- - " vllm-llama3-8b-instruct-new"
259- - " -poolNamespace"
260- - " default"
261- - -v
262- - " 4"
263- - --zap-encoder
264- - " json"
265- - -grpcPort
266- - " 9002"
267- - -grpcHealthPort
268- - " 9003"
269- ports :
270- - containerPort : 9002
271- - containerPort : 9003
272- - name : metrics
273- containerPort : 9090
274- livenessProbe :
275- grpc :
276- port : 9003
277- service : inference-extension
278- initialDelaySeconds : 5
279- periodSeconds : 10
280- readinessProbe :
281- grpc :
282- port : 9003
283- service : inference-extension
284- initialDelaySeconds : 5
285- periodSeconds : 10
286- EOF
252+ - name : epp
253+ image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
254+ imagePullPolicy : Always
255+ args :
256+ - -poolName
257+ - " vllm-llama3-8b-instruct-new"
258+ - -poolNamespace
259+ - " default"
260+ - -v
261+ - " 4"
262+ - --zap-encoder
263+ - " json"
264+ - -grpcPort
265+ - " 9002"
266+ - -grpcHealthPort
267+ - " 9003"
268+ - -configFile
269+ - " /config/default-plugins.yaml"
270+ ports :
271+ - containerPort : 9002
272+ name : grpc
273+ - containerPort : 9003
274+ name : grpc-health
275+ - containerPort : 9090
276+ name : metrics
277+ livenessProbe :
278+ grpc :
279+ port : 9003
280+ service : inference-extension
281+ initialDelaySeconds : 5
282+ periodSeconds : 10
283+ readinessProbe :
284+ grpc :
285+ port : 9003
286+ service : inference-extension
287+ initialDelaySeconds : 5
288+ periodSeconds : 10
289+ volumeMounts :
290+ - name : plugins-config-volume
291+ mountPath : /config
292+ volumes :
293+ - name : plugins-config-volume
294+ configMap :
295+ name : plugins-config
296+ ---
297+ apiVersion : v1
298+ kind : ConfigMap
299+ metadata :
300+ name : plugins-config
301+ namespace : default
302+ data :
303+ default-plugins.yaml : |
304+ apiVersion: inference.networking.x-k8s.io/v1alpha1
305+ kind: EndpointPickerConfig
306+ plugins:
307+ - type: low-queue-filter
308+ parameters:
309+ threshold: 128
310+ - type: lora-affinity-filter
311+ parameters:
312+ threshold: 0.999
313+ - type: least-queue-filter
314+ - type: least-kv-cache-filter
315+ - type: decision-tree-filter
316+ name: low-latency-filter
317+ parameters:
318+ current:
319+ pluginRef: low-queue-filter
320+ nextOnSuccess:
321+ decisionTree:
322+ current:
323+ pluginRef: lora-affinity-filter
324+ nextOnSuccessOrFailure:
325+ decisionTree:
326+ current:
327+ pluginRef: least-queue-filter
328+ nextOnSuccessOrFailure:
329+ decisionTree:
330+ current:
331+ pluginRef: least-kv-cache-filter
332+ nextOnFailure:
333+ decisionTree:
334+ current:
335+ pluginRef: least-queue-filter
336+ nextOnSuccessOrFailure:
337+ decisionTree:
338+ current:
339+ pluginRef: lora-affinity-filter
340+ nextOnSuccessOrFailure:
341+ decisionTree:
342+ current:
343+ pluginRef: least-kv-cache-filter
344+ - type: random-picker
345+ parameters:
346+ maxNumOfEndpoints: 1
347+ - type: single-profile-handler
348+ schedulingProfiles:
349+ - name: default
350+ plugins:
351+ - pluginRef: low-latency-filter
352+ - pluginRef: random-picker
353+ plugins-v2.yaml : |
354+ apiVersion: inference.networking.x-k8s.io/v1alpha1
355+ kind: EndpointPickerConfig
356+ plugins:
357+ - type: queue-scorer
358+ - type: kv-cache-scorer
359+ - type: prefix-cache-scorer
360+ parameters:
361+ hashBlockSize: 64
362+ maxPrefixBlocksToMatch: 256
363+ lruCapacityPerServer: 31250
364+ - type: max-score-picker
365+ parameters:
366+ maxNumOfEndpoints: 1
367+ - type: single-profile-handler
368+ schedulingProfiles:
369+ - name: default
370+ plugins:
371+ - pluginRef: queue-scorer
372+ weight: 1
373+ - pluginRef: kv-cache-scorer
374+ weight: 1
375+ - pluginRef: prefix-cache-scorer
376+ weight: 1
377+ - pluginRef: max-score-picker
378+ EOF
287379```
288380
289381### Direct traffic to the new inference pool
0 commit comments