Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
ecfe869
remove empty request_test.go file. (#796)
nirrozenbaum May 8, 2025
2ed990b
Clean up filters (#802)
liu-cong May 8, 2025
cb52769
Refactor: Improve env utility (#803)
LukeAVanDrie May 8, 2025
d212757
refactor scheduler filters package (#797)
nirrozenbaum May 8, 2025
2b66451
fix labels not cloned bug (#804)
nirrozenbaum May 8, 2025
7beb471
fixed datastore bug to clean all go routines when pool is unset. (#810)
nirrozenbaum May 9, 2025
4029a37
Optimize Dockerfile for Multiple Extensions (#811)
GunaKKIBM May 9, 2025
2dce3ea
merge has capacity filter with sheddable filter. (#809)
nirrozenbaum May 9, 2025
64a37d1
feat(conformance): Add initial InferencePool tests and shared Gateway…
SinaChavoshi May 9, 2025
10ec261
Add prefix cache aware scheduling (#768)
liu-cong May 10, 2025
62f226c
merge functions in env utils (#819)
nirrozenbaum May 11, 2025
bc29bd0
generalize scheduling cycle state concept (#818)
nirrozenbaum May 11, 2025
8df511a
remove Model field from LLMRequest (#782)
nirrozenbaum May 11, 2025
80ce385
feat: Add support to invoke PostResponse plugins (#800)
shmuelk May 12, 2025
baf3d7d
Add prefix aware request scheduling proposal (#602)
liu-cong May 12, 2025
7207ed6
Docs: Bumps Kgateway to v2.0.2 (#823)
danehans May 12, 2025
519bee8
renamed Metrics to MetricsState and move to a separate file (#822)
nirrozenbaum May 12, 2025
8687feb
feat: Add build reference to the info metrics (#817)
JeffLuoo May 12, 2025
3d99aa1
Introduce SaturationDetector component (#808)
LukeAVanDrie May 13, 2025
2b2b4a6
support extracting prompt from chat completions API (#798)
delavet May 13, 2025
8baf74c
Fix: Add sleep to TestMetricsRefresh for flakes. (#824)
LukeAVanDrie May 13, 2025
1f62b02
chore(conformance): Add timeout configuration (#795)
SinaChavoshi May 13, 2025
409fc3f
Scheduler subsystem high level design proposal (#603)
smarterclayton May 14, 2025
c2e3fa9
Updating Readme (#831)
kfswain May 14, 2025
5f95113
Update index.md (#836)
alexsnaps May 15, 2025
97bad77
docs: roll out guide (#829)
capri-xiyue May 15, 2025
77f8564
reduce log level of "prefix cached servers" to TRACE (#842)
nirrozenbaum May 15, 2025
7ef0ab1
merge https://github.com/AI-Hypercomputer/inference-benchmark/tree/46…
kaushikmitr May 15, 2025
6e8a2ef
fixed log before picker (#844)
nirrozenbaum May 15, 2025
7c63c0d
Reorganize scheduling plugins (#837)
liu-cong May 16, 2025
46c5c5e
updated godoc on filters, pickers and prefix. (#850)
nirrozenbaum May 18, 2025
e8834c3
Fix: Ignore header order in hermetic test (#849)
LukeAVanDrie May 18, 2025
bd457e1
Bump the kubernetes group with 6 updates (#851)
dependabot[bot] May 20, 2025
03a4177
Bump github.com/prometheus/common from 0.63.0 to 0.64.0 (#853)
dependabot[bot] May 20, 2025
9f15441
Updating readme to show llm-d collab (#855)
kfswain May 20, 2025
acb21c7
fix: typo ('endpoing' -> 'endpoint') (#857)
t3hmrman May 20, 2025
8958028
Updating readme wording (#858)
kfswain May 20, 2025
87b3a08
adding logging & support for better response when requests are not va…
kfswain May 20, 2025
70285f1
Adding util func for splitting large bodies into chunks (#859)
kfswain May 21, 2025
8770afe
Scheduler config refactor for simplifying plugins registration (#835)
nirrozenbaum May 21, 2025
a5bf0ac
wiring up chunked response logic (#860)
kfswain May 21, 2025
d55ead7
feat: merge two metric servers (#728)
nayihz May 22, 2025
ed32a43
docs: added examples to address various generative AI application sce…
capri-xiyue May 22, 2025
28229bf
docs: Update link to Slack channel (#867)
terrytangyuan May 23, 2025
5bc7425
Multi cycle scheduler (#862)
nirrozenbaum May 23, 2025
48b6c97
feat(conformance): Add test for HTTPRouteInvalidInferencePoolRef (#807)
SinaChavoshi May 27, 2025
440ca87
feat(conformance): tests for inferencepool_resolvedrefs_condition (#832)
SinaChavoshi May 28, 2025
856af6a
Update `002-api-proposal/` to reflect `api/v1alpha2` inferencePool a…
shotarok May 28, 2025
60c4674
use namespacedname instead of name/namespace as separate args (#873)
nirrozenbaum May 28, 2025
7c830cb
remove the PreCycle plugin from scheduler (#876)
nirrozenbaum May 28, 2025
a1b7f59
feat(conformance): Update InferencePoolResolvedRefsCondition test for…
SinaChavoshi May 28, 2025
8d4c23f
minor changes to saturation detector (#882)
nirrozenbaum May 29, 2025
3491ddd
Add flow controller.
LukeAVanDrie Apr 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@ ENV CGO_ENABLED=0
ENV GOOS=linux
ENV GOARCH=amd64
ARG COMMIT_SHA=unknown
ARG BUILD_REF

# Dependencies
WORKDIR /src
COPY go.mod go.sum ./
RUN go mod download

# Sources
COPY cmd ./cmd
COPY pkg ./pkg
COPY cmd/epp ./cmd
COPY pkg/epp ./pkg/epp
COPY internal ./internal
COPY api ./api
WORKDIR /src/cmd/epp
RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA}" -o /epp
WORKDIR /src/cmd
RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" -o /epp

## Multistage deploy
FROM ${BASE_IMAGE}
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ ifdef GO_VERSION
BUILDER_IMAGE = golang:$(GO_VERSION)
endif

BUILD_REF ?= $(shell git describe --abbrev=0 2>/dev/null)
ifdef EXTRA_TAG
IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
BBR_IMAGE_EXTRA_TAG ?= $(BBR_IMAGE_REPO):$(EXTRA_TAG)
BUILD_REF = $(EXTRA_TAG)
endif
ifdef IMAGE_EXTRA_TAG
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
Expand Down Expand Up @@ -177,6 +179,7 @@ image-build: ## Build the EPP image using Docker Buildx.
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
--build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
--build-arg BUILD_REF=${BUILD_REF} \
$(PUSH) \
$(LOAD) \
$(IMAGE_BUILD_EXTRA_OPTS) ./
Expand Down
63 changes: 38 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,35 @@
[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)

# Gateway API Inference Extension (GIE)
# Gateway API Inference Extension

This project offers tools for AI Inference, enabling developers to build [Inference Gateways].
Gateway API Inference Extension optimizes self-hosting Generative Models on Kubernetes.
This is achieved by leveraging Envoy's [External Processing] (ext-proc) to extend any gateway that supports both ext-proc and [Gateway API] into an **[inference gateway]**.

[Inference Gateways]:#concepts-and-definitions
[Inference Gateway]:#concepts-and-definitions

## New!
Inference Gateway has partnered with vLLM to accelerate LLM serving optimizations with [llm-d](https://llm-d.ai/blog/llm-d-announce)!

## Concepts and Definitions

The following are some key industry terms that are important to understand for
The following are terms specific to this project:

- **Inference Gateway (IGW)**: A proxy/load-balancer which has been coupled with an
`Endpoint Picker`. It provides optimized routing and load balancing for
serving Kubernetes self-hosted generative Artificial Intelligence (AI)
workloads. It simplifies the deployment, management, and observability of AI
inference workloads.
- **Inference Scheduler**: An extendable component that makes decisions about which endpoint is optimal (best cost /
best performance) for an inference request based on `Metrics and Capabilities`
from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
- **Metrics and Capabilities**: Data provided by model serving platforms about
performance, availability and capabilities to optimize routing. Includes
things like [Prefix Cache] status or [LoRA Adapters] availability.
- **Endpoint Picker(EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).


The following are key industry terms that are important to understand for
this project:

- **Model**: A generative AI model that has learned patterns from data and is
Expand All @@ -26,36 +46,22 @@ this project:
(GPUs) that can be attached to Kubernetes nodes to speed up computations,
particularly for training and inference tasks.

And the following are more specific terms to this project:

- **Scheduler**: Makes decisions about which endpoint is optimal (best cost /
best performance) for an inference request based on `Metrics and Capabilities`
from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
- **Metrics and Capabilities**: Data provided by model serving platforms about
performance, availability and capabilities to optimize routing. Includes
things like [Prefix Cache] status or [LoRA Adapters] availability.
- **Endpoint Selector**: A `Scheduler` combined with `Metrics and Capabilities`
systems is often referred to together as an [Endpoint Selection Extension]
(this is also sometimes referred to as an "endpoint picker", or "EPP").
- **Inference Gateway**: A proxy/load-balancer which has been coupled with a
`Endpoint Selector`. It provides optimized routing and load balancing for
serving Kubernetes self-hosted generative Artificial Intelligence (AI)
workloads. It simplifies the deployment, management, and observability of AI
inference workloads.

For deeper insights and more advanced concepts, refer to our [proposals](/docs/proposals).

[Inference]:https://www.digitalocean.com/community/tutorials/llm-inference-optimization
[Gateway API]:https://github.com/kubernetes-sigs/gateway-api
[Prefix Cache]:https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html
[LoRA Adapters]:https://docs.vllm.ai/en/stable/features/lora.html
[Endpoint Selection Extension]:https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension
[External Processing]:https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter



## Technical Overview

This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.
This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **[inference gateway]** - supporting inference platform teams self-hosting Generative Models (with a current focus on large language models) on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

The inference gateway:
The Inference Gateway:

* Improves the tail latency and throughput of LLM completion requests against Kubernetes-hosted model servers using an extensible request scheduling algorithm that is kv-cache and request cost aware, avoiding evictions or queueing as load increases
* Provides [Kubernetes-native declarative APIs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/api-overview/) to route client model names to use-case specific LoRA adapters and control incremental rollout of new adapter versions, A/B traffic splitting, and safe blue-green base model and model server upgrades
Expand All @@ -64,7 +70,14 @@ The inference gateway:

![Architecture Diagram](./docs/inference-gateway-architecture.svg)

It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon.
### Model Server Integration

IGW’s pluggable architecture was leveraged to enable the [llm-d Inference Scheduler](https://github.com/llm-d/llm-d-inference-scheduler).

Llm-d customizes vLLM & IGW to create a disaggregated serving solution. We've worked closely with this team to enable this integration. IGW will continue to work closely with llm-d to generalize the disaggregated serving plugin(s), & set a standard for disaggregated serving to be used across any [protocol-adherent](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) model server.

IGW has enhanced support for vLLM via llm-d, and broad support for any model servers implementing the protocol. More details can be found in [model server integration](https://gateway-api-inference-extension.sigs.k8s.io/implementations/model-servers/).


## Status

Expand Down Expand Up @@ -97,7 +110,7 @@ Follow this [README](./test/e2e/epp/README.md) to learn more about running the i

Our community meeting is weekly at Thursday 10AM PDT ([Zoom](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://www.google.com/url?q=https://docs.google.com/document/d/1frfPE5L1sI3737rdQV04IcDGeOcGJj2ItjMg6z2SRH0/edit?usp%3Dsharing&sa=D&source=calendar&usd=2&usg=AOvVaw1pUVy7UN_2PMj8qJJcFm1U)).

We currently utilize the [#wg-serving](https://kubernetes.slack.com/?redir=%2Fmessages%2Fwg-serving) slack channel for communications.
We currently utilize the [#gateway-api-inference-extension](https://kubernetes.slack.com/?redir=%2Fmessages%2Fgateway-api-inference-extension) channel in the Kubernetes Slack workspace for communications.

Contributions are readily welcomed, follow the [dev guide](./docs/dev.md) to start contributing!

Expand Down
4 changes: 2 additions & 2 deletions bbr.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ COPY go.mod go.sum ./
RUN go mod download

# Sources
COPY cmd ./cmd
COPY cmd/bbr ./cmd
COPY pkg ./pkg
COPY internal ./internal
WORKDIR /src/cmd/bbr
WORKDIR /src/cmd
RUN go build -o /bbr

## Multistage deploy
Expand Down
4 changes: 3 additions & 1 deletion cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ steps:
- GIT_TAG=$_GIT_TAG
- EXTRA_TAG=$_PULL_BASE_REF
- DOCKER_BUILDX_CMD=/buildx-entrypoint
- GIT_COMMIT_SHA=$COMMIT_SHA
- GIT_COMMIT_SHA=$_PULL_BASE_SHA
- name: gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20240718-5ef92b5c36
entrypoint: make
args:
Expand Down Expand Up @@ -44,5 +44,7 @@ substitutions:
# _PULL_BASE_REF will contain the ref that was pushed to trigger this build -
# a branch like 'main' or 'release-0.2', or a tag like 'v0.2'.
_PULL_BASE_REF: 'main'
# _PULL_BASE_SHA will contain the Git SHA of the commit that was pushed to trigger this build.
_PULL_BASE_SHA: 'abcdef'
options:
substitution_option: ALLOW_LOOSE
84 changes: 16 additions & 68 deletions cmd/bbr/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,23 @@ package main

import (
"flag"
"net"
"net/http"
"fmt"
"os"
"strconv"

"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus/promhttp"
uberzap "go.uber.org/zap"
"go.uber.org/zap/zapcore"
"google.golang.org/grpc"
healthPb "google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/client-go/rest"
"k8s.io/component-base/metrics/legacyregistry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
"sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics"
runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

Expand Down Expand Up @@ -85,7 +82,18 @@ func run() error {
return err
}

mgr, err := ctrl.NewManager(cfg, ctrl.Options{})
metrics.Register()

// Register metrics handler.
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
// More info:
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
// - https://book.kubebuilder.io/reference/metrics.html
metricsServerOptions := metricsserver.Options{
BindAddress: fmt.Sprintf(":%d", *metricsPort),
FilterProvider: filters.WithAuthenticationAndAuthorization,
}
mgr, err := ctrl.NewManager(cfg, ctrl.Options{Metrics: metricsServerOptions})
if err != nil {
setupLog.Error(err, "Failed to create manager", "config", cfg)
return err
Expand All @@ -107,11 +115,6 @@ func run() error {
return err
}

// Register metrics handler.
if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil {
return err
}

// Start the manager. This blocks until a signal is received.
setupLog.Info("Manager starting")
if err := mgr.Start(ctx); err != nil {
Expand Down Expand Up @@ -152,58 +155,3 @@ func initLogging(opts *zap.Options) {
logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller()))
ctrl.SetLogger(logger)
}

const metricsEndpoint = "/metrics"

// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager.
// It registers the BBR Prometheus collectors, wraps the /metrics endpoint with
// kube-style authentication/authorization, and ties the HTTP server's lifecycle
// to the manager so it starts and stops with the other runnables.
// Returns an error if the auth-wrapped handler cannot be built or the server
// cannot be added to the manager.
func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error {
metrics.Register()

// Init HTTP server.
// Build the /metrics handler guarded by authn/authz derived from the rest.Config.
h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg)
if err != nil {
return err
}

mux := http.NewServeMux()
mux.Handle(metricsEndpoint, h)

srv := &http.Server{
// Empty host binds on all interfaces at the given port.
Addr: net.JoinHostPort("", strconv.Itoa(port)),
Handler: mux,
}

// manager.Server runs the HTTP server as a manager Runnable, giving it
// graceful shutdown when the manager's context is cancelled.
if err := mgr.Add(&manager.Server{
Name: "metrics",
Server: srv,
}); err != nil {
setupLog.Error(err, "Failed to register metrics HTTP handler")
return err
}
return nil
}

// metricsHandlerWithAuthenticationAndAuthorization returns an http.Handler that
// serves Prometheus metrics from the legacy registry, wrapped with
// controller-runtime's authentication/authorization filter so that only
// authorized clients can scrape the endpoint.
// It builds an HTTP client from the provided rest.Config for the filter's
// token/subject-access reviews. Errors at each construction step are logged
// via setupLog and returned to the caller.
func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) {
// Base handler: exposes everything gathered by the legacy (global) registry.
h := promhttp.HandlerFor(
legacyregistry.DefaultGatherer,
promhttp.HandlerOpts{},
)
httpClient, err := rest.HTTPClientFor(cfg)
if err != nil {
setupLog.Error(err, "Failed to create http client for metrics auth")
return nil, err
}

// filters.WithAuthenticationAndAuthorization yields a filter that performs
// TokenReview/SubjectAccessReview against the cluster for each request.
filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient)
if err != nil {
setupLog.Error(err, "Failed to create metrics filter for auth")
return nil, err
}
metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", metricsEndpoint)
// Wrap the base handler with the auth filter; requests failing authn/authz
// are rejected before reaching the Prometheus handler.
metricsAuthHandler, err := filter(metricsLogger, h)
if err != nil {
setupLog.Error(err, "Failed to create metrics auth handler")
return nil, err
}
return metricsAuthHandler, nil
}
Loading