Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
ecfe869
remove empty request_test.go file. (#796)
nirrozenbaum May 8, 2025
2ed990b
Clean up filters (#802)
liu-cong May 8, 2025
cb52769
Refactor: Improve env utility (#803)
LukeAVanDrie May 8, 2025
d212757
refactor scheduler filters package (#797)
nirrozenbaum May 8, 2025
2b66451
fix labels not cloned bug (#804)
nirrozenbaum May 8, 2025
7beb471
fixed datastore bug to clean all go routines when pool is unset. (#810)
nirrozenbaum May 9, 2025
4029a37
Optimize Dockerfile for Multiple Extensions (#811)
GunaKKIBM May 9, 2025
2dce3ea
merge has capacity filter with sheddable filter. (#809)
nirrozenbaum May 9, 2025
64a37d1
feat(conformance): Add initial InferencePool tests and shared Gateway…
SinaChavoshi May 9, 2025
10ec261
Add prefix cache aware scheduling (#768)
liu-cong May 10, 2025
62f226c
merge functions in env utils (#819)
nirrozenbaum May 11, 2025
bc29bd0
generalize scheduling cycle state concept (#818)
nirrozenbaum May 11, 2025
8df511a
remove Model field from LLMRequest (#782)
nirrozenbaum May 11, 2025
80ce385
feat: Add support to invoke PostResponse plugins (#800)
shmuelk May 12, 2025
baf3d7d
Add prefix aware request scheduling proposal (#602)
liu-cong May 12, 2025
7207ed6
Docs: Bumps Kgateway to v2.0.2 (#823)
danehans May 12, 2025
519bee8
renamed Metrics to MetricsState and move to a separate file (#822)
nirrozenbaum May 12, 2025
8687feb
feat: Add build reference to the info metrics (#817)
JeffLuoo May 12, 2025
3d99aa1
Introduce SaturationDetector component (#808)
LukeAVanDrie May 13, 2025
2b2b4a6
support extracting prompt from chat completions API (#798)
delavet May 13, 2025
8baf74c
Fix: Add sleep to TestMetricsRefresh for flakes. (#824)
LukeAVanDrie May 13, 2025
1f62b02
chore(conformance): Add timeout configuration (#795)
SinaChavoshi May 13, 2025
409fc3f
Scheduler subsystem high level design proposal (#603)
smarterclayton May 14, 2025
c2e3fa9
Updating Readme (#831)
kfswain May 14, 2025
5f95113
Update index.md (#836)
alexsnaps May 15, 2025
97bad77
docs: roll out guide (#829)
capri-xiyue May 15, 2025
77f8564
reduce log level of "prefix cached servers" to TRACE (#842)
nirrozenbaum May 15, 2025
7ef0ab1
merge https://github.com/AI-Hypercomputer/inference-benchmark/tree/46…
kaushikmitr May 15, 2025
6e8a2ef
fixed log before picker (#844)
nirrozenbaum May 15, 2025
7c63c0d
Reorganize scheduling plugins (#837)
liu-cong May 16, 2025
46c5c5e
updated godoc on filters, pickers and prefix. (#850)
nirrozenbaum May 18, 2025
e8834c3
Fix: Ignore header order in hermetic test (#849)
LukeAVanDrie May 18, 2025
bd457e1
Bump the kubernetes group with 6 updates (#851)
dependabot[bot] May 20, 2025
03a4177
Bump github.com/prometheus/common from 0.63.0 to 0.64.0 (#853)
dependabot[bot] May 20, 2025
9f15441
Updating readme to show llm-d collab (#855)
kfswain May 20, 2025
acb21c7
fix: typo ('endpoing' -> 'endpoint') (#857)
t3hmrman May 20, 2025
8958028
Updating readme wording (#858)
kfswain May 20, 2025
87b3a08
adding logging & support for better response when requests are not va…
kfswain May 20, 2025
70285f1
Adding util func for splitting large bodies into chunks (#859)
kfswain May 21, 2025
8770afe
Scheduler config refactor for simplifying plugins registration (#835)
nirrozenbaum May 21, 2025
a5bf0ac
wiring up chunked response logic (#860)
kfswain May 21, 2025
d55ead7
feat: merge two metric servers (#728)
nayihz May 22, 2025
ed32a43
docs: added examples to address various generative AI application sce…
capri-xiyue May 22, 2025
28229bf
docs: Update link to Slack channel (#867)
terrytangyuan May 23, 2025
5bc7425
Multi cycle scheduler (#862)
nirrozenbaum May 23, 2025
48b6c97
feat(conformance): Add test for HTTPRouteInvalidInferencePoolRef (#807)
SinaChavoshi May 27, 2025
440ca87
feat(conformance): tests for inferencepool_resolvedrefs_condition (#832)
SinaChavoshi May 28, 2025
856af6a
Update `002-api-proposal/` to reflect `api/v1alpha2` inferencePool a…
shotarok May 28, 2025
60c4674
use namespacedname instead of name/namespace as separate args (#873)
nirrozenbaum May 28, 2025
7c830cb
remove the PreCycle plugin from scheduler (#876)
nirrozenbaum May 28, 2025
a1b7f59
feat(conformance): Update InferencePoolResolvedRefsCondition test for…
SinaChavoshi May 28, 2025
8d4c23f
minor changes to saturation detector (#882)
nirrozenbaum May 29, 2025
3491ddd
Add flow controller.
LukeAVanDrie Apr 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@ ENV CGO_ENABLED=0
ENV GOOS=linux
ENV GOARCH=amd64
ARG COMMIT_SHA=unknown
ARG BUILD_REF

# Dependencies
WORKDIR /src
COPY go.mod go.sum ./
RUN go mod download

# Sources
COPY cmd ./cmd
COPY pkg ./pkg
COPY cmd/epp ./cmd
COPY pkg/epp ./pkg/epp
COPY internal ./internal
COPY api ./api
WORKDIR /src/cmd/epp
RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA}" -o /epp
WORKDIR /src/cmd
RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" -o /epp

## Multistage deploy
FROM ${BASE_IMAGE}
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ ifdef GO_VERSION
BUILDER_IMAGE = golang:$(GO_VERSION)
endif

BUILD_REF ?= $(shell git describe --abbrev=0 2>/dev/null)
ifdef EXTRA_TAG
IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
BBR_IMAGE_EXTRA_TAG ?= $(BBR_IMAGE_REPO):$(EXTRA_TAG)
BUILD_REF = $(EXTRA_TAG)
endif
ifdef IMAGE_EXTRA_TAG
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
Expand Down Expand Up @@ -177,6 +179,7 @@ image-build: ## Build the EPP image using Docker Buildx.
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
--build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
--build-arg BUILD_REF=${BUILD_REF} \
$(PUSH) \
$(LOAD) \
$(IMAGE_BUILD_EXTRA_OPTS) ./
Expand Down
63 changes: 38 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,35 @@
[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension)
[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE)

# Gateway API Inference Extension (GIE)
# Gateway API Inference Extension

This project offers tools for AI Inference, enabling developers to build [Inference Gateways].
Gateway API Inference Extension optimizes self-hosting Generative Models on Kubernetes.
This is achieved by leveraging Envoy's [External Processing] (ext-proc) to extend any gateway that supports both ext-proc and [Gateway API] into an **[inference gateway]**.

[Inference Gateways]:#concepts-and-definitions
[Inference Gateway]:#concepts-and-definitions

## New!
Inference Gateway has partnered with vLLM to accelerate LLM serving optimizations with [llm-d](https://llm-d.ai/blog/llm-d-announce)!

## Concepts and Definitions

The following are some key industry terms that are important to understand for
The following are terms specific to this project:

- **Inference Gateway (IGW)**: A proxy/load-balancer which has been coupled with an
`Endpoint Picker`. It provides optimized routing and load balancing for
serving Kubernetes self-hosted generative Artificial Intelligence (AI)
workloads. It simplifies the deployment, management, and observability of AI
inference workloads.
- **Inference Scheduler**: An extendable component that makes decisions about which endpoint is optimal (best cost /
best performance) for an inference request based on `Metrics and Capabilities`
from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
- **Metrics and Capabilities**: Data provided by model serving platforms about
performance, availability and capabilities to optimize routing. Includes
things like [Prefix Cache] status or [LoRA Adapters] availability.
- **Endpoint Picker(EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).


The following are key industry terms that are important to understand for
this project:

- **Model**: A generative AI model that has learned patterns from data and is
Expand All @@ -26,36 +46,22 @@ this project:
(GPUs) that can be attached to Kubernetes nodes to speed up computations,
particularly for training and inference tasks.

And the following are more specific terms to this project:

- **Scheduler**: Makes decisions about which endpoint is optimal (best cost /
best performance) for an inference request based on `Metrics and Capabilities`
from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
- **Metrics and Capabilities**: Data provided by model serving platforms about
performance, availability and capabilities to optimize routing. Includes
things like [Prefix Cache] status or [LoRA Adapters] availability.
- **Endpoint Selector**: A `Scheduler` combined with `Metrics and Capabilities`
systems is often referred to together as an [Endpoint Selection Extension]
(this is also sometimes referred to as an "endpoint picker", or "EPP").
- **Inference Gateway**: A proxy/load-balancer which has been coupled with a
`Endpoint Selector`. It provides optimized routing and load balancing for
serving Kubernetes self-hosted generative Artificial Intelligence (AI)
workloads. It simplifies the deployment, management, and observability of AI
inference workloads.

For deeper insights and more advanced concepts, refer to our [proposals](/docs/proposals).

[Inference]:https://www.digitalocean.com/community/tutorials/llm-inference-optimization
[Gateway API]:https://github.com/kubernetes-sigs/gateway-api
[Prefix Cache]:https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html
[LoRA Adapters]:https://docs.vllm.ai/en/stable/features/lora.html
[Endpoint Selection Extension]:https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension
[External Processing]:https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter



## Technical Overview

This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.
This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **[inference gateway]** - supporting inference platform teams self-hosting Generative Models (with a current focus on large language models) on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.

The inference gateway:
The Inference Gateway:

* Improves the tail latency and throughput of LLM completion requests against Kubernetes-hosted model servers using an extensible request scheduling algorithm that is kv-cache and request cost aware, avoiding evictions or queueing as load increases
* Provides [Kubernetes-native declarative APIs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/api-overview/) to route client model names to use-case specific LoRA adapters and control incremental rollout of new adapter versions, A/B traffic splitting, and safe blue-green base model and model server upgrades
Expand All @@ -64,7 +70,14 @@ The inference gateway:

![Architecture Diagram](./docs/inference-gateway-architecture.svg)

It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon.
### Model Server Integration

IGW’s pluggable architecture was leveraged to enable the [llm-d Inference Scheduler](https://github.com/llm-d/llm-d-inference-scheduler).

Llm-d customizes vLLM & IGW to create a disaggregated serving solution. We've worked closely with this team to enable this integration. IGW will continue to work closely with llm-d to generalize the disaggregated serving plugin(s), & set a standard for disaggregated serving to be used across any [protocol-adherent](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) model server.

IGW has enhanced support for vLLM via llm-d, and broad support for any model servers implementing the protocol. More details can be found in [model server integration](https://gateway-api-inference-extension.sigs.k8s.io/implementations/model-servers/).


## Status

Expand Down Expand Up @@ -97,7 +110,7 @@ Follow this [README](./test/e2e/epp/README.md) to learn more about running the i

Our community meeting is weekly at Thursday 10AM PDT ([Zoom](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://www.google.com/url?q=https://docs.google.com/document/d/1frfPE5L1sI3737rdQV04IcDGeOcGJj2ItjMg6z2SRH0/edit?usp%3Dsharing&sa=D&source=calendar&usd=2&usg=AOvVaw1pUVy7UN_2PMj8qJJcFm1U)).

We currently utilize the [#wg-serving](https://kubernetes.slack.com/?redir=%2Fmessages%2Fwg-serving) slack channel for communications.
We currently utilize the [#gateway-api-inference-extension](https://kubernetes.slack.com/?redir=%2Fmessages%2Fgateway-api-inference-extension) channel in the Kubernetes Slack workspace for communications.

Contributions are readily welcomed, follow the [dev guide](./docs/dev.md) to start contributing!

Expand Down
4 changes: 2 additions & 2 deletions bbr.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ COPY go.mod go.sum ./
RUN go mod download

# Sources
COPY cmd ./cmd
COPY cmd/bbr ./cmd
COPY pkg ./pkg
COPY internal ./internal
WORKDIR /src/cmd/bbr
WORKDIR /src/cmd
RUN go build -o /bbr

## Multistage deploy
Expand Down
4 changes: 3 additions & 1 deletion cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ steps:
- GIT_TAG=$_GIT_TAG
- EXTRA_TAG=$_PULL_BASE_REF
- DOCKER_BUILDX_CMD=/buildx-entrypoint
- GIT_COMMIT_SHA=$COMMIT_SHA
- GIT_COMMIT_SHA=$_PULL_BASE_SHA
- name: gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20240718-5ef92b5c36
entrypoint: make
args:
Expand Down Expand Up @@ -44,5 +44,7 @@ substitutions:
# _PULL_BASE_REF will contain the ref that was pushed to trigger this build -
# a branch like 'main' or 'release-0.2', or a tag like 'v0.2'.
_PULL_BASE_REF: 'main'
# _PULL_BASE_SHA will contain the Git SHA of the commit that was pushed to trigger this build.
_PULL_BASE_SHA: 'abcdef'
options:
substitution_option: ALLOW_LOOSE
84 changes: 16 additions & 68 deletions cmd/bbr/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,23 @@ package main

import (
"flag"
"net"
"net/http"
"fmt"
"os"
"strconv"

"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus/promhttp"
uberzap "go.uber.org/zap"
"go.uber.org/zap/zapcore"
"google.golang.org/grpc"
healthPb "google.golang.org/grpc/health/grpc_health_v1"
"k8s.io/client-go/rest"
"k8s.io/component-base/metrics/legacyregistry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
"sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics"
runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

Expand Down Expand Up @@ -85,7 +82,18 @@ func run() error {
return err
}

mgr, err := ctrl.NewManager(cfg, ctrl.Options{})
metrics.Register()

// Register metrics handler.
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
// More info:
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
// - https://book.kubebuilder.io/reference/metrics.html
metricsServerOptions := metricsserver.Options{
BindAddress: fmt.Sprintf(":%d", *metricsPort),
FilterProvider: filters.WithAuthenticationAndAuthorization,
}
mgr, err := ctrl.NewManager(cfg, ctrl.Options{Metrics: metricsServerOptions})
if err != nil {
setupLog.Error(err, "Failed to create manager", "config", cfg)
return err
Expand All @@ -107,11 +115,6 @@ func run() error {
return err
}

// Register metrics handler.
if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil {
return err
}

// Start the manager. This blocks until a signal is received.
setupLog.Info("Manager starting")
if err := mgr.Start(ctx); err != nil {
Expand Down Expand Up @@ -152,58 +155,3 @@ func initLogging(opts *zap.Options) {
logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller()))
ctrl.SetLogger(logger)
}

const metricsEndpoint = "/metrics"

// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager.
// It registers the BBR Prometheus collectors, wraps the /metrics endpoint with
// kube-style authentication/authorization, and ties the HTTP server's lifecycle
// to the manager so it starts and stops with the other runnables.
// Returns an error if the auth-wrapped handler cannot be built or the server
// cannot be added to the manager.
func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error {
metrics.Register()

// Init HTTP server.
// Build the /metrics handler guarded by authn/authz derived from the rest.Config.
h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg)
if err != nil {
return err
}

mux := http.NewServeMux()
mux.Handle(metricsEndpoint, h)

srv := &http.Server{
// Empty host binds on all interfaces at the given port.
Addr: net.JoinHostPort("", strconv.Itoa(port)),
Handler: mux,
}

// manager.Server runs the HTTP server as a manager Runnable, giving it
// graceful shutdown when the manager's context is cancelled.
if err := mgr.Add(&manager.Server{
Name: "metrics",
Server: srv,
}); err != nil {
setupLog.Error(err, "Failed to register metrics HTTP handler")
return err
}
return nil
}

// metricsHandlerWithAuthenticationAndAuthorization returns an http.Handler that
// serves Prometheus metrics from the legacy registry, wrapped with
// controller-runtime's authentication/authorization filter so that only
// authorized clients can scrape the endpoint.
// It builds an HTTP client from the provided rest.Config for the filter's
// token/subject-access reviews. Errors at each construction step are logged
// via setupLog and returned to the caller.
func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) {
// Base handler: exposes everything gathered by the legacy (global) registry.
h := promhttp.HandlerFor(
legacyregistry.DefaultGatherer,
promhttp.HandlerOpts{},
)
httpClient, err := rest.HTTPClientFor(cfg)
if err != nil {
setupLog.Error(err, "Failed to create http client for metrics auth")
return nil, err
}

// filters.WithAuthenticationAndAuthorization yields a filter that performs
// TokenReview/SubjectAccessReview against the cluster for each request.
filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient)
if err != nil {
setupLog.Error(err, "Failed to create metrics filter for auth")
return nil, err
}
metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", metricsEndpoint)
// Wrap the base handler with the auth filter; requests failing authn/authz
// are rejected before reaching the Prometheus handler.
metricsAuthHandler, err := filter(metricsLogger, h)
if err != nil {
setupLog.Error(err, "Failed to create metrics auth handler")
return nil, err
}
return metricsAuthHandler, nil
}
Loading