LukeAVanDrie
diff --git a/cmd/epp/main.go b/cmd/epp/main.go
Lines changed: 28 additions & 0 deletions
diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go
Lines changed: 0 additions & 7 deletions
diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go
Lines changed: 39 additions & 4 deletions
diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
Lines changed: 43 additions & 5 deletions
diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
Lines changed: 43 additions & 7 deletions
diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go
Lines changed: 0 additions & 5 deletions
@@ -40,6 +40,7 @@ import (
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
@@ -102,6 +103,27 @@ var (
 	loraInfoMetric = flag.String("loraInfoMetric",
 		"vllm:lora_requests_info",
 		"Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
+	// Scheduling queue flags; run() copies these values into scheduling.QueueConfig.
+	totalQueueCapacity = flag.Uint64(
+		"totalQueueCapacity",
+		scheduling.DefaultTotalQueueCapacity,
+		"Total capacity (in bytes) of the queue across all models and criticality bands.",
+	)
+	modelQueueCapacity = flag.Uint64(
+		"modelQueueCapacity",
+		scheduling.DefaultModelQueueCapacity,
+		"Capacity (in bytes) of the per-model queues.",
+	)
+	queueTTL = flag.Duration(
+		"queueTTL",
+		scheduling.DefaultQueueTTL,
+		"TTL for requests in the queue.",
+	)
+	expiryCleanupInterval = flag.Duration(
+		"expiryCleanupInterval",
+		scheduling.DefaultExpiryCleanupInterval,
+		"Interval for cleaning up expired requests from the queue.",
+	)
 
 	setupLog = ctrl.Log.WithName("setup")
 )
@@ -180,6 +202,12 @@ func run() error {
 		CertPath:                                 *certPath,
 		UseStreaming:                             useStreamingServer,
 		RefreshPrometheusMetricsInterval:         *refreshPrometheusMetricsInterval,
+		QueueConfig: scheduling.QueueConfig{ // values are the scheduling flags, dereferenced as-is
+			TotalQueueCapacity:    *totalQueueCapacity,
+			ModelQueueCapacity:    *modelQueueCapacity,
+			QueueTTL:              *queueTTL,
+			ExpiryCleanupInterval: *expiryCleanupInterval,
+		},
 	}
 	if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {
 		setupLog.Error(err, "Failed to setup ext-proc controllers")
 
@@ -302,13 +302,6 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelV
 	return outMap
 }
 
-func IsCritical(model *v1alpha2.InferenceModel) bool {
-	if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical {
-		return true
-	}
-	return false
-}
-
 // TODO: move out to share with pod_reconciler.go
 func podIsReady(pod *corev1.Pod) bool {
 	for _, condition := range pod.Status.Conditions {
 
@@ -19,14 +19,15 @@ package handlers
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"strconv"
 
 	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	"google.golang.org/protobuf/types/known/structpb"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -77,7 +78,14 @@ func (s *Server) HandleRequestBody(
 	llmReq := &schedulingtypes.LLMRequest{
 		Model:               model,
 		ResolvedTargetModel: modelName,
-		Critical:            datastore.IsCritical(modelObj),
 	}
+	// Spec.Criticality is an optional pointer (the IsCritical helper this replaces
+	// nil-checked it), so dereferencing it unconditionally panics for models that
+	// leave it unset. Copy it only when present.
+	// NOTE(review): an unset value leaves llmReq.Criticality at its zero value;
+	// confirm the scheduler treats that as the default (Standard) band.
+	if criticality := modelObj.Spec.Criticality; criticality != nil {
+		llmReq.Criticality = *criticality
+	}
 	loggerVerbose.Info("LLM request assembled", "request", llmReq)
 
@@ -94,9 +95,55 @@ func (s *Server) HandleRequestBody(
 		loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody))
 	}
 
-	target, err := s.scheduler.Schedule(ctx, llmReq)
+	// newSchedulableRequestFromContext reads the context and LLM request from
+	// reqCtx and rejects the call when either is missing. No hunk of this change
+	// assigns those fields, so pass along the same values the replaced
+	// Schedule(ctx, llmReq) call received.
+	if reqCtx != nil {
+		if reqCtx.Context == nil {
+			reqCtx.Context = ctx
+		}
+		reqCtx.Request = *llmReq
+	}
+	// NOTE(review): a zero reqCtx.RequestSize is rejected too; confirm it is
+	// already set by the time this handler runs.
+	schedulableReq, err := newSchedulableRequestFromContext(reqCtx)
 	if err != nil {
-		return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
+		return nil, errutil.Error{Code: errutil.Internal, Msg: err.Error()}
+	}
+	target, evictionReason, err := s.queueController.Schedule(schedulableReq)
+	if err != nil {
+		logger.Error(err, "Failed to schedule request", "evictionReason", evictionReason.String())
+		switch {
+		case errors.Is(err, scheduling.ErrEvicted):
+			// Handle eviction errors, including the eviction reason.
+			switch evictionReason {
+			case scheduling.ReasonTTLExpiry:
+				return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to TTL expiry: %v", err)}
+			case scheduling.ReasonExternalContextExpiry:
+				// TODO: determine if this is an appropriate code. For expiry due to
+				// gateway timeout, I think it makes sense. For manual cancellation, I
+				// am not certain.
+				return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to external context expiry: %v", err)}
+			case scheduling.ReasonPreempted:
+				return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to preemption: %v", err)}
+			case scheduling.ReasonCannotFindBackend:
+				return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to failure to find a suitable backend: %v", err)}
+			default:
+				return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted for unknown reason: %v", err)}
+			}
+		case errors.Is(err, scheduling.ErrModelAtCapacity):
+			return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("model at capacity: %v", err)}
+		case errors.Is(err, scheduling.ErrCannotFindBackend):
+			return nil, errutil.Error{Code: errutil.Unknown, Msg: fmt.Sprintf("cannot find suitable backend for non-pool exhaustion reason: %v", err)}
+		default:
+			// Handle other errors.
+			return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("failed to schedule request: %v", err)}
+		}
+	}
+	if target == nil || target.GetPod() == nil {
+		// This should be unreachable.
+		return nil, errutil.Error{Code: errutil.Internal, Msg: "target pod is nil, request was likely evicted"}
 	}
 	targetPod := target.GetPod()
 
 
@@ -18,6 +18,7 @@ package handlers
 
 import (
 	"context"
+	"fmt"
 	"io"
 	"time"
 
@@ -28,14 +29,16 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server {
+// NewServer returns a Server that schedules requests through queueController.
+func NewServer(queueController QueueController, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server {
 	return &Server{
-		scheduler:                                scheduler,
+		queueController:                          queueController,
 		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
 		destinationEndpointHintKey:               destinationEndpointHintKey,
 		datastore:                                datastore,
@@ -45,7 +47,7 @@ func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, de
 // Server implements the Envoy external processing server.
 // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
 type Server struct {
-	scheduler Scheduler
+	queueController QueueController // HandleRequestBody calls its Schedule method to obtain the target pod
 	// The key of the header to specify the target pod address. This value needs to match Envoy
 	// configuration.
 	destinationEndpointHintKey string
@@ -55,8 +57,8 @@ type Server struct {
 	datastore                                datastore.Datastore
 }
 
-type Scheduler interface {
-	Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error)
+type QueueController interface { // scheduling entry point held by both Server and StreamingServer
+	Schedule(req scheduling.SchedulableRequest) (targetPod schedulingtypes.Pod, evictionReason scheduling.EvictionReason, err error)
 }
 
 func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
@@ -217,12 +219,15 @@ func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
 
 // RequestContext stores context information during the life time of an HTTP request.
 type RequestContext struct {
+	Context                   context.Context    // copied into the schedulable request and returned by its Context()
+	CancelFunc                context.CancelFunc // NOTE(review): presumably cancels Context; nothing in the visible hunks calls it
 	TargetPod                 string
 	TargetEndpoint            string
 	Model                     string
 	ResolvedTargetModel       string
 	RequestReceivedTimestamp  time.Time
 	ResponseCompleteTimestamp time.Time
+	Request                   schedulingtypes.LLMRequest // copied by value into the schedulable request; must have a non-empty Model
 	RequestSize               int
 	Usage                     Usage
 	ResponseSize              int
@@ -254,3 +259,42 @@ const (
 	BodyResponseResponsesComplete    StreamRequestState = 6
 	TrailerResponseResponsesComplete StreamRequestState = 7
 )
+
+// schedulableRequest is the value the handlers pass to QueueController.Schedule
+// as a scheduling.SchedulableRequest: the LLM request plus the size and context
+// captured from the RequestContext.
+type schedulableRequest struct {
+	schedulingtypes.LLMRequest
+	size uint64
+	ctx  context.Context
+}
+
+// Context returns the context the request was created with.
+func (s *schedulableRequest) Context() context.Context {
+	return s.ctx
+}
+
+// Request returns a pointer to the embedded LLM request.
+func (s *schedulableRequest) Request() *schedulingtypes.LLMRequest {
+	return &s.LLMRequest
+}
+
+// Size returns the request size recorded at construction time.
+func (s *schedulableRequest) Size() uint64 {
+	return s.size
+}
+
+// newSchedulableRequestFromContext builds a schedulableRequest from reqCtx,
+// copying only its context, size and LLM request so queued entries stay small.
+// It returns an error if reqCtx is nil, lacks a context or model name, or has a
+// non-positive RequestSize (a negative int would wrap when converted to uint64).
+func newSchedulableRequestFromContext(reqCtx *RequestContext) (*schedulableRequest, error) {
+	if reqCtx == nil || reqCtx.Context == nil || reqCtx.Request.Model == "" || reqCtx.RequestSize <= 0 {
+		return nil, fmt.Errorf("invalid RequestContext")
+	}
+	return &schedulableRequest{
+		ctx:        reqCtx.Context,
+		size:       uint64(reqCtx.RequestSize),
+		LLMRequest: reqCtx.Request,
+	}, nil
+}
@@ -19,6 +19,7 @@ package handlers
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"math/rand"
@@ -37,22 +38,24 @@ import (
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer {
+// NewStreamingServer returns a StreamingServer that schedules requests through queueController.
+func NewStreamingServer(queueController QueueController, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer {
 	return &StreamingServer{
-		scheduler:                                scheduler,
+		queueController:                          queueController,
 		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
 		destinationEndpointHintKey:               destinationEndpointHintKey,
 		datastore:                                datastore,
 	}
 }
 
 type StreamingServer struct {
-	scheduler Scheduler
+	queueController QueueController // HandleRequestBody calls its Schedule method to obtain the target pod
 	// The key of the header to specify the target pod address. This value needs to match Envoy
 	// configuration.
 	destinationEndpointHintKey string
@@ -348,9 +350,16 @@ func (s *StreamingServer) HandleRequestBody(
 	llmReq := &schedulingtypes.LLMRequest{
 		Model:               model,
 		ResolvedTargetModel: modelName,
-		Critical:            datastore.IsCritical(modelObj),
 	}
-	logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical)
+	// Spec.Criticality is an optional pointer (the IsCritical helper this replaces
+	// nil-checked it), so dereferencing it unconditionally panics for models that
+	// leave it unset. Copy it only when present.
+	// NOTE(review): an unset value leaves llmReq.Criticality at its zero value;
+	// confirm the scheduler treats that as the default (Standard) band.
+	if criticality := modelObj.Spec.Criticality; criticality != nil {
+		llmReq.Criticality = *criticality
+	}
+	logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "criticality", llmReq.Criticality)
 
 	var err error
 	// Update target models in the body.
@@ -364,9 +366,55 @@ func (s *StreamingServer) HandleRequestBody(
 		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)}
 	}
 
-	target, err := s.scheduler.Schedule(ctx, llmReq)
+	// newSchedulableRequestFromContext reads the context and LLM request from
+	// reqCtx and rejects the call when either is missing. No hunk of this change
+	// assigns those fields, so pass along the same values the replaced
+	// Schedule(ctx, llmReq) call received.
+	if reqCtx != nil {
+		if reqCtx.Context == nil {
+			reqCtx.Context = ctx
+		}
+		reqCtx.Request = *llmReq
+	}
+	// NOTE(review): a zero reqCtx.RequestSize is rejected too; confirm it is
+	// already set by the time this handler runs.
+	schedulableReq, err := newSchedulableRequestFromContext(reqCtx)
 	if err != nil {
-		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
+		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: err.Error()}
+	}
+	target, evictionReason, err := s.queueController.Schedule(schedulableReq)
+	if err != nil {
+		logger.Error(err, "Failed to schedule request", "evictionReason", evictionReason.String())
+		switch {
+		case errors.Is(err, scheduling.ErrEvicted):
+			// Handle eviction errors, including the eviction reason.
+			switch evictionReason {
+			case scheduling.ReasonTTLExpiry:
+				return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to TTL expiry: %v", err)}
+			case scheduling.ReasonExternalContextExpiry:
+				// TODO: determine if this is an appropriate code. For expiry due to
+				// gateway timeout, I think it makes sense. For manual cancellation, I
+				// am not certain.
+				return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to external context expiry: %v", err)}
+			case scheduling.ReasonPreempted:
+				return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to preemption: %v", err)}
+			case scheduling.ReasonCannotFindBackend:
+				return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted due to failure to find a suitable backend: %v", err)}
+			default:
+				return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("request evicted for unknown reason: %v", err)}
+			}
+		case errors.Is(err, scheduling.ErrModelAtCapacity):
+			return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Sprintf("model at capacity: %v", err)}
+		case errors.Is(err, scheduling.ErrCannotFindBackend):
+			return reqCtx, errutil.Error{Code: errutil.Unknown, Msg: fmt.Sprintf("cannot find suitable backend for non-pool exhaustion reason: %v", err)}
+		default:
+			// Handle other errors.
+			return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("failed to schedule request: %v", err)}
+		}
+	}
+	if target == nil || target.GetPod() == nil {
+		// This should be unreachable.
+		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: "target pod is nil, request was likely evicted"}
 	}
 	targetPod := target.GetPod()
 
 
@@ -160,11 +160,6 @@ func leastQueuingFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*ty
 	return filtered, nil
 }
 
-var lowQueueFilter = &basicFilter{
-	name:   "low queueing filter",
-	filter: toFilterFunc((queueThresholdPredicate(config.QueueingThresholdLoRA))),
-}
-
 var leastKVCacheFilter = &basicFilter{
 	name:   "least KV cache percent",
 	filter: leastKVCacheFilterFunc,