
Commit e3f2eb7

Update lora affinity to be a scorer.
1 parent 7f5ccbf commit e3f2eb7

4 files changed: +274 −0 lines changed


cmd/epp/runner/runner.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -377,6 +377,7 @@ func (r *Runner) registerInTreePlugins() {
 	plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory)
 	plugins.Register(scorer.KvCacheScorerType, scorer.KvCacheScorerFactory)
 	plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
+	plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory)
 	// register filter for test purpose only (used in conformance tests)
 	plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
 }
```
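The added line follows the same register-a-factory pattern as the existing in-tree scorers. As a rough illustration of what that hook expects, here is a hedged sketch of an additional, purely hypothetical scorer registered the same way; the names `MyScorer`, `MyScorerType`, and `MyScorerFactory` are not part of the repository, and the import paths and signatures are taken from the new scorer file below.

```go
// Hypothetical sketch only: a made-up scorer registered via the same
// plugins.Register(type, factory) hook this commit uses for LoraAffinityScorer.
package main

import (
	"context"
	"encoding/json"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

const MyScorerType = "my-scorer" // hypothetical type name

// MyScorer is an illustrative scorer that gives every pod the same score.
type MyScorer struct {
	tn plugins.TypedName
}

var _ framework.Scorer = &MyScorer{} // compile-time assertion, as in the commit

func (s *MyScorer) TypedName() plugins.TypedName { return s.tn }

func (s *MyScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
	scores := make(map[types.Pod]float64, len(pods))
	for _, pod := range pods {
		scores[pod] = 0.5 // placeholder scoring logic
	}
	return scores
}

// MyScorerFactory matches the factory signature used by LoraAffinityScorerFactory.
func MyScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
	return &MyScorer{tn: plugins.TypedName{Type: MyScorerType, Name: name}}, nil
}

func main() {
	// In the runner this call would sit alongside the registrations above.
	plugins.Register(MyScorerType, MyScorerFactory)
}
```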
Lines changed: 90 additions & 0 deletions
```go
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scorer

import (
	"context"
	"encoding/json"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

const (
	DefaultLoraAffinityScorerWeight = 1
	LoraAffinityScorerType          = "lora-affinity"
)

// compile-time type assertion
var _ framework.Scorer = &LoraAffinityScorer{}

// LoraAffinityScorerFactory defines the factory function for LoraAffinityScorer.
func LoraAffinityScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
	return NewLoraAffinityScorer().WithName(name), nil
}

// NewLoraAffinityScorer initializes a new LoraAffinityScorer and returns its pointer.
func NewLoraAffinityScorer() *LoraAffinityScorer {
	return &LoraAffinityScorer{
		tn: plugins.TypedName{Type: LoraAffinityScorerType, Name: LoraAffinityScorerType},
	}
}

// LoraAffinityScorer scores list of candidate pods based on Lora affinity and availability.
type LoraAffinityScorer struct {
	tn plugins.TypedName
}

// TypedName returns the type and name tuple of this plugin instance.
func (s *LoraAffinityScorer) TypedName() plugins.TypedName {
	return s.tn
}

// WithName sets the name of the scorer.
func (s *LoraAffinityScorer) WithName(name string) *LoraAffinityScorer {
	s.tn.Name = name
	return s
}

func (s *LoraAffinityScorer) Score(_ context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
	scores := make(map[types.Pod]float64, len(pods))

	// Assign a score to each pod for loading the target adapter.
	for _, pod := range pods {
		_, active := pod.GetMetrics().ActiveModels[request.TargetModel]
		_, waiting := pod.GetMetrics().WaitingModels[request.TargetModel]

		// Determine the model server's suitability score based on adapter load status and capacity.
		switch {
		// Ideal: The adapter is already active on this model server.
		case active:
			scores[pod] = 1.0
		// Good: The model server has capacity to load at least one more adapter.
		case len(pod.GetMetrics().ActiveModels)+len(pod.GetMetrics().WaitingModels) < pod.GetMetrics().MaxActiveModels:
			scores[pod] = 0.8
		// Moderate: The adapter is already in the queue to be loaded on this model server.
		case waiting:
			scores[pod] = 0.6
		// Unsuitable: The model server has reached its maximum capacity and cannot load the adapter.
		default:
			scores[pod] = 0.0
		}
	}

	return scores
}
```
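One detail worth noting in `Score` is the ordering of the switch: spare adapter capacity is checked before the waiting state, so a pod whose target adapter is only queued for loading still scores 0.8 rather than 0.6 as long as it has room for another adapter (this is the pod2 vs. pod4 distinction in the test file that follows). Below is a standalone sketch of the same tier logic over simplified pod state, not the repository's types:

```go
package main

import "fmt"

// podState is a simplified stand-in for the pod metrics the scorer reads.
type podState struct {
	name            string
	active, waiting map[string]int
	maxActive       int
}

// tierScore mirrors the switch in LoraAffinityScorer.Score.
func tierScore(p podState, targetModel string) float64 {
	_, active := p.active[targetModel]
	_, waiting := p.waiting[targetModel]
	switch {
	case active: // adapter already active
		return 1.0
	case len(p.active)+len(p.waiting) < p.maxActive: // room to load one more adapter
		return 0.8
	case waiting: // adapter queued, but no spare capacity
		return 0.6
	default: // at capacity, adapter not present
		return 0.0
	}
}

func main() {
	pods := []podState{
		{"adapter-active", map[string]int{"lora-a": 1}, map[string]int{}, 2},
		{"waiting-with-room", map[string]int{}, map[string]int{"lora-a": 1}, 4},
		{"waiting-at-capacity", map[string]int{"lora-b": 1}, map[string]int{"lora-a": 1}, 2},
		{"full", map[string]int{"lora-b": 1, "lora-c": 1}, map[string]int{}, 2},
	}
	for _, p := range pods {
		fmt.Printf("%s: %.1f\n", p.name, tierScore(p, "lora-a"))
	}
	// Prints 1.0, 0.8, 0.6, 0.0 respectively.
}
```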
Lines changed: 173 additions & 0 deletions
```go
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scorer

import (
	"context"
	"testing"

	"github.com/stretchr/testify/assert"
	k8stypes "k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

func TestLoraAffinityScorer(t *testing.T) {
	tests := []struct {
		name              string
		request           *types.LLMRequest
		pods              []types.Pod
		expectedScoresPod map[string]float64 // Map of pod name to expected score
	}{
		{
			name:    "Target model is active",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-1": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 5,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 1.0,
			},
		},
		{
			name:    "Target model is waiting",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 2},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 2,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 0.6,
			},
		},
		{
			name:    "Pods have no space for new model",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 2},
						WaitingModels:   map[string]int{"active-model-3": 1},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 0,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 0.0,
				"pod2": 0.0,
			},
		},
		{
			name:    "Multiple pods with mixed active and waiting models",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-1": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 5,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 4},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 5,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod4"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-3": 1},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod5"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-4": 1, "active-model-5": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 2,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 1.0,
				"pod2": 0.8,
				"pod3": 0.8,
				"pod4": 0.6,
				"pod5": 0.0,
			},
		},
		{
			name:              "Empty pods slice",
			request:           &types.LLMRequest{TargetModel: "modelA"},
			pods:              []types.Pod{},
			expectedScoresPod: map[string]float64{}, // No pods, no scores
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			scorer := &LoraAffinityScorer{}
			scores := scorer.Score(context.Background(), types.NewCycleState(), test.request, test.pods)

			for _, pod := range test.pods {
				expectedScore, ok := test.expectedScoresPod[pod.GetPod().NamespacedName.Name]
				if !ok {
					t.Fatalf("Expected score not found for pod %s in test %s", pod.GetPod().NamespacedName, test.name)
				}
				assert.InDelta(t, expectedScore, scores[pod], 0.0001, "Pod %s should have score %f", pod.GetPod().NamespacedName.Name, expectedScore)
			}
			assert.Len(t, scores, len(test.expectedScoresPod), "Number of scored pods should match expected")
		})
	}
}
```

site-src/guides/epp-configuration/config-text.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -253,3 +253,13 @@ available to serve new request).
 
 - *Type*: queue-scorer
 - *Parameters*: none
+
+
+#### **LoraAffinityScorer**
+
+Scores the list of candidate pods based on the LoRA adapters loaded on each pod.
+Pods that already have the target adapter loaded, or that have capacity to load it,
+are scored higher (since they are more readily available to serve the request).
+
+- *Type*: lora-affinity
+- *Parameters*: none
```