
Commit e3f2eb7

Update lora affinity to be a scorer.
1 parent 7f5ccbf commit e3f2eb7

4 files changed: +274 −0 lines changed


cmd/epp/runner/runner.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -377,6 +377,7 @@ func (r *Runner) registerInTreePlugins() {
 	plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory)
 	plugins.Register(scorer.KvCacheScorerType, scorer.KvCacheScorerFactory)
 	plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
+	plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory)
 	// register filter for test purpose only (used in conformance tests)
 	plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
 }
```
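The added line follows the same register-a-factory pattern as the existing in-tree scorers. As a rough illustration of what that hook expects, here is a hedged sketch of an additional, purely hypothetical scorer registered the same way; the names `MyScorer`, `MyScorerType`, and `MyScorerFactory` are not part of the repository, and the import paths and signatures are taken from the new scorer file below.

```go
// Hypothetical sketch only: a made-up scorer registered via the same
// plugins.Register(type, factory) hook this commit uses for LoraAffinityScorer.
package main

import (
	"context"
	"encoding/json"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

const MyScorerType = "my-scorer" // hypothetical type name

// MyScorer is an illustrative scorer that gives every pod the same score.
type MyScorer struct {
	tn plugins.TypedName
}

var _ framework.Scorer = &MyScorer{} // compile-time assertion, as in the commit

func (s *MyScorer) TypedName() plugins.TypedName { return s.tn }

func (s *MyScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
	scores := make(map[types.Pod]float64, len(pods))
	for _, pod := range pods {
		scores[pod] = 0.5 // placeholder scoring logic
	}
	return scores
}

// MyScorerFactory matches the factory signature used by LoraAffinityScorerFactory.
func MyScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
	return &MyScorer{tn: plugins.TypedName{Type: MyScorerType, Name: name}}, nil
}

func main() {
	// In the runner this call would sit alongside the registrations above.
	plugins.Register(MyScorerType, MyScorerFactory)
}
```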
Lines changed: 90 additions & 0 deletions
```go
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scorer

import (
	"context"
	"encoding/json"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

const (
	DefaultLoraAffinityScorerWeight = 1
	LoraAffinityScorerType          = "lora-affinity"
)

// compile-time type assertion
var _ framework.Scorer = &LoraAffinityScorer{}

// LoraAffinityScorerFactory defines the factory function for LoraAffinityScorer.
func LoraAffinityScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
	return NewLoraAffinityScorer().WithName(name), nil
}

// NewLoraAffinityScorer initializes a new LoraAffinityScorer and returns its pointer.
func NewLoraAffinityScorer() *LoraAffinityScorer {
	return &LoraAffinityScorer{
		tn: plugins.TypedName{Type: LoraAffinityScorerType, Name: LoraAffinityScorerType},
	}
}

// LoraAffinityScorer scores list of candidate pods based on Lora affinity and availability.
type LoraAffinityScorer struct {
	tn plugins.TypedName
}

// TypedName returns the type and name tuple of this plugin instance.
func (s *LoraAffinityScorer) TypedName() plugins.TypedName {
	return s.tn
}

// WithName sets the name of the scorer.
func (s *LoraAffinityScorer) WithName(name string) *LoraAffinityScorer {
	s.tn.Name = name
	return s
}

func (s *LoraAffinityScorer) Score(_ context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
	scores := make(map[types.Pod]float64, len(pods))

	// Assign a score to each pod for loading the target adapter.
	for _, pod := range pods {
		_, active := pod.GetMetrics().ActiveModels[request.TargetModel]
		_, waiting := pod.GetMetrics().WaitingModels[request.TargetModel]

		// Determine the model server's suitability score based on adapter load status and capacity.
		switch {
		// Ideal: The adapter is already active on this model server.
		case active:
			scores[pod] = 1.0
		// Good: The model server has capacity to load at least one more adapter.
		case len(pod.GetMetrics().ActiveModels)+len(pod.GetMetrics().WaitingModels) < pod.GetMetrics().MaxActiveModels:
			scores[pod] = 0.8
		// Moderate: The adapter is already in the queue to be loaded on this model server.
		case waiting:
			scores[pod] = 0.6
		// Unsuitable: The model server has reached its maximum capacity and cannot load the adapter.
		default:
			scores[pod] = 0.0
		}
	}

	return scores
}
```
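One detail worth noting in `Score` is the ordering of the switch: spare adapter capacity is checked before the waiting state, so a pod whose target adapter is only queued for loading still scores 0.8 rather than 0.6 as long as it has room for another adapter (this is the pod2 vs. pod4 distinction in the test file that follows). Below is a standalone sketch of the same tier logic over simplified pod state, not the repository's types:

```go
package main

import "fmt"

// podState is a simplified stand-in for the pod metrics the scorer reads.
type podState struct {
	name            string
	active, waiting map[string]int
	maxActive       int
}

// tierScore mirrors the switch in LoraAffinityScorer.Score.
func tierScore(p podState, targetModel string) float64 {
	_, active := p.active[targetModel]
	_, waiting := p.waiting[targetModel]
	switch {
	case active: // adapter already active
		return 1.0
	case len(p.active)+len(p.waiting) < p.maxActive: // room to load one more adapter
		return 0.8
	case waiting: // adapter queued, but no spare capacity
		return 0.6
	default: // at capacity, adapter not present
		return 0.0
	}
}

func main() {
	pods := []podState{
		{"adapter-active", map[string]int{"lora-a": 1}, map[string]int{}, 2},
		{"waiting-with-room", map[string]int{}, map[string]int{"lora-a": 1}, 4},
		{"waiting-at-capacity", map[string]int{"lora-b": 1}, map[string]int{"lora-a": 1}, 2},
		{"full", map[string]int{"lora-b": 1, "lora-c": 1}, map[string]int{}, 2},
	}
	for _, p := range pods {
		fmt.Printf("%s: %.1f\n", p.name, tierScore(p, "lora-a"))
	}
	// Prints 1.0, 0.8, 0.6, 0.0 respectively.
}
```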
Lines changed: 173 additions & 0 deletions
```go
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scorer

import (
	"context"
	"testing"

	"github.com/stretchr/testify/assert"
	k8stypes "k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

func TestLoraAffinityScorer(t *testing.T) {
	tests := []struct {
		name              string
		request           *types.LLMRequest
		pods              []types.Pod
		expectedScoresPod map[string]float64 // Map of pod name to expected score
	}{
		{
			name:    "Target model is active",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-1": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 5,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 1.0,
			},
		},
		{
			name:    "Target model is waiting",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 2},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 2,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 0.6,
			},
		},
		{
			name:    "Pods have no space for new model",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 2},
						WaitingModels:   map[string]int{"active-model-3": 1},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 0,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 0.0,
				"pod2": 0.0,
			},
		},
		{
			name:    "Multiple pods with mixed active and waiting models",
			request: &types.LLMRequest{TargetModel: "active-model-1"},
			pods: []types.Pod{
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-1": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 5,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 4},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 5,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-2": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod4"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-3": 1},
						WaitingModels:   map[string]int{"active-model-1": 1},
						MaxActiveModels: 2,
					},
				},
				&types.PodMetrics{
					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod5"}},
					MetricsState: &backendmetrics.MetricsState{
						ActiveModels:    map[string]int{"active-model-4": 1, "active-model-5": 1},
						WaitingModels:   map[string]int{},
						MaxActiveModels: 2,
					},
				},
			},
			expectedScoresPod: map[string]float64{
				"pod1": 1.0,
				"pod2": 0.8,
				"pod3": 0.8,
				"pod4": 0.6,
				"pod5": 0.0,
			},
		},
		{
			name:              "Empty pods slice",
			request:           &types.LLMRequest{TargetModel: "modelA"},
			pods:              []types.Pod{},
			expectedScoresPod: map[string]float64{}, // No pods, no scores
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			scorer := &LoraAffinityScorer{}
			scores := scorer.Score(context.Background(), types.NewCycleState(), test.request, test.pods)

			for _, pod := range test.pods {
				expectedScore, ok := test.expectedScoresPod[pod.GetPod().NamespacedName.Name]
				if !ok {
					t.Fatalf("Expected score not found for pod %s in test %s", pod.GetPod().NamespacedName, test.name)
				}
				assert.InDelta(t, expectedScore, scores[pod], 0.0001, "Pod %s should have score %f", pod.GetPod().NamespacedName.Name, expectedScore)
			}
			assert.Len(t, scores, len(test.expectedScoresPod), "Number of scored pods should match expected")
		})
	}
}
```

site-src/guides/epp-configuration/config-text.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -253,3 +253,13 @@ available to serve new request).
 
 - *Type*: queue-scorer
 - *Parameters*: none
+
+
+#### **LoraAffinityScorer**
+
+Scores the list of candidate pods based on the LoRA adapters loaded on each pod.
+Pods that already have the target adapter loaded, or that have capacity to load it,
+are scored higher (since they are more readily available to serve the request).
+
+- *Type*: lora-affinity
+- *Parameters*: none
```