Skip to content

Commit 190f08b

Browse files
authored
Merge pull request #8430 from vitanovs/feat/vpa-updater-failed-evictions-counter-metric
feat(vpa/updater): Add a new counter metric to measure the total number of failed Pod eviction attempts
2 parents a7df3cf + cc7a83c commit 190f08b

File tree

3 files changed

+64
-1
lines changed

3 files changed

+64
-1
lines changed

vertical-pod-autoscaler/pkg/updater/logic/updater.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ func (u *updater) RunOnce(ctx context.Context) {
316316
evictErr := evictionLimiter.Evict(pod, vpa, u.eventRecorder)
317317
if evictErr != nil {
318318
klog.V(0).InfoS("Eviction failed", "error", evictErr, "pod", klog.KObj(pod))
319+
metrics_updater.RecordFailedEviction(vpaSize, updateMode, "EvictionError")
319320
} else {
320321
withEvicted = true
321322
metrics_updater.AddEvictedPod(vpaSize, updateMode)

vertical-pod-autoscaler/pkg/utils/metrics/updater/updater.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@ var (
8484
}, []string{"vpa_size_log2", "update_mode"},
8585
)
8686

87+
// failedEvictionAttempts counts failed attempts to update Pods by eviction.
// Each series is keyed by the log2 bucket of the VPA size, the update mode
// and the reason for the failure.
failedEvictionAttempts = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: metricsNamespace,
		Name:      "failed_eviction_attempts_total",
		Help:      "Number of failed attempts to update Pods by eviction",
	},
	[]string{"vpa_size_log2", "update_mode", "reason"},
)
94+
8795
inPlaceUpdatableCount = prometheus.NewGaugeVec(
8896
prometheus.GaugeOpts{
8997
Namespace: metricsNamespace,
@@ -130,7 +138,21 @@ var (
130138

131139
// Register initializes all metrics for VPA Updater
132140
func Register() {
133-
prometheus.MustRegister(controlledCount, evictableCount, evictedCount, vpasWithEvictablePodsCount, vpasWithEvictedPodsCount, inPlaceUpdatableCount, inPlaceUpdatedCount, vpasWithInPlaceUpdatablePodsCount, vpasWithInPlaceUpdatedPodsCount, failedInPlaceUpdateAttempts, functionLatency)
141+
collectors := []prometheus.Collector{
142+
controlledCount,
143+
evictableCount,
144+
evictedCount,
145+
vpasWithEvictablePodsCount,
146+
vpasWithEvictedPodsCount,
147+
failedEvictionAttempts,
148+
inPlaceUpdatableCount,
149+
inPlaceUpdatedCount,
150+
vpasWithInPlaceUpdatablePodsCount,
151+
vpasWithInPlaceUpdatedPodsCount,
152+
failedInPlaceUpdateAttempts,
153+
functionLatency,
154+
}
155+
prometheus.MustRegister(collectors...)
134156
}
135157

136158
// NewExecutionTimer provides a timer for Updater's RunOnce execution
@@ -183,6 +205,12 @@ func AddEvictedPod(vpaSize int, mode vpa_types.UpdateMode) {
183205
evictedCount.WithLabelValues(strconv.Itoa(log2), string(mode)).Inc()
184206
}
185207

208+
// RecordFailedEviction increases the counter of failed eviction attempts by given VPA size, update mode and reason
209+
func RecordFailedEviction(vpaSize int, mode vpa_types.UpdateMode, reason string) {
210+
log2 := metrics.GetVpaSizeLog2(vpaSize)
211+
failedEvictionAttempts.WithLabelValues(strconv.Itoa(log2), string(mode), reason).Inc()
212+
}
213+
186214
// NewInPlaceUpdatablePodsCounter returns a wrapper for counting Pods which are matching in-place update criteria
187215
func NewInPlaceUpdatablePodsCounter() *SizeBasedGauge {
188216
return newSizeBasedGauge(inPlaceUpdatableCount)

vertical-pod-autoscaler/pkg/utils/metrics/updater/updater_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,40 @@ func TestAddEvictedPod(t *testing.T) {
5858
}
5959
}
6060

61+
func TestRecordFailedEviction(t *testing.T) {
62+
testCases := []struct {
63+
desc string
64+
vpaSize int
65+
mode vpa_types.UpdateMode
66+
reason string
67+
log2 string
68+
}{
69+
{
70+
desc: "VPA size 2, some reason",
71+
vpaSize: 2,
72+
reason: "some_reason",
73+
log2: "1",
74+
},
75+
{
76+
desc: "VPA size 20, another reason",
77+
vpaSize: 20,
78+
reason: "another_reason",
79+
log2: "4",
80+
},
81+
}
82+
83+
for _, tc := range testCases {
84+
t.Run(tc.desc, func(t *testing.T) {
85+
t.Cleanup(failedEvictionAttempts.Reset)
86+
RecordFailedEviction(tc.vpaSize, tc.mode, tc.reason)
87+
val := testutil.ToFloat64(failedEvictionAttempts.WithLabelValues(tc.log2, string(tc.mode), tc.reason))
88+
if val != 1 {
89+
t.Errorf("Unexpected value for FailedEviction metric with labels (%s, %s): got %v, want 1", tc.log2, tc.reason, val)
90+
}
91+
})
92+
}
93+
}
94+
6195
func TestAddInPlaceUpdatedPod(t *testing.T) {
6296
testCases := []struct {
6397
desc string

0 commit comments

Comments
 (0)