From 8a6b5c98ca1fe5d3e5f90947e60e3cf0d8422f7a Mon Sep 17 00:00:00 2001
From: MenD32
Date: Sat, 24 May 2025 15:09:21 +0300
Subject: [PATCH 1/3] fix: hard topology spread constraints stop scaledown

Signed-off-by: MenD32
---
 cluster-autoscaler/simulator/cluster.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster-autoscaler/simulator/cluster.go b/cluster-autoscaler/simulator/cluster.go
index d568becef49c..4ffc85ced200 100644
--- a/cluster-autoscaler/simulator/cluster.go
+++ b/cluster-autoscaler/simulator/cluster.go
@@ -227,6 +227,7 @@ func (r *RemovalSimulator) findPlaceFor(removedNode string, pods []*apiv1.Pod, n
 			klog.Errorf("Simulating removal of %s/%s return error; %v", pod.Namespace, pod.Name, err)
 		}
 	}
+	r.clusterSnapshot.RemoveNodeInfo(removedNode)
 
 	newpods := make([]*apiv1.Pod, 0, len(pods))
 	for _, podptr := range pods {

From 6d4f8513b479ae0f8ec5cec9da89e605cb7c2630 Mon Sep 17 00:00:00 2001
From: MenD32
Date: Sat, 24 May 2025 15:34:19 +0300
Subject: [PATCH 2/3] tests: added test to check that scaledowns work with topology spread constraints

Signed-off-by: MenD32
---
 cluster-autoscaler/simulator/cluster_test.go | 64 ++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/cluster-autoscaler/simulator/cluster_test.go b/cluster-autoscaler/simulator/cluster_test.go
index e08c605c7cf9..7b0003a34ba1 100644
--- a/cluster-autoscaler/simulator/cluster_test.go
+++ b/cluster-autoscaler/simulator/cluster_test.go
@@ -148,6 +148,59 @@ func TestFindNodesToRemove(t *testing.T) {
 	assert.NoError(t, err)
 	tracker := NewUsageTracker()
 
+	topoNode1 := BuildTestNode("topo-n1", 1000, 2000000)
+	topoNode2 := BuildTestNode("topo-n2", 1000, 2000000)
+	topoNode3 := BuildTestNode("topo-n3", 1000, 2000000)
+	topoNode1.Labels = map[string]string{"kubernetes.io/hostname": "topo-n1"}
+	topoNode2.Labels = map[string]string{"kubernetes.io/hostname": "topo-n2"}
+	topoNode3.Labels = map[string]string{"kubernetes.io/hostname": "topo-n3"}
+
+	SetNodeReadyState(topoNode1, true, time.Time{})
+	SetNodeReadyState(topoNode2, true, time.Time{})
+	SetNodeReadyState(topoNode3, true, time.Time{})
+
+	minDomains := int32(2)
+	maxSkew := int32(1)
+	topoConstraint := apiv1.TopologySpreadConstraint{
+		MaxSkew:           maxSkew,
+		TopologyKey:       "kubernetes.io/hostname",
+		WhenUnsatisfiable: apiv1.DoNotSchedule,
+		MinDomains:        &minDomains,
+		LabelSelector: &metav1.LabelSelector{
+			MatchLabels: map[string]string{
+				"app": "topo-app",
+			},
+		},
+	}
+
+	pod5 := BuildTestPod("p5", 100, 100000)
+	pod5.Labels = map[string]string{"app": "topo-app"}
+	pod5.OwnerReferences = ownerRefs
+	pod5.Spec.NodeName = "topo-n1"
+	pod5.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	pod6 := BuildTestPod("p6", 100, 100000)
+	pod6.Labels = map[string]string{"app": "topo-app"}
+	pod6.OwnerReferences = ownerRefs
+	pod6.Spec.NodeName = "topo-n2"
+	pod6.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	pod7 := BuildTestPod("p7", 100, 100000)
+	pod7.Labels = map[string]string{"app": "topo-app"}
+	pod7.OwnerReferences = ownerRefs
+	pod7.Spec.NodeName = "topo-n3"
+	pod7.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{topoConstraint}
+
+	blocker1 := BuildTestPod("blocker1", 100, 100000)
+	blocker1.Spec.NodeName = "topo-n2"
+	blocker2 := BuildTestPod("blocker2", 100, 100000)
+	blocker2.Spec.NodeName = "topo-n3"
+
+	topoNodeToRemove := NodeToBeRemoved{
+		Node:             topoNode1,
+		PodsToReschedule: []*apiv1.Pod{pod5},
+	}
+
 	tests := []findNodesToRemoveTestConfig{
 		{
 			name:        "just an empty node, should be removed",
@@ -184,6 +237,17 @@ func TestFindNodesToRemove(t *testing.T) {
 			allNodes:    []*apiv1.Node{emptyNode, drainableNode, fullNode, nonDrainableNode},
 			toRemove:    []NodeToBeRemoved{emptyNodeToRemove, drainableNodeToRemove},
 		},
+		{
+			name:        "topology spread constraint test - one node should be removable",
+			pods:        []*apiv1.Pod{pod5, pod6, pod7, blocker1, blocker2},
+			allNodes:    []*apiv1.Node{topoNode1, topoNode2, topoNode3},
+			candidates:  []string{topoNode1.Name, topoNode2.Name, topoNode3.Name},
+			toRemove:    []NodeToBeRemoved{topoNodeToRemove},
+			unremovable: []*UnremovableNode{
+				{Node: topoNode2, Reason: BlockedByPod, BlockingPod: &drain.BlockingPod{Pod: blocker1, Reason: drain.NotReplicated}},
+				{Node: topoNode3, Reason: BlockedByPod, BlockingPod: &drain.BlockingPod{Pod: blocker2, Reason: drain.NotReplicated}},
+			},
+		},
 	}
 
 	for _, test := range tests {

From 13f0b04129faf80aa3bd40dc33b3982246771781 Mon Sep 17 00:00:00 2001
From: MenD32
Date: Tue, 27 May 2025 21:31:47 +0300
Subject: [PATCH 3/3] docs: added helpful comments

Signed-off-by: MenD32
---
 cluster-autoscaler/simulator/cluster.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster-autoscaler/simulator/cluster.go b/cluster-autoscaler/simulator/cluster.go
index 4ffc85ced200..3956241040f1 100644
--- a/cluster-autoscaler/simulator/cluster.go
+++ b/cluster-autoscaler/simulator/cluster.go
@@ -227,6 +227,7 @@ func (r *RemovalSimulator) findPlaceFor(removedNode string, pods []*apiv1.Pod, n
 			klog.Errorf("Simulating removal of %s/%s return error; %v", pod.Namespace, pod.Name, err)
 		}
 	}
+	// Remove the node from the snapshot, so that it doesn't interfere with topology spread constraint scheduling.
 	r.clusterSnapshot.RemoveNodeInfo(removedNode)
 
 	newpods := make([]*apiv1.Pod, 0, len(pods))
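
For context, a standalone sketch (not part of the patch series) of why the drained node has to be dropped from the snapshot: with the test's DoNotSchedule constraint and maxSkew=1, an empty "topo-n1" domain left in the snapshot would push the computed skew to 2 as soon as p5 is moved to another node, so findPlaceFor could never reschedule it. The skew arithmetic below is deliberately simplified compared to the real scheduler plugin; the domain counts mirror the test scenario.

// skew_sketch.go - simplified illustration of topology spread skew, not part of the patch.
package main

import "fmt"

// skew returns max(count) - min(count) across the topology domains the
// scheduler considers, which is what maxSkew is checked against.
func skew(podsPerDomain map[string]int) int {
	first := true
	lo, hi := 0, 0
	for _, n := range podsPerDomain {
		if first || n < lo {
			lo = n
		}
		if first || n > hi {
			hi = n
		}
		first = false
	}
	return hi - lo
}

func main() {
	const maxSkew = 1 // from the test's topoConstraint

	// Before the fix: the drained node stays in the snapshot as an empty
	// domain, so moving p5 onto topo-n2 yields counts {0, 2, 1}.
	withDrainedNode := map[string]int{"topo-n1": 0, "topo-n2": 2, "topo-n3": 1}

	// After the fix: RemoveNodeInfo drops topo-n1, leaving counts {2, 1}.
	withoutDrainedNode := map[string]int{"topo-n2": 2, "topo-n3": 1}

	fmt.Println(skew(withDrainedNode) <= maxSkew)    // false - DoNotSchedule rejects the move
	fmt.Println(skew(withoutDrainedNode) <= maxSkew) // true  - p5 can be rescheduled
}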