Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ type machineController struct {
nodeInformer cache.SharedIndexInformer
managementClient dynamic.Interface
managementScaleClient scale.ScalesGetter
managementDiscoveryClient discovery.DiscoveryInterface
machineSetResource schema.GroupVersionResource
machineResource schema.GroupVersionResource
machinePoolResource schema.GroupVersionResource
Expand Down Expand Up @@ -457,7 +458,7 @@ func newMachineController(
managementInformerFactory := dynamicinformer.NewFilteredDynamicSharedInformerFactory(managementClient, 0, namespaceToWatch(autoDiscoverySpecs), nil)

CAPIGroup := getCAPIGroup()
CAPIVersion, err := getAPIGroupPreferredVersion(managementDiscoveryClient, CAPIGroup)
CAPIVersion, err := getCAPIGroupPreferredVersion(managementDiscoveryClient, CAPIGroup)
if err != nil {
return nil, fmt.Errorf("could not find preferred version for CAPI group %q: %v", CAPIGroup, err)
}
Expand Down Expand Up @@ -561,6 +562,7 @@ func newMachineController(
nodeInformer: nodeInformer,
managementClient: managementClient,
managementScaleClient: managementScaleClient,
managementDiscoveryClient: managementDiscoveryClient,
machineSetResource: gvrMachineSet,
machinePoolResource: gvrMachinePool,
machinePoolsAvailable: machinePoolsAvailable,
Expand All @@ -586,11 +588,15 @@ func groupVersionHasResource(client discovery.DiscoveryInterface, groupVersion,
return false, nil
}

func getAPIGroupPreferredVersion(client discovery.DiscoveryInterface, APIGroup string) (string, error) {
// getCAPIGroupPreferredVersion resolves the version to use for the given
// Cluster API group. A non-empty value of the CAPIVersionEnvVar environment
// variable takes precedence and is returned as-is, without consulting the
// discovery client. Otherwise the lookup is delegated to
// getAPIGroupPreferredVersion with the same client and group.
func getCAPIGroupPreferredVersion(client discovery.DiscoveryInterface, APIGroup string) (string, error) {
	override := os.Getenv(CAPIVersionEnvVar)
	if override == "" {
		// No explicit override configured: use the generic group lookup.
		return getAPIGroupPreferredVersion(client, APIGroup)
	}

	return override, nil
}

func getAPIGroupPreferredVersion(client discovery.DiscoveryInterface, APIGroup string) (string, error) {
groupList, err := client.ServerGroups()
if err != nil {
return "", fmt.Errorf("failed to get ServerGroups: %v", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"encoding/json"
"fmt"
"k8s.io/apimachinery/pkg/types"
"math/rand"
"path"
"reflect"
Expand All @@ -36,6 +35,7 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
fakediscovery "k8s.io/client-go/discovery/fake"
"k8s.io/client-go/dynamic"
Expand Down Expand Up @@ -107,13 +107,13 @@ func mustCreateTestController(t testing.TB, testConfigs ...*testConfig) (*machin
dynamicClientset := fakedynamic.NewSimpleDynamicClientWithCustomListKinds(
runtime.NewScheme(),
map[schema.GroupVersionResource]string{
{Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinedeployments"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machines"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinesets"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinepools"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machinedeployments"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machines"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machinesets"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta2", Resource: "machinedeployments"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta2", Resource: "machines"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta2", Resource: "machinesets"}: "kindList",
{Group: "cluster.x-k8s.io", Version: "v1beta2", Resource: "machinepools"}: "kindList",
{Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machinepools"}: "kindList",
{Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machinedeployments"}: "kindList",
{Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machines"}: "kindList",
Expand Down Expand Up @@ -151,7 +151,7 @@ func mustCreateTestController(t testing.TB, testConfigs ...*testConfig) (*machin
},
},
{
GroupVersion: fmt.Sprintf("%s/v1alpha3", defaultCAPIGroup),
GroupVersion: fmt.Sprintf("%s/v1beta2", defaultCAPIGroup),
APIResources: []metav1.APIResource{
{
Name: resourceNameMachineDeployment,
Expand Down Expand Up @@ -191,7 +191,7 @@ func mustCreateTestController(t testing.TB, testConfigs ...*testConfig) (*machin

gvr := schema.GroupVersionResource{
Group: action.GetResource().Group,
Version: "v1alpha3",
Version: "v1beta2",
Resource: resource,
}

Expand Down Expand Up @@ -366,7 +366,7 @@ func createTestConfigs(specs ...testSpec) []*testConfig {
config.machineSet = &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": machineSetKind,
"apiVersion": "cluster.x-k8s.io/v1alpha3",
"apiVersion": "cluster.x-k8s.io/v1beta2",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we have to adjust the infrastructureRef here to use the new format of v1beta2 (i.e. apiGroup instead of apiVersion)

(please also check if there are other cases below)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current test structure is not very flexible. This case covers a MachineSet that uses apiVersion, i.e. the previous behavior.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe then we should use v1beta1 here. As it is, this is not a valid v1beta2 object.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i have some thoughts about how to make these tests easier to work with, but best if we get this review done first then perhaps i can propose some cleanups.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if I got it right, but v1beta2 MachineDeployments, MachineSets and MachinePools always have apiGroup. How could they have apiVersion?

Copy link
Contributor

@elmiko elmiko Aug 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this line is the apiVersion for the kind that is being created, for this clause it's a MachineSet. this isn't about the infrastructure ref.

edit: misread your comment @sbueringer

i agree with your suggestion that we use v1beta1 here.

Copy link
Member

@sbueringer sbueringer Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just chatted with Jun. While it looks trivial to just change this apiVersion to v1beta1 here, it breaks a huge number of tests and requires refactoring.

(57 tests failed, 143 tests passed when changing this to v1beta1)

So from my side it would be okay to defer the test refactoring if we feel the change in this PR is sufficiently unit tested.

But I leave this to autoscaler reviewers / maintainers of course

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume we're talking about test refactoring under the cloudprovider/clusterapi directory, which is in fact maintained independently from the core autoscaler. So I'll defer to @elmiko for final call on merging w/ v1beta2 change.

Overall lgtm

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, i was talking about refactoring the tests for clusterapi. one of the things i'd like to do is replace the brittle test config functions with a better interface modeled around a builder/fluent pattern for creating the capi objects.

"metadata": map[string]interface{}{
"name": spec.machineSetName,
"namespace": spec.namespace,
Expand Down Expand Up @@ -404,7 +404,7 @@ func createTestConfigs(specs ...testSpec) []*testConfig {
config.machineDeployment = &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": machineDeploymentKind,
"apiVersion": "cluster.x-k8s.io/v1alpha3",
"apiVersion": "cluster.x-k8s.io/v1beta2",
"metadata": map[string]interface{}{
"name": spec.machineDeploymentName,
"namespace": spec.namespace,
Expand All @@ -416,9 +416,9 @@ func createTestConfigs(specs ...testSpec) []*testConfig {
"template": map[string]interface{}{
"spec": map[string]interface{}{
"infrastructureRef": map[string]interface{}{
"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
"kind": machineTemplateKind,
"name": "TestMachineTemplate",
"apiGroup": "infrastructure.cluster.x-k8s.io",
"kind": machineTemplateKind,
"name": "TestMachineTemplate",
},
},
},
Expand Down Expand Up @@ -506,7 +506,7 @@ func makeLinkedNodeAndMachine(i int, namespace, clusterName string, owner metav1
machine := &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": machineKind,
"apiVersion": "cluster.x-k8s.io/v1alpha3",
"apiVersion": "cluster.x-k8s.io/v1beta2",
"metadata": map[string]interface{}{
"name": fmt.Sprintf("%s-%s-machine-%d", namespace, owner.Name, i),
"namespace": namespace,
Expand Down Expand Up @@ -1550,7 +1550,7 @@ func TestGetAPIGroupPreferredVersion(t *testing.T) {
{
description: "find version for default API group",
APIGroup: defaultCAPIGroup,
preferredVersion: "v1alpha3",
preferredVersion: "v1beta2",
envVar: "",
error: false,
},
Expand Down Expand Up @@ -1584,7 +1584,7 @@ func TestGetAPIGroupPreferredVersion(t *testing.T) {
GroupVersion: fmt.Sprintf("%s/v1beta1", customCAPIGroup),
},
{
GroupVersion: fmt.Sprintf("%s/v1alpha3", defaultCAPIGroup),
GroupVersion: fmt.Sprintf("%s/v1beta2", defaultCAPIGroup),
},
{
GroupVersion: fmt.Sprintf("%s/%s", customCAPIGroup, customVersion),
Expand All @@ -1595,7 +1595,7 @@ func TestGetAPIGroupPreferredVersion(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
t.Setenv(CAPIVersionEnvVar, tc.envVar)
version, err := getAPIGroupPreferredVersion(discoveryClient, tc.APIGroup)
version, err := getCAPIGroupPreferredVersion(discoveryClient, tc.APIGroup)
if (err != nil) != tc.error {
t.Errorf("expected to have error: %t. Had an error: %t", tc.error, err != nil)
}
Expand All @@ -1617,14 +1617,14 @@ func TestGroupVersionHasResource(t *testing.T) {
{
description: "true when it finds resource",
resourceName: resourceNameMachineDeployment,
APIGroup: fmt.Sprintf("%s/v1alpha3", defaultCAPIGroup),
APIGroup: fmt.Sprintf("%s/v1beta2", defaultCAPIGroup),
expected: true,
error: false,
},
{
description: "false when it does not find resource",
resourceName: "resourceDoesNotExist",
APIGroup: fmt.Sprintf("%s/v1alpha3", defaultCAPIGroup),
APIGroup: fmt.Sprintf("%s/v1beta2", defaultCAPIGroup),
expected: false,
error: false,
},
Expand All @@ -1641,7 +1641,7 @@ func TestGroupVersionHasResource(t *testing.T) {
Fake: &clientgotesting.Fake{
Resources: []*metav1.APIResourceList{
{
GroupVersion: fmt.Sprintf("%s/v1alpha3", defaultCAPIGroup),
GroupVersion: fmt.Sprintf("%s/v1beta2", defaultCAPIGroup),
APIResources: []metav1.APIResource{
{
Name: resourceNameMachineDeployment,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,22 +375,44 @@ func (r unstructuredScalableResource) InstanceDRADriver() string {
}

func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*unstructured.Unstructured, error) {
obKind := r.unstructured.GetKind()
obName := r.unstructured.GetName()

infraref, found, err := unstructured.NestedStringMap(r.unstructured.Object, "spec", "template", "spec", "infrastructureRef")
if !found || err != nil {
return nil, nil
}

apiversion, ok := infraref["apiVersion"]
if !ok {
return nil, nil
var apiversion string

apiGroup, ok := infraref["apiGroup"]
if ok {
if apiversion, err = getAPIGroupPreferredVersion(r.controller.managementDiscoveryClient, apiGroup); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I see correctly this is doing a live call against the apiserver. I'm wondering if 1 live call for every call of readInfrastructureReferenceResource is too much

Should we use a cache with a TTL to cache the apiGroup => version mapping? (ttl: 1m or 10m?)
(we can use client-go/tools/cache.NewTTLStore for that)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Initially, I think this api is invoked only during scale up/down. @elmiko any advice where to put the cache?

Copy link
Member

@sbueringer sbueringer Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's okay to always do a live call here because this isn't called too often, absolutely fine for me of course (I just don't know :))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these calls will only happen when the core autoscaler wants to construct a node template. if the autoscaler has a ready node from the node group, then it will use a node as a template instead of asking the provider to generate a new template (where this function is called).

in the worst case scenario, this function will get called once per node group per scan interval from the autoscaler, which defaults to 10 seconds. in a large cluster this could be called several times for the same template depending on how the cluster-api resources are organized.

i think it's worth investigating putting a cache in for the infrastructure templates as they probably won't change that frequently and it could save us some api calls.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds like we don't necessarily need caching. If I see correctly the getInfrastructureResource below is also not cached? So this won't add much on top

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getInfrastructureResource enables informer's cache.

klog.V(4).Infof("Unable to read preferred version from api group %s, error: %v", apiGroup, err)
return nil, err
}
apiversion = fmt.Sprintf("%s/%s", apiGroup, apiversion)
} else {
// Fall back to ObjectReference in capi v1beta1
apiversion, ok = infraref["apiVersion"]
if !ok {
info := fmt.Sprintf("Missing apiVersion from %s %s's InfrastructureReference", obKind, obName)
klog.V(4).Info(info)
return nil, errors.New(info)
}
}

kind, ok := infraref["kind"]
if !ok {
return nil, nil
info := fmt.Sprintf("Missing kind from %s %s's InfrastructureReference", obKind, obName)
klog.V(4).Info(info)
return nil, errors.New(info)
}
name, ok := infraref["name"]
if !ok {
return nil, nil
info := fmt.Sprintf("Missing name from %s %s's InfrastructureReference", obKind, obName)
klog.V(4).Info(info)
return nil, errors.New(info)
}
// kind needs to be lower case and plural
kind = fmt.Sprintf("%ss", strings.ToLower(kind))
Expand Down
Loading