diff --git a/pkg/addon/controllers/addontemplate/controller.go b/pkg/addon/controllers/addontemplate/controller.go index 387c05ca2..0dff9eba7 100644 --- a/pkg/addon/controllers/addontemplate/controller.go +++ b/pkg/addon/controllers/addontemplate/controller.go @@ -8,18 +8,17 @@ import ( "github.com/openshift/library-go/pkg/operator/events" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/dynamic/dynamicinformer" kubeinformers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" "open-cluster-management.io/addon-framework/pkg/addonfactory" "open-cluster-management.io/addon-framework/pkg/addonmanager" "open-cluster-management.io/addon-framework/pkg/utils" - addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" addonv1alpha1client "open-cluster-management.io/api/client/addon/clientset/versioned" addoninformers "open-cluster-management.io/api/client/addon/informers/externalversions" @@ -29,6 +28,7 @@ import ( workv1informers "open-cluster-management.io/api/client/work/informers/externalversions" clusterv1 "open-cluster-management.io/api/cluster/v1" + addonindex "open-cluster-management.io/ocm/pkg/addon/index" "open-cluster-management.io/ocm/pkg/addon/templateagent" "open-cluster-management.io/ocm/pkg/common/queue" ) @@ -40,17 +40,18 @@ type addonTemplateController struct { // The key is the name of the template type addon. addonManagers map[string]context.CancelFunc - kubeConfig *rest.Config - addonClient addonv1alpha1client.Interface - workClient workv1client.Interface - kubeClient kubernetes.Interface - cmaLister addonlisterv1alpha1.ClusterManagementAddOnLister - addonInformers addoninformers.SharedInformerFactory - clusterInformers clusterv1informers.SharedInformerFactory - dynamicInformers dynamicinformer.DynamicSharedInformerFactory - workInformers workv1informers.SharedInformerFactory - runControllerFunc runController - eventRecorder events.Recorder + kubeConfig *rest.Config + addonClient addonv1alpha1client.Interface + workClient workv1client.Interface + kubeClient kubernetes.Interface + cmaLister addonlisterv1alpha1.ClusterManagementAddOnLister + managedClusterAddonIndexer cache.Indexer + addonInformers addoninformers.SharedInformerFactory + clusterInformers clusterv1informers.SharedInformerFactory + dynamicInformers dynamicinformer.DynamicSharedInformerFactory + workInformers workv1informers.SharedInformerFactory + runControllerFunc runController + eventRecorder events.Recorder } type runController func(ctx context.Context, addonName string) error @@ -69,17 +70,18 @@ func NewAddonTemplateController( runController ...runController, ) factory.Controller { c := &addonTemplateController{ - kubeConfig: hubKubeconfig, - kubeClient: hubKubeClient, - addonClient: addonClient, - workClient: workClient, - cmaLister: addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Lister(), - addonManagers: make(map[string]context.CancelFunc), - addonInformers: addonInformers, - clusterInformers: clusterInformers, - dynamicInformers: dynamicInformers, - workInformers: workInformers, - eventRecorder: recorder, + kubeConfig: hubKubeconfig, + kubeClient: hubKubeClient, + addonClient: addonClient, + workClient: workClient, + cmaLister: addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Lister(), + 
managedClusterAddonIndexer: addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetIndexer(), + addonManagers: make(map[string]context.CancelFunc), + addonInformers: addonInformers, + clusterInformers: clusterInformers, + dynamicInformers: dynamicInformers, + workInformers: workInformers, + eventRecorder: recorder, } if len(runController) > 0 { @@ -88,9 +90,28 @@ func NewAddonTemplateController( // easy to mock in unit tests c.runControllerFunc = c.runController } - return factory.New().WithInformersQueueKeysFunc( - queue.QueueKeyByMetaNamespaceName, - addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Informer()). + return factory.New(). + WithInformersQueueKeysFunc( + queue.QueueKeyByMetaNamespaceName, + addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Informer()). + WithFilteredEventsInformersQueueKeysFunc( + queue.QueueKeyByMetaName, + func(obj interface{}) bool { + mca, ok := obj.(*addonv1alpha1.ManagedClusterAddOn) + if !ok { + return false + } + + // Only process ManagedClusterAddOns that reference AddOnTemplates + for _, configRef := range mca.Status.ConfigReferences { + if configRef.ConfigGroupResource.Group == "addon.open-cluster-management.io" && + configRef.ConfigGroupResource.Resource == "addontemplates" { + return true + } + } + return false + }, + addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer()). WithBareInformers( // do not need to queue, just make sure the controller reconciles after the addonTemplate cache is synced // otherwise, there will be "xx-addon-template" not found" errors in the log as the controller uses the @@ -101,14 +122,32 @@ func NewAddonTemplateController( } func (c *addonTemplateController) stopUnusedManagers( - ctx context.Context, syncCtx factory.SyncContext, addOnName string) { - // TODO: check if all managed cluster addon instances are deleted + ctx context.Context, syncCtx factory.SyncContext, addOnName string) error { + logger := klog.FromContext(ctx) + + // Check if all managed cluster addon instances are deleted before stopping the manager + // This ensures the manager continues running while ManagedClusterAddOns exist + addons, err := c.managedClusterAddonIndexer.ByIndex(addonindex.ManagedClusterAddonByName, addOnName) + if err != nil { + return err + } + + // Check if there are still ManagedClusterAddOns for this addon + if len(addons) > 0 { + logger.Info("ManagedClusterAddOn still exists, waiting for deletion", + "addonName", addOnName, "count", len(addons)) + // No need to requeue since we now watch ManagedClusterAddon events + return nil + } + stopFunc, ok := c.addonManagers[addOnName] if ok { + logger.Info("Start to stop the manager for addon", "addonName", addOnName) stopFunc() delete(c.addonManagers, addOnName) - klog.FromContext(ctx).Info("Stopping the manager for addon", "addonName", addOnName) + logger.Info("The manager for addon stopped", "addonName", addOnName) } + return nil } func (c *addonTemplateController) sync(ctx context.Context, syncCtx factory.SyncContext) error { @@ -118,15 +157,13 @@ func (c *addonTemplateController) sync(ctx context.Context, syncCtx factory.Sync cma, err := c.cmaLister.Get(addonName) if err != nil { if errors.IsNotFound(err) { - c.stopUnusedManagers(ctx, syncCtx, addonName) - return nil + return c.stopUnusedManagers(ctx, syncCtx, addonName) } return err } if !templateagent.SupportAddOnTemplate(cma) { - c.stopUnusedManagers(ctx, syncCtx, cma.Name) - return nil + return c.stopUnusedManagers(ctx, syncCtx, cma.Name) } _, exist := 
c.addonManagers[addonName] @@ -188,7 +225,7 @@ func (c *addonTemplateController) runController(ctx context.Context, addonName s listOptions.LabelSelector = metav1.FormatLabelSelector(selector) }), ) - getValuesClosure := func(cluster *clusterv1.ManagedCluster, addon *addonapiv1alpha1.ManagedClusterAddOn) (addonfactory.Values, error) { + getValuesClosure := func(cluster *clusterv1.ManagedCluster, addon *addonv1alpha1.ManagedClusterAddOn) (addonfactory.Values, error) { return templateagent.GetAddOnRegistriesPrivateValuesFromClusterAnnotation(klog.FromContext(ctx), cluster, addon) } agentAddon := templateagent.NewCRDTemplateAgentAddon( @@ -224,13 +261,14 @@ func (c *addonTemplateController) runController(ctx context.Context, addonName s kubeInformers.Start(ctx.Done()) // trigger the manager to reconcile for the existing managed cluster addons - mcas, err := c.addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Lister().List(labels.Everything()) + mcas, err := c.managedClusterAddonIndexer.ByIndex(addonindex.ManagedClusterAddonByName, addonName) if err != nil { - logger.Info("Failed to list ManagedClusterAddOns", "error", err) + logger.Info("Failed to list ManagedClusterAddOns by index", "error", err) } else { for _, mca := range mcas { - if mca.Name == addonName { - mgr.Trigger(mca.Namespace, addonName) + addon, ok := mca.(*addonv1alpha1.ManagedClusterAddOn) + if ok { + mgr.Trigger(addon.Namespace, addonName) } } } diff --git a/pkg/addon/controllers/addontemplate/controller_test.go b/pkg/addon/controllers/addontemplate/controller_test.go index 05b54861b..50c70ab49 100644 --- a/pkg/addon/controllers/addontemplate/controller_test.go +++ b/pkg/addon/controllers/addontemplate/controller_test.go @@ -13,6 +13,7 @@ import ( dynamicfake "k8s.io/client-go/dynamic/fake" fakekube "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" "open-cluster-management.io/addon-framework/pkg/addonmanager/addontesting" "open-cluster-management.io/addon-framework/pkg/utils" @@ -24,7 +25,9 @@ import ( fakework "open-cluster-management.io/api/client/work/clientset/versioned/fake" workinformers "open-cluster-management.io/api/client/work/informers/externalversions" + addonindex "open-cluster-management.io/ocm/pkg/addon/index" testingcommon "open-cluster-management.io/ocm/pkg/common/testing" + testinghelpers "open-cluster-management.io/ocm/pkg/registration/helpers/testing" ) func TestReconcile(t *testing.T) { @@ -150,6 +153,15 @@ func TestReconcile(t *testing.T) { addonInformers := addoninformers.NewSharedInformerFactory(fakeAddonClient, 10*time.Minute) + // Add the index for ManagedClusterAddonByName + err := addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().AddIndexers( + cache.Indexers{ + addonindex.ManagedClusterAddonByName: addonindex.IndexManagedClusterAddonByName, + }) + if err != nil { + t.Fatal(err) + } + for _, obj := range c.managedClusteraddon { if err := addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetStore().Add(obj); err != nil { t.Fatal(err) @@ -248,15 +260,16 @@ func TestRunController(t *testing.T) { workInformers := workinformers.NewSharedInformerFactory(fakeWorkClient, 10*time.Minute) hubKubeClient := fakekube.NewSimpleClientset() controller := &addonTemplateController{ - kubeConfig: &rest.Config{}, - kubeClient: hubKubeClient, - addonClient: fakeAddonClient, - cmaLister: addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Lister(), - addonManagers: make(map[string]context.CancelFunc), - addonInformers: 
addonInformers, - clusterInformers: clusterInformers, - dynamicInformers: dynamicInformerFactory, - workInformers: workInformers, + kubeConfig: &rest.Config{}, + kubeClient: hubKubeClient, + addonClient: fakeAddonClient, + cmaLister: addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Lister(), + managedClusterAddonIndexer: addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetIndexer(), + addonManagers: make(map[string]context.CancelFunc), + addonInformers: addonInformers, + clusterInformers: clusterInformers, + dynamicInformers: dynamicInformerFactory, + workInformers: workInformers, } ctx := context.TODO() @@ -268,3 +281,139 @@ func TestRunController(t *testing.T) { } } } + +func TestStopUnusedManagers(t *testing.T) { + cases := []struct { + name string + addonName string + managedClusterAddons []runtime.Object + existingManagers map[string]context.CancelFunc + expectedManagerStopped bool + expectedRequeue bool + }{ + { + name: "no managed cluster addons, manager should be stopped", + addonName: "test-addon", + managedClusterAddons: []runtime.Object{}, + existingManagers: map[string]context.CancelFunc{ + "test-addon": func() {}, + }, + expectedManagerStopped: true, + expectedRequeue: false, + }, + { + name: "managed cluster addons exist, manager should not be stopped", + addonName: "test-addon", + managedClusterAddons: []runtime.Object{ + testinghelpers.NewManagedClusterAddons("test-addon", "cluster1", nil, nil), + }, + existingManagers: map[string]context.CancelFunc{ + "test-addon": func() {}, + }, + expectedManagerStopped: false, + expectedRequeue: true, + }, + { + name: "multiple managed cluster addons exist, manager should not be stopped", + addonName: "test-addon", + managedClusterAddons: []runtime.Object{ + testinghelpers.NewManagedClusterAddons("test-addon", "cluster1", nil, nil), + testinghelpers.NewManagedClusterAddons("test-addon", "cluster2", nil, nil), + }, + existingManagers: map[string]context.CancelFunc{ + "test-addon": func() {}, + }, + expectedManagerStopped: false, + expectedRequeue: true, + }, + { + name: "no manager exists, should not error", + addonName: "test-addon", + managedClusterAddons: []runtime.Object{}, + existingManagers: map[string]context.CancelFunc{}, + expectedManagerStopped: false, + expectedRequeue: false, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + fakeAddonClient := fakeaddon.NewSimpleClientset(c.managedClusterAddons...) 
+ addonInformers := addoninformers.NewSharedInformerFactory(fakeAddonClient, 10*time.Minute) + + fakeDynamicClient := dynamicfake.NewSimpleDynamicClient(runtime.NewScheme()) + dynamicInformerFactory := dynamicinformer.NewDynamicSharedInformerFactory(fakeDynamicClient, 0) + fakeClusterClient := fakecluster.NewSimpleClientset() + clusterInformers := clusterv1informers.NewSharedInformerFactory(fakeClusterClient, 10*time.Minute) + fakeWorkClient := fakework.NewSimpleClientset() + workInformers := workinformers.NewSharedInformerFactory(fakeWorkClient, 10*time.Minute) + hubKubeClient := fakekube.NewSimpleClientset() + + managerStopped := false + existingManagers := make(map[string]context.CancelFunc) + for name, stopFunc := range c.existingManagers { + capturedName := name + capturedStopFunc := stopFunc + existingManagers[name] = func() { + if capturedName == c.addonName { + managerStopped = true + } + capturedStopFunc() + } + } + + // Add the index for ManagedClusterAddonByName + err := addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().AddIndexers( + cache.Indexers{ + addonindex.ManagedClusterAddonByName: addonindex.IndexManagedClusterAddonByName, + }) + if err != nil { + t.Fatal(err) + } + + controller := &addonTemplateController{ + kubeConfig: &rest.Config{}, + kubeClient: hubKubeClient, + addonClient: fakeAddonClient, + workClient: fakeWorkClient, + cmaLister: addonInformers.Addon().V1alpha1().ClusterManagementAddOns().Lister(), + managedClusterAddonIndexer: addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetIndexer(), + addonManagers: existingManagers, + addonInformers: addonInformers, + clusterInformers: clusterInformers, + dynamicInformers: dynamicInformerFactory, + workInformers: workInformers, + eventRecorder: eventstesting.NewTestingEventRecorder(t), + } + + // Start informers and wait for cache sync + ctx := context.TODO() + addonInformers.Start(ctx.Done()) + addonInformers.WaitForCacheSync(ctx.Done()) + + syncContext := testingcommon.NewFakeSyncContext(t, c.addonName) + + err = controller.stopUnusedManagers(ctx, syncContext, c.addonName) + assert.NoError(t, err) + // Check if manager was stopped + if c.expectedManagerStopped { + assert.True(t, managerStopped, "expected manager to be stopped") + _, exists := controller.addonManagers[c.addonName] + assert.False(t, exists, "expected manager to be removed from map") + } else { + assert.False(t, managerStopped, "expected manager not to be stopped") + if len(c.existingManagers) > 0 { + _, exists := controller.addonManagers[c.addonName] + assert.True(t, exists, "expected manager to still exist in map") + } + } + + // Check if requeue was called + if c.expectedRequeue { + // We can't easily test the exact requeue behavior with the fake sync context + // but we can verify the manager wasn't stopped when it should be requeued + assert.False(t, managerStopped, "manager should not be stopped when requeue is expected") + } + }) + } +} diff --git a/pkg/addon/templateagent/registration.go b/pkg/addon/templateagent/registration.go index c60114cb3..9fd5f57c1 100644 --- a/pkg/addon/templateagent/registration.go +++ b/pkg/addon/templateagent/registration.go @@ -91,7 +91,7 @@ func (a *CRDTemplateAgentAddon) TemplateCSRConfigurationsFunc() func(cluster *cl return func(cluster *clusterv1.ManagedCluster) []addonapiv1alpha1.RegistrationConfig { template, err := a.GetDesiredAddOnTemplate(nil, cluster.Name, a.addonName) if err != nil { - utilruntime.HandleError(fmt.Errorf("failed to get addon %s template: %v", 
a.addonName, err)) + a.logger.Info("CSRConfigurations failed to get addon template", "addonName", a.addonName, "error", err) return nil } if template == nil { @@ -168,7 +168,7 @@ func (a *CRDTemplateAgentAddon) TemplateCSRApproveCheckFunc() agent.CSRApproveFu template, err := a.GetDesiredAddOnTemplate(addon, cluster.Name, a.addonName) if err != nil { - utilruntime.HandleError(fmt.Errorf("failed to get addon %s template: %v", a.addonName, err)) + a.logger.Info("CSRApproveCheck failed to get addon template", "addonName", a.addonName, "error", err) return false } if template == nil { diff --git a/pkg/addon/templateagent/template_agent.go b/pkg/addon/templateagent/template_agent.go index 71567e803..9795d8b5c 100644 --- a/pkg/addon/templateagent/template_agent.go +++ b/pkg/addon/templateagent/template_agent.go @@ -153,11 +153,11 @@ func (a *CRDTemplateAgentAddon) GetAgentAddonOptions() agent.AgentAddonOptions { template, err := a.GetDesiredAddOnTemplate(nil, "", a.addonName) if err != nil { - utilruntime.HandleError(fmt.Errorf("failed to get addon %s template: %v", a.addonName, err)) + utilruntime.HandleError(fmt.Errorf("GetAgentAddonOptions failed to get addon %s template: %v", a.addonName, err)) return agentAddonOptions } if template == nil { - utilruntime.HandleError(fmt.Errorf("addon %s template is nil", a.addonName)) + utilruntime.HandleError(fmt.Errorf("GetAgentAddonOptions addon %s template is nil", a.addonName)) return agentAddonOptions } agentAddonOptions.ManifestConfigs = template.Spec.AgentSpec.ManifestConfigs diff --git a/test/e2e/addonmanagement_test.go b/test/e2e/addonmanagement_test.go index 413e9eb3e..03a507c82 100644 --- a/test/e2e/addonmanagement_test.go +++ b/test/e2e/addonmanagement_test.go @@ -256,14 +256,14 @@ var _ = ginkgo.Describe("Enable addon management feature gate", ginkgo.Ordered, gomega.Expect(err).ToNot(gomega.HaveOccurred()) gomega.Eventually(func() error { - copyiedConfig, err := spoke.KubeClient.CoreV1().ConfigMaps(addonInstallNamespace).Get( + copiedConfig, err := spoke.KubeClient.CoreV1().ConfigMaps(addonInstallNamespace).Get( context.Background(), configmap.Name, metav1.GetOptions{}) if err != nil { return err } - if !equality.Semantic.DeepEqual(copyiedConfig.Data, configmap.Data) { - return fmt.Errorf("expected configmap is not correct, %v", copyiedConfig.Data) + if !equality.Semantic.DeepEqual(copiedConfig.Data, configmap.Data) { + return fmt.Errorf("expected configmap is not correct, %v", copiedConfig.Data) } return nil }).ShouldNot(gomega.HaveOccurred()) @@ -830,6 +830,94 @@ var _ = ginkgo.Describe("Enable addon management feature gate", ginkgo.Ordered, return nil }).ShouldNot(gomega.HaveOccurred()) }) + + ginkgo.It("ClusterManagementAddon deletion should wait for ManagedClusterAddons cleanup", func() { + ginkgo.By("Make sure addon is functioning before deletion") + configmap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("config-%s", rand.String(6)), + Namespace: universalClusterName, + }, + Data: map[string]string{ + "key1": rand.String(6), + "key2": rand.String(6), + }, + } + + _, err := hub.KubeClient.CoreV1().ConfigMaps(universalClusterName).Create( + context.Background(), configmap, metav1.CreateOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Eventually(func() error { + copiedConfig, err := spoke.KubeClient.CoreV1().ConfigMaps(addonInstallNamespace).Get( + context.Background(), configmap.Name, metav1.GetOptions{}) + if err != nil { + return err + } + + if 
!equality.Semantic.DeepEqual(copiedConfig.Data, configmap.Data) { + return fmt.Errorf("expected configmap is not correct, %v", copiedConfig.Data) + } + return nil + }).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Delete the ClusterManagementAddon to trigger cascading deletion") + err = hub.AddonClient.AddonV1alpha1().ClusterManagementAddOns().Delete( + context.TODO(), addOnName, metav1.DeleteOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + ginkgo.By("The pre-delete job should clean up the configmap") + gomega.Eventually(func() error { + _, err := spoke.KubeClient.CoreV1().ConfigMaps(addonInstallNamespace).Get( + context.Background(), configmap.Name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + return err + } + return fmt.Errorf("the configmap should be deleted") + }).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("ManagedClusterAddon should eventually be deleted after pre-delete job completes") + gomega.Eventually(func() error { + _, err := hub.AddonClient.AddonV1alpha1().ManagedClusterAddOns(universalClusterName).Get( + context.TODO(), addOnName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + return err + } + return fmt.Errorf("the ManagedClusterAddon should be deleted") + }).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("The pre-delete job should be cleaned up") + gomega.Eventually(func() error { + _, err := spoke.KubeClient.BatchV1().Jobs(addonInstallNamespace).Get( + context.Background(), "hello-template-cleanup-configmap", metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + return err + } + return fmt.Errorf("the job should be deleted") + }).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Verify ClusterManagementAddon is deleted") + gomega.Eventually(func() error { + _, err := hub.AddonClient.AddonV1alpha1().ClusterManagementAddOns().Get( + context.TODO(), addOnName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + return err + } + return fmt.Errorf("the ClusterManagementAddon should be deleted") + }).ShouldNot(gomega.HaveOccurred()) + }) }) func prepareInstallNamespace(namespace, installNamespace string) error {
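
Note: the diff registers and queries an index named addonindex.ManagedClusterAddonByName from the new open-cluster-management.io/ocm/pkg/addon/index package, but that package's source is not part of this patch. The following is a rough sketch of what such an index function typically looks like — an assumption for illustration, not the repository's actual pkg/addon/index implementation. It keys each ManagedClusterAddOn by its metadata name (which equals the add-on name, with the namespace being the cluster name), so stopUnusedManagers and runController can fetch all per-cluster instances of one add-on with a single ByIndex lookup instead of listing every ManagedClusterAddOn.

package index

import (
	addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
)

// ManagedClusterAddonByName is the index name registered on the
// ManagedClusterAddOn informer (the literal value here is an assumption).
const ManagedClusterAddonByName = "managedClusterAddonByName"

// IndexManagedClusterAddonByName returns the index keys for a
// ManagedClusterAddOn: a single key, the object's name. Because a
// ManagedClusterAddOn's name is the add-on name and its namespace is the
// managed cluster name, ByIndex(ManagedClusterAddonByName, addonName)
// yields one object per cluster that still has the add-on installed.
func IndexManagedClusterAddonByName(obj interface{}) ([]string, error) {
	addon, ok := obj.(*addonv1alpha1.ManagedClusterAddOn)
	if !ok {
		return []string{}, nil
	}
	return []string{addon.Name}, nil
}

With an index of this shape registered on the shared ManagedClusterAddOn informer, the earlier pattern in runController — listing with labels.Everything() and comparing mca.Name against the add-on name — collapses to one indexer lookup, which is why the k8s.io/apimachinery/pkg/labels import is dropped at the top of the diff.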