diff --git a/cmd/main.go b/cmd/main.go index a3220668..9344037f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -41,6 +41,7 @@ import ( "github.com/kcp-dev/kcp-operator/internal/controller/kubeconfig" "github.com/kcp-dev/kcp-operator/internal/controller/rootshard" "github.com/kcp-dev/kcp-operator/internal/controller/shard" + "github.com/kcp-dev/kcp-operator/internal/metrics" "github.com/kcp-dev/kcp-operator/internal/reconciling" operatorv1alpha1 "github.com/kcp-dev/kcp-operator/sdk/apis/operator/v1alpha1" ) @@ -190,6 +191,12 @@ func main() { } // +kubebuilder:scaffold:builder + metrics.RegisterMetrics() + + metricsCollector := metrics.NewMetricsCollector(mgr.GetClient()) + ctx := ctrl.SetupSignalHandler() + go metricsCollector.Start(ctx) + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) @@ -200,7 +207,7 @@ func main() { } setupLog.Info("starting manager") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) } diff --git a/go.mod b/go.mod index 46fc179b..0cefdeea 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/kcp-dev/kcp-operator/sdk v0.0.0-00010101000000-000000000000 github.com/kcp-dev/kcp/sdk v0.27.1 github.com/kcp-dev/logicalcluster/v3 v3.0.5 + github.com/prometheus/client_golang v1.20.5 github.com/stretchr/testify v1.10.0 go.uber.org/zap v1.27.0 k8c.io/reconciler v0.5.0 @@ -68,7 +69,6 @@ require ( github.com/onsi/gomega v1.35.1 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.20.5 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.61.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect diff --git a/internal/controller/frontproxy/controller.go b/internal/controller/frontproxy/controller.go index 84f663cd..006b9f1d 100644 --- a/internal/controller/frontproxy/controller.go +++ b/internal/controller/frontproxy/controller.go @@ -19,6 +19,7 @@ package frontproxy import ( "context" "fmt" + "time" certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" @@ -38,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/kcp-dev/kcp-operator/internal/controller/util" + "github.com/kcp-dev/kcp-operator/internal/metrics" "github.com/kcp-dev/kcp-operator/internal/resources" "github.com/kcp-dev/kcp-operator/internal/resources/frontproxy" operatorv1alpha1 "github.com/kcp-dev/kcp-operator/sdk/apis/operator/v1alpha1" @@ -89,12 +91,19 @@ func (r *FrontProxyReconciler) SetupWithManager(mgr ctrl.Manager) error { // +kubebuilder:rbac:groups=core,resources=services;configmaps;secrets,verbs=get;list;watch;create;update;patch;delete func (r *FrontProxyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, recErr error) { + startTime := time.Now() + defer func() { + duration := time.Since(startTime) + metrics.RecordReconciliationMetrics(metrics.FrontProxyResourceType, duration.Seconds(), recErr) + }() + logger := log.FromContext(ctx) logger.V(4).Info("Reconciling") var frontProxy operatorv1alpha1.FrontProxy if err := r.Get(ctx, req.NamespacedName, &frontProxy); err != nil { if ctrlruntimeclient.IgnoreNotFound(err) != nil { + metrics.RecordReconciliationError(metrics.FrontProxyResourceType, err.Error()) return ctrl.Result{}, fmt.Errorf("failed to get FrontProxy object: %w", err) } @@ -108,6 +117,14 @@ func (r *FrontProxyReconciler) Reconcile(ctx context.Context, req ctrl.Request) recErr = kerrors.NewAggregate([]error{recErr, err}) } + metrics.RecordObjectMetrics( + metrics.FrontProxyResourceType, + frontProxy.Name, + req.Namespace, + string(frontProxy.Status.Phase), + frontProxy.Status.Conditions, + ) + return ctrl.Result{}, recErr } diff --git a/internal/controller/kubeconfig/controller.go b/internal/controller/kubeconfig/controller.go index fc6a4059..12591ad0 100644 --- a/internal/controller/kubeconfig/controller.go +++ b/internal/controller/kubeconfig/controller.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/kcp-dev/kcp-operator/internal/controller/util" + "github.com/kcp-dev/kcp-operator/internal/metrics" "github.com/kcp-dev/kcp-operator/internal/reconciling" "github.com/kcp-dev/kcp-operator/internal/resources" "github.com/kcp-dev/kcp-operator/internal/resources/kubeconfig" @@ -74,6 +75,12 @@ func (r *KubeconfigReconciler) SetupWithManager(mgr ctrl.Manager) error { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *KubeconfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + startTime := time.Now() + defer func() { + duration := time.Since(startTime) + metrics.RecordReconciliationMetrics(metrics.KubeconfigResourceType, duration.Seconds(), nil) + }() + logger := log.FromContext(ctx) logger.V(4).Info("Reconciling") @@ -83,6 +90,7 @@ func (r *KubeconfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if apierrors.IsNotFound(err) { return ctrl.Result{}, nil } + metrics.RecordReconciliationError(metrics.KubeconfigResourceType, err.Error()) return ctrl.Result{}, err } @@ -113,6 +121,14 @@ func (r *KubeconfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) recErr = kerrors.NewAggregate([]error{recErr, err}) } + metrics.RecordObjectMetrics( + metrics.KubeconfigResourceType, + kc.Name, + req.Namespace, + string(kc.Status.Phase), + kc.Status.Conditions, + ) + return ctrl.Result{}, recErr } diff --git a/internal/controller/rootshard/controller.go b/internal/controller/rootshard/controller.go index fc021b2c..f12bdde0 100644 --- a/internal/controller/rootshard/controller.go +++ b/internal/controller/rootshard/controller.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "sort" + "time" certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" k8creconciling "k8c.io/reconciler/pkg/reconciling" @@ -40,6 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/kcp-dev/kcp-operator/internal/controller/util" + "github.com/kcp-dev/kcp-operator/internal/metrics" "github.com/kcp-dev/kcp-operator/internal/reconciling" "github.com/kcp-dev/kcp-operator/internal/resources" "github.com/kcp-dev/kcp-operator/internal/resources/frontproxy" @@ -99,12 +101,19 @@ func (r *RootShardReconciler) SetupWithManager(mgr ctrl.Manager) error { // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.0/pkg/reconcile func (r *RootShardReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, recErr error) { + startTime := time.Now() + defer func() { + duration := time.Since(startTime) + metrics.RecordReconciliationMetrics(metrics.RootShardResourceType, duration.Seconds(), recErr) + }() + logger := log.FromContext(ctx) logger.V(4).Info("Reconciling") var rootShard operatorv1alpha1.RootShard if err := r.Get(ctx, req.NamespacedName, &rootShard); err != nil { if ctrlruntimeclient.IgnoreNotFound(err) != nil { + metrics.RecordReconciliationError(metrics.RootShardResourceType, err.Error()) return ctrl.Result{}, fmt.Errorf("failed to find %s/%s: %w", req.Namespace, req.Name, err) } @@ -118,6 +127,14 @@ func (r *RootShardReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( recErr = kerrors.NewAggregate([]error{recErr, err}) } + metrics.RecordObjectMetrics( + metrics.RootShardResourceType, + rootShard.Name, + req.Namespace, + string(rootShard.Status.Phase), + rootShard.Status.Conditions, + ) + return ctrl.Result{}, recErr } diff --git a/internal/controller/shard/controller.go b/internal/controller/shard/controller.go index 1fd52766..468f6430 100644 --- a/internal/controller/shard/controller.go +++ b/internal/controller/shard/controller.go @@ -19,6 +19,7 @@ package shard import ( "context" "fmt" + "time" certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" k8creconciling "k8c.io/reconciler/pkg/reconciling" @@ -39,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/kcp-dev/kcp-operator/internal/controller/util" + "github.com/kcp-dev/kcp-operator/internal/metrics" "github.com/kcp-dev/kcp-operator/internal/reconciling" "github.com/kcp-dev/kcp-operator/internal/resources" "github.com/kcp-dev/kcp-operator/internal/resources/shard" @@ -90,12 +92,19 @@ func (r *ShardReconciler) SetupWithManager(mgr ctrl.Manager) error { // +kubebuilder:rbac:groups=core,resources=secrets;services,verbs=get;list;watch;create;update;patch;delete func (r *ShardReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, recErr error) { + startTime := time.Now() + defer func() { + duration := time.Since(startTime) + metrics.RecordReconciliationMetrics(metrics.ShardResourceType, duration.Seconds(), recErr) + }() + logger := log.FromContext(ctx) logger.V(4).Info("Reconciling Shard object") var s operatorv1alpha1.Shard if err := r.Get(ctx, req.NamespacedName, &s); err != nil { if ctrlruntimeclient.IgnoreNotFound(err) != nil { + metrics.RecordReconciliationError(metrics.ShardResourceType, err.Error()) return ctrl.Result{}, fmt.Errorf("failed to get shard: %w", err) } @@ -108,6 +117,14 @@ func (r *ShardReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res recErr = kerrors.NewAggregate([]error{recErr, err}) } + metrics.RecordObjectMetrics( + metrics.ShardResourceType, + s.Name, + req.Namespace, + string(s.Status.Phase), + s.Status.Conditions, + ) + return ctrl.Result{}, recErr } diff --git a/internal/metrics/collector.go b/internal/metrics/collector.go new file mode 100644 index 00000000..7fbfbe5b --- /dev/null +++ b/internal/metrics/collector.go @@ -0,0 +1,181 @@ +/* +Copyright 2025 The kcp Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "time" + + ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client" + + operatorv1alpha1 "github.com/kcp-dev/kcp-operator/sdk/apis/operator/v1alpha1" +) + +const ( + UnknownPhase = "Unknown" +) + +type MetricsCollector struct { + client ctrlruntimeclient.Client +} + +func NewMetricsCollector(client ctrlruntimeclient.Client) *MetricsCollector { + return &MetricsCollector{ + client: client, + } +} + +func (mc *MetricsCollector) Start(ctx context.Context) { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + mc.updateObjectCounts(ctx) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + mc.updateObjectCounts(ctx) + } + } +} + +func (mc *MetricsCollector) updateObjectCounts(ctx context.Context) { + mc.updateRootShardCounts(ctx) + mc.updateShardCounts(ctx) + mc.updateFrontProxyCounts(ctx) + mc.updateCacheServerCounts(ctx) + mc.updateKubeconfigCounts(ctx) +} + +func (mc *MetricsCollector) updateRootShardCounts(ctx context.Context) { + var rootShards operatorv1alpha1.RootShardList + if err := mc.client.List(ctx, &rootShards); err != nil { + return + } + + RootShardCount.Reset() + + phaseCounts := make(map[string]map[string]int) + for _, rs := range rootShards.Items { + phase := string(rs.Status.Phase) + if phase == "" { + phase = UnknownPhase + } + if phaseCounts[phase] == nil { + phaseCounts[phase] = make(map[string]int) + } + phaseCounts[phase][rs.Namespace]++ + } + + for phase, namespaceCounts := range phaseCounts { + for namespace, count := range namespaceCounts { + RootShardCount.WithLabelValues(phase, namespace).Set(float64(count)) + } + } +} + +func (mc *MetricsCollector) updateShardCounts(ctx context.Context) { + var shards operatorv1alpha1.ShardList + if err := mc.client.List(ctx, &shards); err != nil { + return + } + + ShardCount.Reset() + + phaseCounts := make(map[string]map[string]int) + for _, s := range shards.Items { + phase := string(s.Status.Phase) + if phase == "" { + phase = UnknownPhase + } + if phaseCounts[phase] == nil { + phaseCounts[phase] = make(map[string]int) + } + phaseCounts[phase][s.Namespace]++ + } + + for phase, namespaceCounts := range phaseCounts { + for namespace, count := range namespaceCounts { + ShardCount.WithLabelValues(phase, namespace).Set(float64(count)) + } + } +} + +func (mc *MetricsCollector) updateFrontProxyCounts(ctx context.Context) { + var frontProxies operatorv1alpha1.FrontProxyList + if err := mc.client.List(ctx, &frontProxies); err != nil { + return + } + + FrontProxyCount.Reset() + + phaseCounts := make(map[string]map[string]int) + for _, fp := range frontProxies.Items { + phase := string(fp.Status.Phase) + if phase == "" { + phase = UnknownPhase + } + if phaseCounts[phase] == nil { + phaseCounts[phase] = make(map[string]int) + } + phaseCounts[phase][fp.Namespace]++ + } + + for phase, namespaceCounts := range phaseCounts { + for namespace, count := range namespaceCounts { + FrontProxyCount.WithLabelValues(phase, namespace).Set(float64(count)) + } + } +} + +func (mc *MetricsCollector) updateCacheServerCounts(ctx context.Context) { + var cacheServers operatorv1alpha1.CacheServerList + if err := mc.client.List(ctx, &cacheServers); err != nil { + return + } + + CacheServerCount.Reset() + + namespaceCounts := make(map[string]int) + for _, cs := range cacheServers.Items { + namespaceCounts[cs.Namespace]++ + } + + for namespace, count := range namespaceCounts { + CacheServerCount.WithLabelValues(namespace).Set(float64(count)) + } +} + +func (mc *MetricsCollector) updateKubeconfigCounts(ctx context.Context) { + var kubeconfigs operatorv1alpha1.KubeconfigList + if err := mc.client.List(ctx, &kubeconfigs); err != nil { + return + } + + KubeconfigCount.Reset() + + namespaceCounts := make(map[string]int) + for _, kc := range kubeconfigs.Items { + namespaceCounts[kc.Namespace]++ + } + + for namespace, count := range namespaceCounts { + KubeconfigCount.WithLabelValues(namespace).Set(float64(count)) + } +} diff --git a/internal/metrics/helpers.go b/internal/metrics/helpers.go new file mode 100644 index 00000000..51923c50 --- /dev/null +++ b/internal/metrics/helpers.go @@ -0,0 +1,83 @@ +/* +Copyright 2025 The kcp Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + RootShardResourceType = "rootshard" + ShardResourceType = "shard" + FrontProxyResourceType = "frontproxy" + CacheServerResourceType = "cacheserver" + KubeconfigResourceType = "kubeconfig" +) + +// RecordObjectMetrics records metrics for a Kubernetes object with phase and conditions +func RecordObjectMetrics(resourceType, resourceName, namespace, phase string, conditions []metav1.Condition) { + if phase == "" { + phase = "Unknown" + } + + switch resourceType { + case RootShardResourceType: + RootShardCount.WithLabelValues(phase, namespace).Set(1) + case ShardResourceType: + ShardCount.WithLabelValues(phase, namespace).Set(1) + case FrontProxyResourceType: + FrontProxyCount.WithLabelValues(phase, namespace).Set(1) + case CacheServerResourceType: + CacheServerCount.WithLabelValues(namespace).Set(1) + case KubeconfigResourceType: + KubeconfigCount.WithLabelValues(namespace).Set(1) + } + + for _, condition := range conditions { + status := 0.0 + switch condition.Status { + case metav1.ConditionTrue: + status = 1.0 + case metav1.ConditionFalse: + status = 0.0 + case metav1.ConditionUnknown: + status = -1.0 + } + + ConditionStatus.WithLabelValues( + resourceType, + resourceName, + namespace, + condition.Type, + ).Set(status) + } +} + +// RecordReconciliationMetrics records reconciliation duration and error metrics +func RecordReconciliationMetrics(controller string, duration float64, err error) { + result := "success" + if err != nil { + result = "error" + ReconciliationErrors.WithLabelValues(controller, "reconcile_error").Inc() + } + ReconciliationDuration.WithLabelValues(controller, result).Observe(duration) +} + +// RecordReconciliationError records a specific reconciliation error +func RecordReconciliationError(controller, errorType string) { + ReconciliationErrors.WithLabelValues(controller, errorType).Inc() +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 00000000..68551d31 --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,121 @@ +/* +Copyright 2025 The kcp Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + + ctrlruntimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // RootShardCount tracks the number of RootShard objects by their current phase. + // Labels: phase (Provisioning|Running|Deleting), namespace + RootShardCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_rootshard_count", + Help: "Number of RootShard objects by phase", + }, + []string{"phase", "namespace"}, + ) + + // ShardCount tracks the number of Shard objects by their current phase. + // Labels: phase (Provisioning|Running|Deleting), namespace + ShardCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_shard_count", + Help: "Number of Shard objects by phase", + }, + []string{"phase", "namespace"}, + ) + + // FrontProxyCount tracks the number of FrontProxy objects by their current phase. + // Labels: phase (Provisioning|Running|Deleting), namespace + FrontProxyCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_frontproxy_count", + Help: "Number of FrontProxy objects by phase", + }, + []string{"phase", "namespace"}, + ) + + // CacheServerCount tracks the number of CacheServer objects by namespace. + // Labels: namespace + CacheServerCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_cacheserver_count", + Help: "Number of CacheServer objects by namespace", + }, + []string{"namespace"}, + ) + + // KubeconfigCount tracks the number of Kubeconfig objects by namespace. + // Labels: namespace + KubeconfigCount = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_kubeconfig_count", + Help: "Number of Kubeconfig objects by namespace", + }, + []string{"namespace"}, + ) + + // ReconciliationDuration measures the time taken to reconcile kcp operator resources. + // Labels: controller (rootshard|shard|frontproxy|kubeconfig|cacheserver), result (success|error) + ReconciliationDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "kcp_operator_reconciliation_duration_seconds", + Help: "Time taken to reconcile objects", + Buckets: prometheus.DefBuckets, + }, + []string{"controller", "result"}, + ) + + // ReconciliationErrors counts the total number of reconciliation errors by controller and error type. + // Labels: controller (rootshard|shard|frontproxy|kubeconfig|cacheserver), error_type + ReconciliationErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "kcp_operator_reconciliation_errors_total", + Help: "Total number of reconciliation errors", + }, + []string{"controller", "error_type"}, + ) + + // ConditionStatus tracks the status of conditions on kcp operator resources. + // Values: 1.0 (True), 0.0 (False), -1.0 (Unknown) + // Labels: resource_type (rootshard|shard|frontproxy|cacheserver|kubeconfig), + // resource_name, namespace, condition_type (Available|RootShard) + ConditionStatus = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "kcp_operator_condition_status", + Help: "Status of conditions", + }, + []string{"resource_type", "resource_name", "namespace", "condition_type"}, + ) +) + +func RegisterMetrics() { + ctrlruntimemetrics.Registry.MustRegister( + RootShardCount, + ShardCount, + FrontProxyCount, + CacheServerCount, + KubeconfigCount, + ReconciliationDuration, + ReconciliationErrors, + ConditionStatus, + ) +}