2 changes: 2 additions & 0 deletions bundle/manifests/kepler-operator.clusterserviceversion.yaml
@@ -225,6 +225,8 @@ spec:
- args:
- --openshift
- --deployment-namespace=power-monitor
- --exp.reconciler.token.refresh-interval=24h
- --exp.uwm.token.ttl=168h
- --leader-elect
- --kepler.image=$(RELATED_IMAGE_KEPLER)
- --kube-rbac-proxy.image=$(RELATED_IMAGE_KUBE_RBAC_PROXY)
20 changes: 20 additions & 0 deletions cmd/main.go
@@ -9,6 +9,7 @@ import (
"fmt"
"os"
"strings"
"time"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
@@ -71,6 +72,8 @@ func main() {
var enableHTTP2 bool
var tlsOpts []func(*tls.Config)
var additionalNamespaces stringList
var tokenRefreshInterval time.Duration
var tokenTTL time.Duration

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to."+
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
@@ -92,6 +95,12 @@ func main() {
flag.BoolVar(&openshift, "openshift", false,
"Indicate if the operator is running on an OpenShift cluster.")

flag.DurationVar(&tokenRefreshInterval, "exp.reconciler.token.refresh-interval", controller.Config.TokenRefreshInterval,
"Interval at which the token expiry reconciler requeues for reconciliation.")

flag.DurationVar(&tokenTTL, "exp.uwm.token.ttl", controller.Config.TokenTTL,
"Time-to-live duration for user workload monitoring tokens.")

// NOTE: RELATED_IMAGE_KEPLER can be set as env or flag, flag takes precedence over env
keplerImage := os.Getenv("RELATED_IMAGE_KEPLER")
flag.StringVar(&controller.Config.Image, "kepler.image", keplerImage, "kepler image")
@@ -136,6 +145,10 @@ func main() {
}
}

controller.Config.TokenRefreshInterval = tokenRefreshInterval
controller.Config.TokenTTL = tokenTTL
powermonitor.TokenTTL = tokenTTL

// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
// More info:
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
@@ -200,6 +213,13 @@ func main() {
os.Exit(1)
}

if err = (&controller.TokenExpiryReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "token-expiry")
os.Exit(1)
}
if err = (&controller.PowerMonitorReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
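Note: the two new flags take Go duration strings, so values such as 24h and 168h parse directly into time.Duration. A minimal, self-contained sketch of the same pattern (flag names and defaults copied from this PR; the surrounding wiring is illustrative, not the operator's actual main.go):

package main

import (
	"flag"
	"fmt"
	"time"
)

// Shared defaults, mirroring internal/controller/config.go in this PR.
var cfg = struct {
	TokenRefreshInterval time.Duration
	TokenTTL             time.Duration
}{
	TokenRefreshInterval: 24 * time.Hour,  // requeue the token-expiry reconciler daily
	TokenTTL:             168 * time.Hour, // 7-day UWM token lifetime
}

func main() {
	// flag.DurationVar parses strings such as "24h", "90m", or "168h".
	flag.DurationVar(&cfg.TokenRefreshInterval, "exp.reconciler.token.refresh-interval",
		cfg.TokenRefreshInterval, "Interval at which the token expiry reconciler requeues.")
	flag.DurationVar(&cfg.TokenTTL, "exp.uwm.token.ttl",
		cfg.TokenTTL, "Time-to-live for user workload monitoring tokens.")
	flag.Parse()

	fmt.Printf("refresh=%s ttl=%s\n", cfg.TokenRefreshInterval, cfg.TokenTTL)
}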
6 changes: 6 additions & 0 deletions config/manager/overlays/openshift/kustomization.yaml
@@ -15,3 +15,9 @@ patches:
- op: add
path: /spec/template/spec/containers/0/args/1
value: --deployment-namespace=power-monitor
- op: add
path: /spec/template/spec/containers/0/args/2
value: --exp.reconciler.token.refresh-interval=24h
- op: add
path: /spec/template/spec/containers/0/args/3
value: --exp.uwm.token.ttl=168h
@@ -98,6 +98,8 @@ spec:
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
- --leader-elect
- --exp.reconciler.token.refresh-interval=24h
- --exp.uwm.token.ttl=168h
command:
- /manager
image: quay.io/sustainable_computing_io/kepler-operator:latest
22 changes: 15 additions & 7 deletions internal/controller/config.go
@@ -3,17 +3,25 @@

package controller

import "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
import (
"time"

"github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
)

// Config holds configuration shared across all controllers. This struct
// should be initialized in main

var Config = struct {
KubeRbacProxyImage string
Image string
Cluster k8s.Cluster
KubeRbacProxyImage string
Image string
Cluster k8s.Cluster
TokenRefreshInterval time.Duration
TokenTTL time.Duration
}{
KubeRbacProxyImage: "quay.io/brancz/kube-rbac-proxy:v0.19.0",
Image: "",
Cluster: k8s.Kubernetes,
KubeRbacProxyImage: "quay.io/brancz/kube-rbac-proxy:v0.19.0",
Image: "",
Cluster: k8s.Kubernetes,
TokenRefreshInterval: 24 * time.Hour,
TokenTTL: 168 * time.Hour,
}
7 changes: 7 additions & 0 deletions internal/controller/power_monitor_internal.go
@@ -6,6 +6,7 @@ package controller
import (
"context"
"fmt"

"slices"
"time"

@@ -30,6 +31,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"

monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
@@ -134,6 +136,7 @@ func (r *PowerMonitorInternalReconciler) SetupWithManager(mgr ctrl.Manager) erro
Owns(&corev1.ServiceAccount{}, genChanged).
Owns(&corev1.Service{}, genChanged).
Owns(&appsv1.DaemonSet{}, resVerChanged).
Owns(&monv1.ServiceMonitor{}, genChanged).
Owns(&rbacv1.ClusterRoleBinding{}, genChanged).
Owns(&rbacv1.ClusterRole{}, genChanged).
// NOTE: requires resVerChanged for ConfigMap & Secret since
@@ -476,6 +479,8 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
fmt.Sprintf("%s:%s", powermonitor.UWMNamespace, powermonitor.UWMServiceAccountName),
)

sm := powermonitor.NewPowerMonitorServiceMonitor(components.Full, pmi)

// cluster-scoped resources first
// update cluster role before cluster role binding
rs := resourceReconcilers(updateResource,
@@ -504,6 +509,7 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
Pmi: pmi,
Cluster: cluster,
Ds: ds,
Sm: sm,
EnableRBAC: enableRBAC,
EnableUWM: enableUWM,
},
@@ -516,6 +522,7 @@ func powerMonitorExporters(pmi *v1alpha1.PowerMonitorInternal, ds *appsv1.Daemon
rs = append(rs,
reconciler.PowerMonitorServiceMonitorReconciler{
Pmi: pmi,
Sm: sm,
EnableRBAC: enableRBAC,
EnableUWM: enableUWM,
},
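Note: the new Owns(&monv1.ServiceMonitor{}, genChanged) line makes the controller watch the ServiceMonitor it creates. Assuming genChanged wraps controller-runtime's GenerationChangedPredicate (the variable is defined outside this diff), only spec edits, which bump metadata.generation, trigger another reconcile; status-only updates are filtered out. A small standalone illustration of that predicate:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)

func main() {
	p := predicate.GenerationChangedPredicate{}

	old := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Generation: 1}}
	specEdit := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Generation: 2}}
	statusOnly := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Generation: 1}}

	// Generation bumped -> event passes the predicate and triggers a reconcile.
	fmt.Println(p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: specEdit})) // true
	// Same generation (e.g. a status-only update) -> filtered out.
	fmt.Println(p.Update(event.UpdateEvent{ObjectOld: old, ObjectNew: statusOnly})) // false
}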
147 changes: 147 additions & 0 deletions internal/controller/token_expiry.go
@@ -0,0 +1,147 @@
// SPDX-FileCopyrightText: 2025 The Kepler Authors
// SPDX-License-Identifier: Apache-2.0

package controller

import (
"context"
"time"

"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"

powermonitor "github.com/sustainable.computing.io/kepler-operator/pkg/components/power-monitor"
"github.com/sustainable.computing.io/kepler-operator/pkg/reconciler"

"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/event"

corev1 "k8s.io/api/core/v1"

ctrl "sigs.k8s.io/controller-runtime"
)

type TokenExpiryReconciler struct {
client.Client
Scheme *runtime.Scheme
logger logr.Logger
}

// RBAC for TokenExpiryReconciler
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;delete

// SetupWithManager sets up the controller with the Manager.
func (r *TokenExpiryReconciler) SetupWithManager(mgr ctrl.Manager) error {
secretPredicate := builder.WithPredicates(predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
return r.inPowerMonitorNamespace(e.Object) && r.isPrometheusUserWorkloadToken(e.Object)
},
UpdateFunc: func(e event.UpdateEvent) bool {
return r.inPowerMonitorNamespace(e.ObjectNew) && r.isPrometheusUserWorkloadToken(e.ObjectNew)
},
DeleteFunc: func(e event.DeleteEvent) bool {
return false
},
GenericFunc: func(e event.GenericEvent) bool {
return r.inPowerMonitorNamespace(e.Object) && r.isPrometheusUserWorkloadToken(e.Object)
},
})

return ctrl.NewControllerManagedBy(mgr).
For(&corev1.Secret{}, secretPredicate).
Complete(r)
}

// inPowerMonitorNamespace checks if object is in the PowerMonitorDeploymentNS namespace
func (r *TokenExpiryReconciler) inPowerMonitorNamespace(obj client.Object) bool {
return obj.GetNamespace() == PowerMonitorDeploymentNS
}

// isPrometheusUserWorkloadToken checks if the secret is the prometheus-user-workload-token
func (r *TokenExpiryReconciler) isPrometheusUserWorkloadToken(obj client.Object) bool {
return obj.GetName() == powermonitor.SecretUWMTokenName
}

// hasExpirationAnnotation checks if the secret has an expiration annotation
func (r *TokenExpiryReconciler) hasExpirationAnnotation(obj client.Object) bool {
secret, ok := obj.(*corev1.Secret)
if !ok {
return false
}

annotations := secret.GetAnnotations()
if annotations == nil {
return false
}

_, exists := annotations[powermonitor.SecretTokenExpirationAnnotation]
return exists
}

// deleteResources is a helper function that creates and runs deleter reconcilers for the given resources
func (r *TokenExpiryReconciler) deleteResources(ctx context.Context, resources ...client.Object) (ctrl.Result, error) {
reconcilers := resourceReconcilers(deleteResource, resources...)

return reconciler.Runner{
Reconcilers: reconcilers,
Client: r.Client,
Scheme: r.Scheme,
Logger: r.logger,
}.Run(ctx)
}

func (r *TokenExpiryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)
r.logger = logger

logger.Info("Start of reconcile")
defer logger.Info("End of reconcile")

secret := &corev1.Secret{}
err := r.Get(ctx, req.NamespacedName, secret)

[GitHub Actions / golangci — check failure on line 105 of internal/controller/token_expiry.go: r.Get undefined (type *TokenExpiryReconciler has no field or method Get) (typecheck)]
if err != nil {
if errors.IsNotFound(err) {
r.logger.Info("secret not found, continue without error")
return ctrl.Result{}, nil
}
r.logger.Error(err, "failed to retrieve secret")
return ctrl.Result{}, err
}

if !r.hasExpirationAnnotation(secret) {
r.logger.Info("prometheus-user-workload-token does not have expiration annotation, deleting it")
return r.deleteResources(ctx, secret)
}

expired, expirationTime, err := r.isSecretExpired(secret)
if err != nil {
r.logger.Error(err, "failed to extract expiration time")
return ctrl.Result{RequeueAfter: time.Minute * 5}, nil
}

if expired {
r.logger.Info("secret has expired, reconciling", "expiration-time", expirationTime)
return r.deleteResources(ctx, secret)
}

timeUntilExpiration := time.Until(expirationTime)
r.logger.Info("secret not expired yet, requeuing", "expiration-time", expirationTime, "time-until-expiration", timeUntilExpiration)

return ctrl.Result{RequeueAfter: Config.TokenRefreshInterval}, nil
}

// isSecretExpired checks if the secret has expired according to the expiration annotation
func (r *TokenExpiryReconciler) isSecretExpired(secret *corev1.Secret) (bool, time.Time, error) {
expirationTime, err := powermonitor.GetExpirationFromAnnotation(&secret.ObjectMeta, powermonitor.SecretTokenExpirationAnnotation)
if err != nil {
return false, time.Time{}, err
}
if expirationTime == nil {
return false, time.Time{}, nil
}
return time.Now().After(expirationTime.Add(-(Config.TokenRefreshInterval * 2))), *expirationTime, nil
}
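Note: powermonitor.GetExpirationFromAnnotation is defined outside this diff. A hedged sketch of how such a helper could behave, under the assumption that the annotation stores an RFC3339 timestamp (the key name below is a placeholder; the real one is powermonitor.SecretTokenExpirationAnnotation), together with the two-refresh-interval early-renewal check used by isSecretExpired above:

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Placeholder annotation key, for illustration only.
const exampleExpirationKey = "example.io/token-expiration"

// getExpirationFromAnnotation is an assumed shape for the helper: it returns
// nil when the annotation is absent and an error when the value is not RFC3339.
func getExpirationFromAnnotation(meta *metav1.ObjectMeta, key string) (*time.Time, error) {
	val, ok := meta.GetAnnotations()[key]
	if !ok {
		return nil, nil
	}
	t, err := time.Parse(time.RFC3339, val)
	if err != nil {
		return nil, fmt.Errorf("invalid expiration annotation %q: %w", val, err)
	}
	return &t, nil
}

func main() {
	refresh := 24 * time.Hour
	meta := &metav1.ObjectMeta{Annotations: map[string]string{
		exampleExpirationKey: time.Now().Add(168 * time.Hour).Format(time.RFC3339),
	}}

	exp, err := getExpirationFromAnnotation(meta, exampleExpirationKey)
	if err != nil || exp == nil {
		fmt.Println("no usable expiration:", err)
		return
	}
	// Same rule as isSecretExpired: treat the token as expired once we are
	// within two refresh intervals of the annotated expiration time.
	expired := time.Now().After(exp.Add(-2 * refresh))
	fmt.Println("expired:", expired, "expires at:", exp)
}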