diff --git a/.github/workflows/pr-golangci-lint.yaml b/.github/workflows/pr-golangci-lint.yaml index 0ae33198bb7e..5409f60ab4f6 100644 --- a/.github/workflows/pr-golangci-lint.yaml +++ b/.github/workflows/pr-golangci-lint.yaml @@ -11,6 +11,10 @@ jobs: golangci: name: lint runs-on: ubuntu-latest + env: + # Required with Go 1.24 to enable usage of synctest in unit tests + # Can be removed after we bumped to Go 1.25. + GOEXPERIMENT: synctest strategy: fail-fast: false matrix: diff --git a/.golangci.yml b/.golangci.yml index d36de6d69989..4d98044ecad1 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -20,6 +20,7 @@ linters: - durationcheck # multiplying two durations - errcheck # unchecked errors - errchkjson # invalid types passed to json encoder + - forbidigo # allows to block usage of funcs - ginkgolinter # ginkgo and gomega - gocritic # bugs, performance, style (we could add custom ones to this one) - godot # checks that comments end in a period @@ -52,6 +53,10 @@ linters: # TODO: It will be dropped when the Go version migration is done. - usetesting settings: + forbidigo: + forbid: + - pattern: ctrl.NewControllerManagedBy + msg: Use capicontrollerutil.NewControllerManagedBy instead ginkgolinter: forbid-focus-container: true gocritic: @@ -180,6 +185,8 @@ linters: alias: topologynames - pkg: sigs.k8s.io/cluster-api/internal/util/client alias: "clientutil" + - pkg: sigs.k8s.io/cluster-api/internal/util/controller + alias: "capicontrollerutil" # CAPD - pkg: sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1alpha3 alias: infrav1alpha3 diff --git a/Makefile b/Makefile index 5c50caff88fb..e8436cc95c96 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,10 @@ GO_CONTAINER_IMAGE ?= docker.io/library/golang:$(GO_VERSION) GOTOOLCHAIN = go$(GO_VERSION) export GOTOOLCHAIN +# Required with Go 1.24 to enable usage of synctest in unit tests +# Can be removed after we bumped to Go 1.25. 
+export GOEXPERIMENT=synctest + # Use GOPROXY environment variable if set GOPROXY := $(shell go env GOPROXY) ifeq ($(GOPROXY),) diff --git a/bootstrap/kubeadm/config/manager/manager.yaml b/bootstrap/kubeadm/config/manager/manager.yaml index d3bc3688efdb..e874de005b08 100644 --- a/bootstrap/kubeadm/config/manager/manager.yaml +++ b/bootstrap/kubeadm/config/manager/manager.yaml @@ -22,7 +22,7 @@ spec: - "--leader-elect" - "--diagnostics-address=${CAPI_DIAGNOSTICS_ADDRESS:=:8443}" - "--insecure-diagnostics=${CAPI_INSECURE_DIAGNOSTICS:=false}" - - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},KubeadmBootstrapFormatIgnition=${EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false}" + - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},KubeadmBootstrapFormatIgnition=${EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},ReconcilerRateLimiting=${EXP_RECONCILER_RATE_LIMITING:=false}" - "--bootstrap-token-ttl=${KUBEADM_BOOTSTRAP_TOKEN_TTL:=15m}" image: controller:latest name: manager diff --git a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go index 607dec24c378..d01545898b27 100644 --- a/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go +++ b/bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go @@ -37,7 +37,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -53,6 +52,7 @@ import ( bsutil "sigs.k8s.io/cluster-api/bootstrap/util" "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/feature" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/taints" 
"sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -116,33 +116,26 @@ func (r *KubeadmConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "kubeadmconfig") - b := ctrl.NewControllerManagedBy(mgr). + b := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&bootstrapv1.KubeadmConfig{}). WithOptions(options). Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(r.MachineToBootstrapMapFunc), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ).WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)) if feature.Gates.Enabled(feature.MachinePool) { b = b.Watches( &clusterv1.MachinePool{}, handler.EnqueueRequestsFromMapFunc(r.MachinePoolToBootstrapMapFunc), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ) } b = b.Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.ClusterToKubeadmConfigs), - builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), - ), - ), + predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ).WatchesRawSource(r.ClusterCache.GetClusterSource("kubeadmconfig", r.ClusterToKubeadmConfigs)) if err := b.Complete(r); err != nil { @@ -268,12 +261,6 @@ func (r *KubeadmConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques if err := patchHelper.Patch(ctx, config, patchOpts...); err != nil { rerr = kerrors.NewAggregate([]error{rerr, err}) } - - // Note: controller-runtime logs a warning that non-empty 
result is ignored - // if error is not nil, so setting result here to empty to avoid noisy warnings. - if rerr != nil { - retRes = ctrl.Result{} - } }() // Ignore deleted KubeadmConfigs. diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 4d46daf4c9e0..f14cbd505132 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -23,7 +23,7 @@ spec: - "--leader-elect" - "--diagnostics-address=${CAPI_DIAGNOSTICS_ADDRESS:=:8443}" - "--insecure-diagnostics=${CAPI_INSECURE_DIAGNOSTICS:=false}" - - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false},MachineSetPreflightChecks=${EXP_MACHINE_SET_PREFLIGHT_CHECKS:=true},MachineWaitForVolumeDetachConsiderVolumeAttachments=${EXP_MACHINE_WAITFORVOLUMEDETACH_CONSIDER_VOLUMEATTACHMENTS:=true},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},InPlaceUpdates=${EXP_IN_PLACE_UPDATES:=false},MachineTaintPropagation=${EXP_MACHINE_TAINT_PROPAGATION:=false}" + - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false},MachineSetPreflightChecks=${EXP_MACHINE_SET_PREFLIGHT_CHECKS:=true},MachineWaitForVolumeDetachConsiderVolumeAttachments=${EXP_MACHINE_WAITFORVOLUMEDETACH_CONSIDER_VOLUMEATTACHMENTS:=true},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},ReconcilerRateLimiting=${EXP_RECONCILER_RATE_LIMITING:=false},InPlaceUpdates=${EXP_IN_PLACE_UPDATES:=false},MachineTaintPropagation=${EXP_MACHINE_TAINT_PROPAGATION:=false}" image: controller:latest name: manager env: diff --git a/controllers/clustercache/cluster_cache.go b/controllers/clustercache/cluster_cache.go index 2f4cbcad0986..cf4d20ffafac 100644 --- a/controllers/clustercache/cluster_cache.go +++ b/controllers/clustercache/cluster_cache.go @@ -44,6 +44,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + capicontrollerutil 
"sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util/predicates" ) @@ -328,7 +329,8 @@ func SetupWithManager(ctx context.Context, mgr manager.Manager, options Options, cacheCtxCancel: cacheCtxCancel, } - err := ctrl.NewControllerManagedBy(mgr). + predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "clustercache") + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). Named("clustercache"). For(&clusterv1.Cluster{}). WithOptions(controllerOptions). diff --git a/controllers/crdmigrator/crd_migrator.go b/controllers/crdmigrator/crd_migrator.go index b5238dd6c1aa..e56110fd0ba8 100644 --- a/controllers/crdmigrator/crd_migrator.go +++ b/controllers/crdmigrator/crd_migrator.go @@ -45,6 +45,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/contract" "sigs.k8s.io/cluster-api/util/predicates" @@ -121,7 +122,7 @@ func (r *CRDMigrator) SetupWithManager(ctx context.Context, mgr ctrl.Manager, co } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "crdmigrator") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&apiextensionsv1.CustomResourceDefinition{}, // This controller uses a PartialObjectMetadata watch/informer to avoid an informer for CRDs // to reduce memory usage. 
diff --git a/controlplane/kubeadm/config/manager/manager.yaml b/controlplane/kubeadm/config/manager/manager.yaml index 8b0dc6215be2..096ac647011a 100644 --- a/controlplane/kubeadm/config/manager/manager.yaml +++ b/controlplane/kubeadm/config/manager/manager.yaml @@ -22,7 +22,7 @@ spec: - "--leader-elect" - "--diagnostics-address=${CAPI_DIAGNOSTICS_ADDRESS:=:8443}" - "--insecure-diagnostics=${CAPI_INSECURE_DIAGNOSTICS:=false}" - - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},KubeadmBootstrapFormatIgnition=${EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},InPlaceUpdates=${EXP_IN_PLACE_UPDATES:=false}" + - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},KubeadmBootstrapFormatIgnition=${EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},ReconcilerRateLimiting=${EXP_RECONCILER_RATE_LIMITING:=false},InPlaceUpdates=${EXP_IN_PLACE_UPDATES:=false}" image: controller:latest name: manager env: diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index ac27a96c58fd..f879cb34ea53 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -36,7 +36,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -49,6 +48,7 @@ import ( "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client" "sigs.k8s.io/cluster-api/feature" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/inplace" 
"sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" @@ -85,7 +85,7 @@ type KubeadmControlPlaneReconciler struct { APIReader client.Reader SecretCachingClient client.Client RuntimeClient runtimeclient.Client - controller controller.Controller + controller capicontrollerutil.Controller recorder record.EventRecorder ClusterCache clustercache.ClusterCache @@ -132,23 +132,18 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mg } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "kubeadmcontrolplane") - c, err := ctrl.NewControllerManagedBy(mgr). + c, err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&controlplanev1.KubeadmControlPlane{}). - Owns(&clusterv1.Machine{}, builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog))). + Owns(&clusterv1.Machine{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.ClusterToKubeadmControlPlane), - builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), - predicates.Any(mgr.GetScheme(), predicateLog, - predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), - predicates.ClusterTopologyVersionChanged(mgr.GetScheme(), predicateLog), - ), - ), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), + predicates.Any(mgr.GetScheme(), predicateLog, + predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), + predicates.ClusterTopologyVersionChanged(mgr.GetScheme(), predicateLog), ), ). 
WatchesRawSource(r.ClusterCache.GetClusterSource("kubeadmcontrolplane", r.ClusterToKubeadmControlPlane, @@ -283,12 +278,6 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl. res = ctrl.Result{RequeueAfter: 20 * time.Second} } } - - // Note: controller-runtime logs a warning that non-empty result is ignored - // if error is not nil, so setting result here to empty to avoid noisy warnings. - if reterr != nil { - res = ctrl.Result{} - } }() if !kcp.DeletionTimestamp.IsZero() { diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 4c310a12e18d..e9301b199527 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strings" + "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -190,6 +191,9 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, con } log.Info(fmt.Sprintf("Waiting for a version upgrade to %s to be propagated", v)) controlPlane.PreflightCheckResults.TopologyVersionMismatch = true + // Slow down reconcile frequency, as deferring a version upgrade waits for slow processes, + // e.g. workers are completing a previous upgrade step. + r.controller.DeferNextReconcileForObject(controlPlane.KCP, time.Now().Add(5*time.Second)) return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter} } } @@ -198,6 +202,8 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, con if controlPlane.HasDeletingMachine() { controlPlane.PreflightCheckResults.HasDeletingMachine = true log.Info("Waiting for machines to be deleted", "machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", ")) + // Slow down reconcile frequency, deletion is a slow process. 
+ r.controller.DeferNextReconcileForObject(controlPlane.KCP, time.Now().Add(5*time.Second)) return ctrl.Result{RequeueAfter: deleteRequeueAfter} } @@ -254,7 +260,10 @@ loopmachines: r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass preflight checks to continue reconciliation: %v", aggregatedError) log.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error()) - + // Slow down reconcile frequency, it takes some time before control plane components stabilize + // after a new Machine is created. Similarly, if there are issues on running Machines, it + // usually takes some time to get back to normal state. + r.controller.DeferNextReconcileForObject(controlPlane.KCP, time.Now().Add(5*time.Second)) return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter} } diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go index 84cfa119f654..ccd5bb1aa66a 100644 --- a/controlplane/kubeadm/internal/controllers/scale_test.go +++ b/controlplane/kubeadm/internal/controllers/scale_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2" controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" @@ -38,6 +39,7 @@ import ( "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/desiredstate" "sigs.k8s.io/cluster-api/feature" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/collections" ) @@ -218,9 +220,12 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { Workload: &fakeWorkloadCluster{}, } + fc := capicontrollerutil.NewFakeController() + 
r := &KubeadmControlPlaneReconciler{ Client: env, SecretCachingClient: secretCachingClient, + controller: fc, managementCluster: fmc, managementClusterUncached: fmc, recorder: record.NewFakeRecorder(32), @@ -233,6 +238,10 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) { result, err := r.scaleUpControlPlane(context.Background(), controlPlane) g.Expect(err).ToNot(HaveOccurred()) g.Expect(result).To(BeComparableTo(ctrl.Result{RequeueAfter: preflightFailedRequeueAfter})) + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(kcp)}, + BeTemporally("~", time.Now().Add(5*time.Second), 1*time.Second)), + ) // scaleUpControlPlane is never called due to health check failure and new machine is not created to scale up. controlPlaneMachines := &clusterv1.MachineList{} @@ -350,10 +359,13 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing. setMachineHealthy(machines["three"]) fakeClient := newFakeClient(machines["one"], machines["two"], machines["three"]) + fc := capicontrollerutil.NewFakeController() + r := &KubeadmControlPlaneReconciler{ recorder: record.NewFakeRecorder(32), Client: fakeClient, SecretCachingClient: fakeClient, + controller: fc, managementCluster: &fakeManagementCluster{ Workload: &fakeWorkloadCluster{}, }, @@ -373,6 +385,10 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing. 
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete) g.Expect(err).ToNot(HaveOccurred()) g.Expect(result).To(BeComparableTo(ctrl.Result{RequeueAfter: preflightFailedRequeueAfter})) + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(kcp)}, + BeTemporally("~", time.Now().Add(5*time.Second), 1*time.Second)), + ) controlPlaneMachines := clusterv1.MachineList{} g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed()) @@ -525,12 +541,13 @@ func TestSelectMachineForInPlaceUpdateOrScaleDown(t *testing.T) { func TestPreflightChecks(t *testing.T) { utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.ClusterTopology, true) testCases := []struct { - name string - cluster *clusterv1.Cluster - kcp *controlplanev1.KubeadmControlPlane - machines []*clusterv1.Machine - expectResult ctrl.Result - expectPreflight internal.PreflightCheckResults + name string + cluster *clusterv1.Cluster + kcp *controlplanev1.KubeadmControlPlane + machines []*clusterv1.Machine + expectResult ctrl.Result + expectPreflight internal.PreflightCheckResults + expectDeferNextReconcile time.Duration }{ { name: "control plane without machines (not initialized) should pass", @@ -568,6 +585,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: false, TopologyVersionMismatch: true, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control plane with a pending upgrade, but not yet at the current step of the upgrade plan, should requeue", @@ -599,6 +617,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: false, TopologyVersionMismatch: true, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control plane with a deleting machine should requeue", @@ -617,6 +636,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: false, TopologyVersionMismatch: false, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control 
plane without a nodeRef should requeue", @@ -636,6 +656,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: true, TopologyVersionMismatch: false, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control plane with an unhealthy machine condition should requeue", @@ -663,6 +684,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: false, TopologyVersionMismatch: false, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control plane with an unhealthy machine condition should requeue", @@ -690,6 +712,7 @@ func TestPreflightChecks(t *testing.T) { EtcdClusterNotHealthy: true, TopologyVersionMismatch: false, }, + expectDeferNextReconcile: 5 * time.Second, }, { name: "control plane with an healthy machine and an healthy kcp condition should pass", @@ -780,8 +803,11 @@ func TestPreflightChecks(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) + fc := capicontrollerutil.NewFakeController() + r := &KubeadmControlPlaneReconciler{ - recorder: record.NewFakeRecorder(32), + controller: fc, + recorder: record.NewFakeRecorder(32), } cluster := &clusterv1.Cluster{} if tt.cluster != nil { @@ -795,6 +821,14 @@ func TestPreflightChecks(t *testing.T) { result := r.preflightChecks(context.TODO(), controlPlane) g.Expect(result).To(BeComparableTo(tt.expectResult)) g.Expect(controlPlane.PreflightCheckResults).To(Equal(tt.expectPreflight)) + if tt.expectDeferNextReconcile == 0 { + g.Expect(fc.Deferrals).To(BeEmpty()) + } else { + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(tt.kcp)}, + BeTemporally("~", time.Now().Add(tt.expectDeferNextReconcile), 1*time.Second)), + ) + } }) } } diff --git a/controlplane/kubeadm/internal/controllers/update_test.go b/controlplane/kubeadm/internal/controllers/update_test.go index 08d981e375fc..96d9736cad98 100644 --- a/controlplane/kubeadm/internal/controllers/update_test.go +++ 
b/controlplane/kubeadm/internal/controllers/update_test.go @@ -33,6 +33,7 @@ import ( "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2" controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" @@ -40,6 +41,7 @@ import ( "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/desiredstate" "sigs.k8s.io/cluster-api/feature" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/collections" @@ -85,9 +87,12 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) { kcp.Spec.Replicas = ptr.To[int32](1) setKCPHealthy(kcp) + fc := capicontrollerutil.NewFakeController() + r := &KubeadmControlPlaneReconciler{ Client: env, SecretCachingClient: secretCachingClient, + controller: fc, recorder: record.NewFakeRecorder(32), managementCluster: &fakeManagementCluster{ Management: &internal.Management{Client: env}, @@ -158,6 +163,10 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) { result, err = r.reconcile(context.Background(), controlPlane) g.Expect(err).ToNot(HaveOccurred()) g.Expect(result).To(BeComparableTo(ctrl.Result{RequeueAfter: preflightFailedRequeueAfter})) + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(kcp)}, + BeTemporally("~", time.Now().Add(5*time.Second), 1*time.Second)), + ) g.Eventually(func(g Gomega) { g.Expect(env.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed()) g.Expect(bothMachines.Items).To(HaveLen(2)) diff --git a/docs/book/src/tasks/experimental-features/experimental-features.md 
b/docs/book/src/tasks/experimental-features/experimental-features.md index 15b20b97e618..650fd547cca6 100644 --- a/docs/book/src/tasks/experimental-features/experimental-features.md +++ b/docs/book/src/tasks/experimental-features/experimental-features.md @@ -4,19 +4,20 @@ Cluster API now ships with a new experimental package that lives under the `exp/ temporary location for features which will be moved to their permanent locations after graduation. Users can experiment with these features by enabling them using feature gates. Currently Cluster API has the following experimental features: +* `ClusterTopology` (env var: `CLUSTER_TOPOLOGY`): [ClusterClass](./cluster-class/index.md) +* `KubeadmBootstrapFormatIgnition` (env var: `EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION`): [Ignition](./ignition.md) * `MachinePool` (env var: `EXP_MACHINE_POOL`): [MachinePools](./machine-pools.md) * `MachineSetPreflightChecks` (env var: `EXP_MACHINE_SET_PREFLIGHT_CHECKS`): [MachineSetPreflightChecks](./machineset-preflight-checks.md) -* `PriorityQueue` (env var: `EXP_PRIORITY_QUEUE`): Enables the usage of the controller-runtime PriorityQueue: https://github.com/kubernetes-sigs/controller-runtime/issues/2374 +* `MachineTaintPropagation` (env var: `EXP_MACHINE_TAINT_PROPAGATION`): + * Allows in-place propagation of taints to nodes using the taint fields within Machines, MachineSets, and MachineDeployments. + * In future this feature is planned to also cover topology clusters and KCP. See the proposal [Propagating taints from Cluster API to Nodes](https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20250513-propogate-taints.md) for more information. * `MachineWaitForVolumeDetachConsiderVolumeAttachments` (env var: `EXP_MACHINE_WAITFORVOLUMEDETACH_CONSIDER_VOLUMEATTACHMENTS`): * During Machine drain the Machine controller waits for volumes to be detached. Per default, the controller considers `Nodes.status.volumesAttached` and `VolumesAttachments`. 
This feature flag allows to opt-out from considering `VolumeAttachments`. The feature gate was added to allow to opt-out in case unforeseen issues occur with `VolumeAttachments`. -* `ClusterTopology` (env var: `CLUSTER_TOPOLOGY`): [ClusterClass](./cluster-class/index.md) +* `PriorityQueue` (env var: `EXP_PRIORITY_QUEUE`): Enables the usage of the controller-runtime PriorityQueue: https://github.com/kubernetes-sigs/controller-runtime/issues/2374 +* `ReconcilerRateLimiting` (env var: `EXP_RECONCILER_RATE_LIMITING`): Enables reconciler rate-limiting: https://github.com/kubernetes-sigs/cluster-api/issues/13005 * `RuntimeSDK` (env var: `EXP_RUNTIME_SDK`): [RuntimeSDK](./runtime-sdk/index.md) -* `KubeadmBootstrapFormatIgnition` (env var: `EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION`): [Ignition](./ignition.md) -* `MachineTaintPropagation` (env var: `EXP_MACHINE_TAINT_PROPAGATION`): - * Allows in-place propagation of taints to nodes using the taint fields within Machines, MachineSets, and MachineDeployments. - * In future this feature is planned to also cover topology clusters and KCP. See the proposal [Propagating taints from Cluster API to Nodes](https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20250513-propogate-taints.md) for more information. ## Enabling Experimental Features for Management Clusters Started with clusterctl diff --git a/exp/topology/desiredstate/desired_state.go b/exp/topology/desiredstate/desired_state.go index 528b223814dc..def45075e96e 100644 --- a/exp/topology/desiredstate/desired_state.go +++ b/exp/topology/desiredstate/desired_state.go @@ -53,6 +53,7 @@ import ( "sigs.k8s.io/cluster-api/internal/webhooks" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conversion" ) @@ -62,7 +63,7 @@ type Generator interface { } // NewGenerator creates a new generator to generate desired state. 
-func NewGenerator(client client.Client, clusterCache clustercache.ClusterCache, runtimeClient runtimeclient.Client) (Generator, error) { +func NewGenerator(client client.Client, clusterCache clustercache.ClusterCache, runtimeClient runtimeclient.Client, hookCache cache.Cache[cache.HookEntry], getUpgradePlanCache cache.Cache[GenerateUpgradePlanCacheEntry]) (Generator, error) { if client == nil || clusterCache == nil { return nil, errors.New("Client and ClusterCache must not be nil") } @@ -72,10 +73,12 @@ func NewGenerator(client client.Client, clusterCache clustercache.ClusterCache, } return &generator{ - Client: client, - ClusterCache: clusterCache, - RuntimeClient: runtimeClient, - patchEngine: patches.NewEngine(client, runtimeClient), + Client: client, + ClusterCache: clusterCache, + RuntimeClient: runtimeClient, + hookCache: hookCache, + getUpgradePlanCache: getUpgradePlanCache, + patchEngine: patches.NewEngine(client, runtimeClient), }, nil } @@ -88,6 +91,9 @@ type generator struct { RuntimeClient runtimeclient.Client + hookCache cache.Cache[cache.HookEntry] + getUpgradePlanCache cache.Cache[GenerateUpgradePlanCacheEntry] + // patchEngine is used to apply patches during computeDesiredState. patchEngine patches.Engine } @@ -121,7 +127,7 @@ func (g *generator) Generate(ctx context.Context, s *scope.Scope) (*scope.Cluste // Runtime extension takes precedence if defined. 
getUpgradePlan := GetUpgradePlanOneMinor if s.Blueprint.ClusterClass.Spec.Upgrade.External.GenerateUpgradePlanExtension != "" { - getUpgradePlan = GetUpgradePlanFromExtension(g.RuntimeClient, s.Current.Cluster, s.Blueprint.ClusterClass.Spec.Upgrade.External.GenerateUpgradePlanExtension) + getUpgradePlan = GetUpgradePlanFromExtension(g.RuntimeClient, g.getUpgradePlanCache, s.Current.Cluster, s.Blueprint.ClusterClass.Spec.Upgrade.External.GenerateUpgradePlanExtension) } else if len(s.Blueprint.ClusterClass.Spec.KubernetesVersions) > 0 { getUpgradePlan = GetUpgradePlanFromClusterClassVersions(s.Blueprint.ClusterClass.Spec.KubernetesVersions) } diff --git a/exp/topology/desiredstate/desired_state_test.go b/exp/topology/desiredstate/desired_state_test.go index 59249ee442d1..65ed6ef3cf78 100644 --- a/exp/topology/desiredstate/desired_state_test.go +++ b/exp/topology/desiredstate/desired_state_test.go @@ -54,6 +54,7 @@ import ( topologynames "sigs.k8s.io/cluster-api/internal/topology/names" "sigs.k8s.io/cluster-api/internal/topology/ownerrefs" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -1713,6 +1714,7 @@ func TestComputeControlPlaneVersion(t *testing.T) { r := &generator{ Client: fakeClient, RuntimeClient: runtimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), } version, err := r.computeControlPlaneVersion(ctx, s) if tt.wantErr { @@ -3947,6 +3949,8 @@ func TestGenerate(t *testing.T) { fakeClient, clusterCache, fakeRuntimeClient, + cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), + cache.New[GenerateUpgradePlanCacheEntry](10*time.Minute), ) g.Expect(err).ToNot(HaveOccurred()) diff --git a/exp/topology/desiredstate/lifecycle_hooks.go b/exp/topology/desiredstate/lifecycle_hooks.go index fcddbe23ea7d..0581856cfe98 100644 --- a/exp/topology/desiredstate/lifecycle_hooks.go +++ b/exp/topology/desiredstate/lifecycle_hooks.go @@ 
-21,6 +21,7 @@ import ( "fmt" "slices" "strings" + "time" "github.com/pkg/errors" ctrl "sigs.k8s.io/controller-runtime" @@ -31,6 +32,7 @@ import ( runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/internal/hooks" + "sigs.k8s.io/cluster-api/util/cache" ) // callBeforeClusterUpgradeHook calls the BeforeClusterUpgrade at the beginning of an upgrade. @@ -81,6 +83,14 @@ func (g *generator) callBeforeClusterUpgradeHook(ctx context.Context, s *scope.S return true, nil } + if cacheEntry, ok := g.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.BeforeClusterUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling BeforeClusterUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterUpgrade, cacheEntry.ToResponse(&runtimehooksv1.BeforeClusterUpgradeResponse{}, requeueAfter)) + return false, nil + } + } + v1beta1Cluster := &clusterv1beta1.Cluster{} // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -103,7 +113,8 @@ func (g *generator) callBeforeClusterUpgradeHook(ctx context.Context, s *scope.S if hookResponse.RetryAfterSeconds != 0 { // Cannot pickup the new version right now. Need to try again later. 
- log.Info(fmt.Sprintf("Cluster upgrade from version %s to version %s is blocked by %s hook", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeClusterUpgrade)), + g.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.BeforeClusterUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Cluster upgrade from version %s to version %s is blocked by %s hook, retry after %ds", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeClusterUpgrade), hookResponse.RetryAfterSeconds), "ControlPlaneUpgrades", hookRequest.ControlPlaneUpgrades, "WorkersUpgrades", hookRequest.WorkersUpgrades, ) @@ -134,6 +145,14 @@ func (g *generator) callBeforeControlPlaneUpgradeHook(ctx context.Context, s *sc return true, nil } + if cacheEntry, ok := g.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.BeforeControlPlaneUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling BeforeControlPlaneUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.BeforeControlPlaneUpgrade, cacheEntry.ToResponse(&runtimehooksv1.BeforeControlPlaneUpgradeResponse{}, requeueAfter)) + return false, nil + } + } + // NOTE: the hook should always be called before piking up a new version. v1beta1Cluster := &clusterv1beta1.Cluster{} // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. @@ -157,7 +176,8 @@ func (g *generator) callBeforeControlPlaneUpgradeHook(ctx context.Context, s *sc if hookResponse.RetryAfterSeconds != 0 { // Cannot pickup the new version right now. Need to try again later. 
- log.Info(fmt.Sprintf("Control plane upgrade from version %s to version %s is blocked by %s hook", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeControlPlaneUpgrade)), + g.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.BeforeControlPlaneUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Control plane upgrade from version %s to version %s is blocked by %s hook, retry after %ds", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeControlPlaneUpgrade), hookResponse.RetryAfterSeconds), "ControlPlaneUpgrades", hookRequest.ControlPlaneUpgrades, "WorkersUpgrades", hookRequest.WorkersUpgrades, ) @@ -193,6 +213,14 @@ func (g *generator) callAfterControlPlaneUpgradeHook(ctx context.Context, s *sco return true, nil } + if cacheEntry, ok := g.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.AfterControlPlaneUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling AfterControlPlaneUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.AfterControlPlaneUpgrade, cacheEntry.ToResponse(&runtimehooksv1.AfterControlPlaneUpgradeResponse{}, requeueAfter)) + return false, nil + } + } + // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. 
v1beta1Cluster := &clusterv1beta1.Cluster{} if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -214,7 +242,8 @@ func (g *generator) callAfterControlPlaneUpgradeHook(ctx context.Context, s *sco s.HookResponseTracker.Add(runtimehooksv1.AfterControlPlaneUpgrade, hookResponse) if hookResponse.RetryAfterSeconds != 0 { - log.Info(fmt.Sprintf("Control plane upgrade to version %s completed but next steps are blocked by %s hook", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterControlPlaneUpgrade)), + g.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.AfterControlPlaneUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Control plane upgrade to version %s completed but next steps are blocked by %s hook, retry after %ds", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterControlPlaneUpgrade), hookResponse.RetryAfterSeconds), "ControlPlaneUpgrades", hookRequest.ControlPlaneUpgrades, "WorkersUpgrades", hookRequest.WorkersUpgrades, ) @@ -254,6 +283,14 @@ func (g *generator) callBeforeWorkersUpgradeHook(ctx context.Context, s *scope.S return true, nil } + if cacheEntry, ok := g.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.BeforeWorkersUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling BeforeWorkersUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.BeforeWorkersUpgrade, cacheEntry.ToResponse(&runtimehooksv1.BeforeWorkersUpgradeResponse{}, requeueAfter)) + return false, nil + } + } + // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. 
v1beta1Cluster := &clusterv1beta1.Cluster{} if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -276,7 +313,8 @@ func (g *generator) callBeforeWorkersUpgradeHook(ctx context.Context, s *scope.S if hookResponse.RetryAfterSeconds != 0 { // Cannot pickup the new version right now. Need to try again later. - log.Info(fmt.Sprintf("Workers upgrade from version %s to version %s is blocked by %s hook", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeWorkersUpgrade)), + g.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.BeforeWorkersUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Workers upgrade from version %s to version %s is blocked by %s hook, retry after %ds", hookRequest.FromKubernetesVersion, hookRequest.ToKubernetesVersion, runtimecatalog.HookName(runtimehooksv1.BeforeWorkersUpgrade), hookResponse.RetryAfterSeconds), "ControlPlaneUpgrades", hookRequest.ControlPlaneUpgrades, "WorkersUpgrades", hookRequest.WorkersUpgrades, ) @@ -317,6 +355,14 @@ func (g *generator) callAfterWorkersUpgradeHook(ctx context.Context, s *scope.Sc return true, nil } + if cacheEntry, ok := g.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.AfterWorkersUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling AfterWorkersUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.AfterWorkersUpgrade, cacheEntry.ToResponse(&runtimehooksv1.AfterWorkersUpgradeResponse{}, requeueAfter)) + return false, nil + } + } + // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. 
v1beta1Cluster := &clusterv1beta1.Cluster{} if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -338,7 +384,8 @@ func (g *generator) callAfterWorkersUpgradeHook(ctx context.Context, s *scope.Sc s.HookResponseTracker.Add(runtimehooksv1.AfterWorkersUpgrade, hookResponse) if hookResponse.RetryAfterSeconds != 0 { - log.Info(fmt.Sprintf("Workers upgrade to version %s completed but next steps are blocked by %s hook", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterWorkersUpgrade)), + g.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.AfterWorkersUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Workers upgrade to version %s completed but next steps are blocked by %s hook, retry after %ds", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterWorkersUpgrade), hookResponse.RetryAfterSeconds), "ControlPlaneUpgrades", hookRequest.ControlPlaneUpgrades, "WorkersUpgrades", hookRequest.WorkersUpgrades, ) diff --git a/exp/topology/desiredstate/lifecycle_hooks_test.go b/exp/topology/desiredstate/lifecycle_hooks_test.go index c37db235b42c..753ecf81e264 100644 --- a/exp/topology/desiredstate/lifecycle_hooks_test.go +++ b/exp/topology/desiredstate/lifecycle_hooks_test.go @@ -20,6 +20,7 @@ import ( "maps" "reflect" "testing" + "time" "github.com/google/go-cmp/cmp" . 
"github.com/onsi/gomega" @@ -42,6 +43,7 @@ import ( "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/feature" fakeruntimeclient "sigs.k8s.io/cluster-api/internal/runtime/client/fake" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -70,6 +72,13 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.RuntimeSDK, true) + clusterForCacheEntry := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-cluster", + }, + } + catalog := runtimecatalog.New() _ = runtimehooksv1.AddToCatalog(catalog) beforeClusterUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeClusterUpgrade) @@ -81,7 +90,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { blockingBeforeClusterUpgradeResponse := &runtimehooksv1.BeforeClusterUpgradeResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -97,7 +107,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { blockingBeforeControlPlaneUpgradeResponse := &runtimehooksv1.BeforeControlPlaneUpgradeResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -113,7 +124,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { blockingAfterControlPlaneUpgradeResponse := &runtimehooksv1.AfterControlPlaneUpgradeResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - 
Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -129,7 +141,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { blockingBeforeWorkersUpgradeResponse := &runtimehooksv1.BeforeWorkersUpgradeResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -145,7 +158,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { blockingAfterWorkersUpgradeResponse := &runtimehooksv1.AfterWorkersUpgradeResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -182,6 +196,7 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantIsStartingUpgrade bool wantIsWaitingForWorkersUpgrade bool wantPendingHookAnnotation string + wantHookCacheEntry *cache.HookEntry }{ // Upgrade cluster with CP, MD, MP (upgrade by one minor) @@ -222,6 +237,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { beforeClusterUpgradeResponse: blockingBeforeClusterUpgradeResponse, wantVersion: "v1.2.2", wantIsPendingUpgrade: true, + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeClusterUpgrade, + time.Now().Add(time.Duration(blockingBeforeClusterUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeClusterUpgradeResponse.Message)), }, { name: "when an upgrade starts: call the BeforeControlPlaneUpgrade hook when BeforeClusterUpgrade hook unblocks, blocking answer", @@ -255,6 +272,8 @@ func 
TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.2.2", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingBeforeControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade starts: pick up a new version when BeforeControlPlaneUpgrade hook unblocks (does not call the BeforeClusterUpgrade hook when already done)", @@ -326,6 +345,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { afterControlPlaneUpgradeResponse: blockingAfterControlPlaneUpgradeResponse, wantVersion: "v1.2.3", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterControlPlaneUpgrade,AfterWorkersUpgrade,BeforeWorkersUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingAfterControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterControlPlaneUpgradeResponse.Message)), }, { name: "after control plane is upgraded: call the BeforeWorkersUpgrade hook when AfterControlPlaneUpgrade hook unblocks, blocking answer", @@ -358,6 +379,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { beforeWorkersUpgradeResponse: blockingBeforeWorkersUpgradeResponse, wantVersion: "v1.2.3", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterWorkersUpgrade,BeforeWorkersUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeWorkersUpgrade, + time.Now().Add(time.Duration(blockingBeforeWorkersUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeWorkersUpgradeResponse.Message)), }, { name: "after control plane is upgraded: BeforeWorkersUpgrade hook 
unblocks (does not call the AfterControlPlaneUpgrade hook when already done)", @@ -450,6 +473,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { afterWorkersUpgradeResponse: blockingAfterWorkersUpgradeResponse, wantVersion: "v1.2.3", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterWorkersUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterWorkersUpgrade, + time.Now().Add(time.Duration(blockingAfterWorkersUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterWorkersUpgradeResponse.Message)), }, { name: "after workers are upgraded: AfterWorkersUpgrade hook unblocks", @@ -517,6 +542,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { beforeClusterUpgradeResponse: blockingBeforeClusterUpgradeResponse, wantVersion: "v1.2.2", wantIsPendingUpgrade: true, + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeClusterUpgrade, + time.Now().Add(time.Duration(blockingBeforeClusterUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeClusterUpgradeResponse.Message)), }, { name: "when an upgrade to the first minor starts: call the BeforeControlPlaneUpgrade hook when BeforeClusterUpgrade hook unblocks, blocking answer", @@ -550,6 +577,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.2.2", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingBeforeControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the first minor starts: pick up a new version when BeforeControlPlaneUpgrade hook unblocks (does not call the BeforeClusterUpgrade hook when already done)", @@ -622,6 +651,8 @@ func 
TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.3.3", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade,AfterControlPlaneUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingAfterControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the second minor starts: call the BeforeControlPlaneUpgrade after AfterControlPlaneUpgrade hook unblocks, blocking answer", @@ -655,6 +686,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.3.3", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingBeforeControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the second minor starts: pick up a new version when BeforeControlPlaneUpgrade hook unblocks (does not call the BeforeClusterUpgrade hook when already done)", @@ -726,6 +759,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { afterControlPlaneUpgradeResponse: blockingAfterControlPlaneUpgradeResponse, wantVersion: "v1.4.4", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterControlPlaneUpgrade,AfterWorkersUpgrade,BeforeWorkersUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingAfterControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterControlPlaneUpgradeResponse.Message)), }, { name: "when starting workers upgrade to the second minor: call the BeforeWorkersUpgrade hook when 
AfterControlPlaneUpgradeRequest hook unblocks, blocking answer", @@ -758,6 +793,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { beforeWorkersUpgradeResponse: blockingBeforeWorkersUpgradeResponse, wantVersion: "v1.4.4", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterWorkersUpgrade,BeforeWorkersUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeWorkersUpgrade, + time.Now().Add(time.Duration(blockingBeforeWorkersUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeWorkersUpgradeResponse.Message)), }, { name: "when starting workers upgrade to the second minor: BeforeWorkersUpgrade hook unblocks (does not call the AfterControlPlaneUpgrade hook when already done)", @@ -830,6 +867,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { afterWorkersUpgradeResponse: blockingAfterWorkersUpgradeResponse, wantVersion: "v1.4.4", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterWorkersUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterWorkersUpgrade, + time.Now().Add(time.Duration(blockingAfterWorkersUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterWorkersUpgradeResponse.Message)), }, { name: "after workers are upgraded to the second minor: AfterWorkersUpgrade hook unblocks", @@ -896,6 +935,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { beforeClusterUpgradeResponse: blockingBeforeClusterUpgradeResponse, wantVersion: "v1.2.2", wantIsPendingUpgrade: true, + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeClusterUpgrade, + time.Now().Add(time.Duration(blockingBeforeClusterUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeClusterUpgradeResponse.Message)), }, { name: "when an upgrade to the first minor starts: call the BeforeControlPlaneUpgrade hook when BeforeClusterUpgrade hook 
unblocks, blocking answer", @@ -928,6 +969,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.2.2", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingBeforeControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the first minor starts: pick up a new version when BeforeControlPlaneUpgrade hook unblocks (does not call the BeforeClusterUpgrade hook when already done)", @@ -998,6 +1041,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.3.3", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade,AfterControlPlaneUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingAfterControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the second minor starts: call the BeforeControlPlaneUpgrade after AfterControlPlaneUpgrade hook unblocks, blocking answer", @@ -1030,6 +1075,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { wantVersion: "v1.3.3", wantIsPendingUpgrade: true, wantPendingHookAnnotation: "AfterClusterUpgrade", // changed from previous step + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.BeforeControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingBeforeControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingBeforeControlPlaneUpgradeResponse.Message)), }, { name: "when an upgrade to the second minor starts: pick up a new version when BeforeControlPlaneUpgrade hook unblocks (does not call 
the BeforeClusterUpgrade hook when already done)", @@ -1098,6 +1145,8 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { afterControlPlaneUpgradeResponse: blockingAfterControlPlaneUpgradeResponse, wantVersion: "v1.4.4", wantPendingHookAnnotation: "AfterClusterUpgrade,AfterControlPlaneUpgrade", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(clusterForCacheEntry, runtimehooksv1.AfterControlPlaneUpgrade, + time.Now().Add(time.Duration(blockingAfterControlPlaneUpgradeResponse.RetryAfterSeconds)*time.Second), blockingAfterControlPlaneUpgradeResponse.Message)), }, { name: "after control plane is upgraded to the second minor: call the AfterControlPlaneUpgrade hook, non blocking answer", @@ -1246,6 +1295,7 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { r := &generator{ Client: fakeClient, RuntimeClient: runtimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), } version, err := r.computeControlPlaneVersion(ctx, s) g.Expect(err).ToNot(HaveOccurred()) @@ -1268,6 +1318,25 @@ func TestComputeControlPlaneVersion_LifecycleHooksSequences(t *testing.T) { } else { g.Expect(s.Current.Cluster.Annotations).ToNot(HaveKey(runtimev1.PendingHooksAnnotation), "Unexpected PendingHookAnnotation") } + + if tt.wantHookCacheEntry != nil { + // Verify the cache entry. + cacheEntry, ok := r.hookCache.Has(tt.wantHookCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry.ObjectKey).To(Equal(tt.wantHookCacheEntry.ObjectKey)) + g.Expect(cacheEntry.HookName).To(Equal(tt.wantHookCacheEntry.HookName)) + g.Expect(cacheEntry.ReconcileAfter).To(BeTemporally("~", tt.wantHookCacheEntry.ReconcileAfter, 5*time.Second)) + g.Expect(cacheEntry.ResponseMessage).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + + // Call computeControlPlaneVersion again and verify the cache hit. 
+ hooksCalled = sets.Set[string]{} + version, err := r.computeControlPlaneVersion(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(version).To(Equal(tt.wantVersion)) + g.Expect(hooksCalled).To(BeEmpty()) + } else { + g.Expect(r.hookCache.Len()).To(Equal(0)) + } }) } } diff --git a/exp/topology/desiredstate/upgrade_plan.go b/exp/topology/desiredstate/upgrade_plan.go index 9410e219ab81..9df1ebee313a 100644 --- a/exp/topology/desiredstate/upgrade_plan.go +++ b/exp/topology/desiredstate/upgrade_plan.go @@ -19,11 +19,13 @@ package desiredstate import ( "context" "fmt" + "slices" "strings" "github.com/blang/semver/v4" "github.com/pkg/errors" "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" @@ -31,6 +33,7 @@ import ( "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/contract" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/version" ) @@ -410,8 +413,23 @@ func GetUpgradePlanFromClusterClassVersions(clusterClassVersions []string) func( } } +// GenerateUpgradePlanCacheEntry is an entry for the GenerateUpgradePlan hook cache. +type GenerateUpgradePlanCacheEntry struct { + ClusterKey client.ObjectKey + FromControlPlaneKubernetesVersion string + FromWorkersKubernetesVersion string + ToKubernetesVersion string + ControlPlaneUpgradePlan []string + WorkersUpgradePlan []string +} + +// Key returns the cache key of a GenerateUpgradePlanCacheEntry. +func (r GenerateUpgradePlanCacheEntry) Key() string { + return fmt.Sprintf("%s: (%s,%s) => %s", r.ClusterKey, r.FromControlPlaneKubernetesVersion, r.FromWorkersKubernetesVersion, r.ToKubernetesVersion) +} + // GetUpgradePlanFromExtension returns an upgrade plan by calling the GenerateUpgradePlan runtime extension. 
-func GetUpgradePlanFromExtension(runtimeClient runtimeclient.Client, cluster *clusterv1.Cluster, extensionName string) func(ctx context.Context, desiredVersion, currentControlPlaneVersion, currentMinWorkersVersion string) ([]string, []string, error) { +func GetUpgradePlanFromExtension(runtimeClient runtimeclient.Client, getUpgradePlanCache cache.Cache[GenerateUpgradePlanCacheEntry], cluster *clusterv1.Cluster, extensionName string) func(ctx context.Context, desiredVersion, currentControlPlaneVersion, currentMinWorkersVersion string) ([]string, []string, error) { return func(ctx context.Context, desiredVersion, currentControlPlaneVersion, currentMinWorkersVersion string) ([]string, []string, error) { if !feature.Gates.Enabled(feature.RuntimeSDK) { return nil, nil, errors.Errorf("can not use GenerateUpgradePlan extension %q if RuntimeSDK feature flag is disabled", extensionName) @@ -425,6 +443,17 @@ func GetUpgradePlanFromExtension(runtimeClient runtimeclient.Client, cluster *cl ToKubernetesVersion: desiredVersion, } + entry := GenerateUpgradePlanCacheEntry{ + ClusterKey: client.ObjectKeyFromObject(cluster), + FromControlPlaneKubernetesVersion: req.FromControlPlaneKubernetesVersion, + FromWorkersKubernetesVersion: req.FromWorkersKubernetesVersion, + ToKubernetesVersion: req.ToKubernetesVersion, + } + + if cacheEntry, ok := getUpgradePlanCache.Has(entry.Key()); ok { + return slices.Clone(cacheEntry.ControlPlaneUpgradePlan), slices.Clone(cacheEntry.WorkersUpgradePlan), nil + } + // Call the extension. 
resp := &runtimehooksv1.GenerateUpgradePlanResponse{} if err := runtimeClient.CallExtension(ctx, runtimehooksv1.GenerateUpgradePlan, cluster, extensionName, req, resp); err != nil { @@ -442,6 +471,10 @@ func GetUpgradePlanFromExtension(runtimeClient runtimeclient.Client, cluster *cl workersUpgradePlan[i] = step.Version } + entry.ControlPlaneUpgradePlan = slices.Clone(controlPlaneUpgradePlan) + entry.WorkersUpgradePlan = slices.Clone(workersUpgradePlan) + getUpgradePlanCache.Add(entry) + return controlPlaneUpgradePlan, workersUpgradePlan, nil } } diff --git a/exp/topology/desiredstate/upgrade_plan_test.go b/exp/topology/desiredstate/upgrade_plan_test.go index 647b3e0fe15d..cb21acafafac 100644 --- a/exp/topology/desiredstate/upgrade_plan_test.go +++ b/exp/topology/desiredstate/upgrade_plan_test.go @@ -19,11 +19,13 @@ package desiredstate import ( "context" "testing" + "time" "github.com/blang/semver/v4" . "github.com/onsi/gomega" "github.com/pkg/errors" utilfeature "k8s.io/component-base/featuregate/testing" + "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" @@ -31,6 +33,7 @@ import ( "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/feature" fakeruntimeclient "sigs.k8s.io/cluster-api/internal/runtime/client/fake" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -1173,12 +1176,36 @@ func TestGetUpgradePlanFromExtension(t *testing.T) { Build() // Call GetUpgradePlanFromExtension - f := GetUpgradePlanFromExtension(fakeRuntimeClient, cluster, "test-extension") + generateUpgradePlanCache := cache.New[GenerateUpgradePlanCacheEntry](10 * time.Minute) + f := GetUpgradePlanFromExtension(fakeRuntimeClient, generateUpgradePlanCache, cluster, "test-extension") controlPlaneUpgradePlan, workersUpgradePlan, err := f(ctx, "v1.33.0", "v1.31.0", "v1.31.0") g.Expect(err).ToNot(HaveOccurred()) 
g.Expect(controlPlaneUpgradePlan).To(Equal([]string{"v1.32.0", "v1.33.0"})) g.Expect(workersUpgradePlan).To(Equal([]string{"v1.33.0"})) + g.Expect(fakeRuntimeClient.CallCount(runtimehooksv1.GenerateUpgradePlan)).To(Equal(1)) + + // Verify the cache entry. + wantCacheEntry := GenerateUpgradePlanCacheEntry{ + ClusterKey: client.ObjectKeyFromObject(cluster), + FromControlPlaneKubernetesVersion: "v1.31.0", + FromWorkersKubernetesVersion: "v1.31.0", + ToKubernetesVersion: "v1.33.0", + ControlPlaneUpgradePlan: controlPlaneUpgradePlan, + WorkersUpgradePlan: workersUpgradePlan, + } + + cacheEntry, ok := generateUpgradePlanCache.Has(wantCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry).To(Equal(wantCacheEntry)) + + // Call GetUpgradePlanFromExtension again and verify the cache hit. + f = GetUpgradePlanFromExtension(fakeRuntimeClient, generateUpgradePlanCache, cluster, "test-extension") + controlPlaneUpgradePlan, workersUpgradePlan, err = f(ctx, "v1.33.0", "v1.31.0", "v1.31.0") + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(fakeRuntimeClient.CallCount(runtimehooksv1.GenerateUpgradePlan)).To(Equal(1)) + g.Expect(controlPlaneUpgradePlan).To(Equal(wantCacheEntry.ControlPlaneUpgradePlan)) + g.Expect(workersUpgradePlan).To(Equal(wantCacheEntry.WorkersUpgradePlan)) } func TestGetUpgradePlanFromExtension_Errors(t *testing.T) { @@ -1236,7 +1263,7 @@ func TestGetUpgradePlanFromExtension_Errors(t *testing.T) { fakeRuntimeClient := fakeRuntimeClientBuilder.Build() // Call GetUpgradePlanFromExtension - f := GetUpgradePlanFromExtension(fakeRuntimeClient, cluster, "test-extension") + f := GetUpgradePlanFromExtension(fakeRuntimeClient, cache.New[GenerateUpgradePlanCacheEntry](10*time.Minute), cluster, "test-extension") _, _, err := f(ctx, tt.desiredVersion, tt.currentControlPlaneVersion, tt.currentMinWorkersVersion) g.Expect(err).To(HaveOccurred()) diff --git a/feature/feature.go b/feature/feature.go index a1855d01a065..5f5956cdcf83 100644 --- a/feature/feature.go +++ 
b/feature/feature.go @@ -69,6 +69,12 @@ const ( // alpha: v1.10 PriorityQueue featuregate.Feature = "PriorityQueue" + // ReconcilerRateLimiting is a feature gate that controls whether reconcilers are rate-limited. + // Note: Currently the feature gate rate-limits reconcilers to 1 request / 1 second. + // + // alpha: v1.12 + ReconcilerRateLimiting featuregate.Feature = "ReconcilerRateLimiting" + // InPlaceUpdates is a feature gate for the in-place machine updates functionality. // alpha: v1.12 InPlaceUpdates featuregate.Feature = "InPlaceUpdates" @@ -91,6 +97,7 @@ var defaultClusterAPIFeatureGates = map[featuregate.Feature]featuregate.FeatureS MachineSetPreflightChecks: {Default: true, PreRelease: featuregate.Beta}, MachineWaitForVolumeDetachConsiderVolumeAttachments: {Default: true, PreRelease: featuregate.Beta}, PriorityQueue: {Default: false, PreRelease: featuregate.Alpha}, + ReconcilerRateLimiting: {Default: false, PreRelease: featuregate.Alpha}, ClusterTopology: {Default: false, PreRelease: featuregate.Alpha}, KubeadmBootstrapFormatIgnition: {Default: false, PreRelease: featuregate.Alpha}, RuntimeSDK: {Default: false, PreRelease: featuregate.Alpha}, diff --git a/go.mod b/go.mod index af7036bad26e..5a306196a9ca 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/onsi/gomega v1.38.2 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.22.0 + github.com/prometheus/client_model v0.6.1 github.com/spf13/cobra v1.10.1 github.com/spf13/pflag v1.0.10 github.com/spf13/viper v1.21.0 @@ -115,7 +116,6 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect diff --git a/internal/controllers/cluster/cluster_controller.go
b/internal/controllers/cluster/cluster_controller.go index 6158ed58f79b..ff54eca90d9e 100644 --- a/internal/controllers/cluster/cluster_controller.go +++ b/internal/controllers/cluster/cluster_controller.go @@ -34,7 +34,6 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -47,6 +46,7 @@ import ( "sigs.k8s.io/cluster-api/controllers/external" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/hooks" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" @@ -94,7 +94,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "cluster") - b := ctrl.NewControllerManagedBy(mgr). + b := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.Cluster{}). WatchesRawSource(r.ClusterCache.GetClusterSource("cluster", func(_ context.Context, o client.Object) []ctrl.Request { return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}} @@ -102,18 +102,15 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(r.controlPlaneMachineToCluster), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). 
Watches( &clusterv1.MachineDeployment{}, handler.EnqueueRequestsFromMapFunc(r.machineDeploymentToCluster), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ) if feature.Gates.Enabled(feature.MachinePool) { b = b.Watches( &clusterv1.MachinePool{}, handler.EnqueueRequestsFromMapFunc(r.machinePoolToCluster), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ) } @@ -180,7 +177,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retRes ct defer func() { // Always reconcile the Status. if err := r.updateStatus(ctx, s); err != nil { - retRes = ctrl.Result{} reterr = kerrors.NewAggregate([]error{reterr, err}) return } @@ -194,10 +190,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retRes ct if err := patchCluster(ctx, patchHelper, cluster, patchOpts...); err != nil { reterr = kerrors.NewAggregate([]error{reterr, err}) } - - if reterr != nil { - retRes = ctrl.Result{} - } }() alwaysReconcile := []clusterReconcileFunc{ diff --git a/internal/controllers/clusterclass/clusterclass_controller.go b/internal/controllers/clusterclass/clusterclass_controller.go index 2d6d1780cc30..828e8a40a17d 100644 --- a/internal/controllers/clusterclass/clusterclass_controller.go +++ b/internal/controllers/clusterclass/clusterclass_controller.go @@ -35,7 +35,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -52,6 +51,7 @@ import ( "sigs.k8s.io/cluster-api/internal/contract" internalruntimeclient "sigs.k8s.io/cluster-api/internal/runtime/client" "sigs.k8s.io/cluster-api/internal/topology/variables" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" 
"sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/patch" @@ -87,13 +87,12 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "clusterclass") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.ClusterClass{}). WithOptions(options). Watches( &runtimev1.ExtensionConfig{}, handler.EnqueueRequestsFromMapFunc(r.extensionConfigToClusterClass), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Complete(r) @@ -156,10 +155,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct reterr = kerrors.NewAggregate([]error{reterr, err}) return } - - if reterr != nil { - retres = ctrl.Result{} - } }() reconcileNormal := []clusterClassReconcileFunc{ diff --git a/internal/controllers/clusterresourceset/clusterresourceset_controller.go b/internal/controllers/clusterresourceset/clusterresourceset_controller.go index 5d603b67bb7f..7056bc580581 100644 --- a/internal/controllers/clusterresourceset/clusterresourceset_controller.go +++ b/internal/controllers/clusterresourceset/clusterresourceset_controller.go @@ -32,7 +32,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" @@ -45,6 +44,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/controllers/clustercache" resourcepredicates "sigs.k8s.io/cluster-api/internal/controllers/clusterresourceset/predicates" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" 
"sigs.k8s.io/cluster-api/util/conditions" v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1" @@ -77,12 +77,11 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "clusterresourceset") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&addonsv1.ClusterResourceSet{}). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.clusterToClusterResourceSet), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). WatchesRawSource(r.ClusterCache.GetClusterSource("clusterresourceset", r.clusterToClusterResourceSet)). WatchesMetadata( @@ -90,12 +89,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt handler.EnqueueRequestsFromMapFunc( resourceToClusterResourceSetFunc[client.Object](r.Client), ), - builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - resourcepredicates.TypedResourceCreateOrUpdate[client.Object](predicateLog), - ), - ), + resourcepredicates.TypedResourceCreateOrUpdate[client.Object](predicateLog), ). 
WatchesRawSource(source.Kind( partialSecretCache, diff --git a/internal/controllers/clusterresourcesetbinding/clusterresourcesetbinding_controller.go b/internal/controllers/clusterresourcesetbinding/clusterresourcesetbinding_controller.go index c00bd68e50a5..62e6da77fafb 100644 --- a/internal/controllers/clusterresourcesetbinding/clusterresourcesetbinding_controller.go +++ b/internal/controllers/clusterresourcesetbinding/clusterresourcesetbinding_controller.go @@ -24,7 +24,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -34,6 +33,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/hooks" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" @@ -55,12 +55,11 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "clusterresourcesetbinding") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&addonsv1.ClusterResourceSetBinding{}). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.clusterToClusterResourceSetBinding), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). WithOptions(options). WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). 
diff --git a/internal/controllers/extensionconfig/extensionconfig_controller.go b/internal/controllers/extensionconfig/extensionconfig_controller.go index a8296f0e396f..a82f183a0372 100644 --- a/internal/controllers/extensionconfig/extensionconfig_controller.go +++ b/internal/controllers/extensionconfig/extensionconfig_controller.go @@ -40,6 +40,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" runtimev1 "sigs.k8s.io/cluster-api/api/runtime/v1beta2" runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util/conditions" v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1" "sigs.k8s.io/cluster-api/util/patch" @@ -81,7 +82,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "extensionconfig") - b := ctrl.NewControllerManagedBy(mgr). + b := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&runtimev1.ExtensionConfig{}). WithOptions(options). 
WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)) diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 304937e4de00..77e5c614e63d 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -39,7 +39,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -56,6 +55,7 @@ import ( "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/controllers/machine/drain" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/cache" @@ -107,7 +107,7 @@ type Reconciler struct { AdditionalSyncMachineLabels []*regexp.Regexp AdditionalSyncMachineAnnotations []*regexp.Regexp - controller controller.Controller + controller capicontrollerutil.Controller recorder record.EventRecorder externalTracker external.ObjectTracker @@ -115,10 +115,7 @@ type Reconciler struct { // during a single reconciliation. nodeDeletionRetryTimeout time.Duration - // reconcileDeleteCache is used to store when reconcileDelete should not be executed before a - // specific time for a specific Request. This is used to implement rate-limiting to avoid - // e.g. spamming workload clusters with eviction requests during Node drain. 
- reconcileDeleteCache cache.Cache[cache.ReconcileEntry] + hookCache cache.Cache[cache.HookEntry] predicateLog *logr.Logger } @@ -152,37 +149,33 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt if r.nodeDeletionRetryTimeout.Nanoseconds() == 0 { r.nodeDeletionRetryTimeout = 10 * time.Second } - c, err := ctrl.NewControllerManagedBy(mgr). + + c, err := capicontrollerutil.NewControllerManagedBy(mgr, *r.predicateLog). For(&clusterv1.Machine{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachines), - builder.WithPredicates( - // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? - predicates.All(mgr.GetScheme(), *r.predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog), - predicates.ClusterControlPlaneInitialized(mgr.GetScheme(), *r.predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), - ), - )). + // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? + predicates.ClusterControlPlaneInitialized(mgr.GetScheme(), *r.predicateLog), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), + ). WatchesRawSource(r.ClusterCache.GetClusterSource("machine", clusterToMachines, clustercache.WatchForProbeFailure(r.RemoteConditionsGracePeriod))). Watches( &clusterv1.MachineSet{}, handler.EnqueueRequestsFromMapFunc(msToMachines), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog)), ). Watches( &clusterv1.MachineDeployment{}, handler.EnqueueRequestsFromMapFunc(mdToMachines), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog)), ). 
Build(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } + r.hookCache = cache.New[cache.HookEntry](cache.HookCacheDefaultTTL) r.controller = c r.recorder = mgr.GetEventRecorderFor("machine-controller") r.externalTracker = external.ObjectTracker{ @@ -191,7 +184,6 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Scheme: mgr.GetScheme(), PredicateLogger: r.predicateLog, } - r.reconcileDeleteCache = cache.New[cache.ReconcileEntry](cache.DefaultTTL) return nil } @@ -239,16 +231,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct return ctrl.Result{}, err } - if !m.DeletionTimestamp.IsZero() { - // Check reconcileDeleteCache to ensure we won't run reconcileDelete too frequently. - // Note: The reconcileDelete func will add entries to the cache. - if cacheEntry, ok := r.reconcileDeleteCache.Has(cache.NewReconcileEntryKey(m)); ok { - if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { - return ctrl.Result{RequeueAfter: requeueAfter}, nil - } - } - } - s := &scope{ cluster: cluster, machine: m, @@ -447,12 +429,6 @@ type scope struct { // nodeGetError is the error that occurred when trying to get the Node. nodeGetError error - // reconcileDeleteExecuted will be set to true if the logic in reconcileDelete is executed. - // We might requeue early in reconcileDelete because of rate-limiting. - // If the Machine has the deletionTimestamp set and this field is false we don't update the - // Deleting condition. - reconcileDeleteExecuted bool - // deletingReason is the reason that should be used when setting the Deleting condition. deletingReason string @@ -492,14 +468,6 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result cluster := s.cluster m := s.machine - s.reconcileDeleteExecuted = true - - // Add entry to the reconcileDeleteCache so we won't run reconcileDelete more than once per second. 
- // Under certain circumstances the ReconcileAfter time will be set to a later time, e.g. when we're waiting - // for Pods to terminate or volumes to detach. - // This is done to ensure we're not spamming the workload cluster API server. - r.reconcileDeleteCache.Add(cache.NewReconcileEntry(s.machine, time.Now().Add(1*time.Second))) - // Set "fallback" reason and message. This is used if we don't set a more specific reason and message below. s.deletingReason = clusterv1.MachineDeletingReason s.deletingMessage = "Deletion started" @@ -948,8 +916,8 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro return ctrl.Result{}, nil } - // Add entry to the reconcileDeleteCache so we won't retry drain again before drainRetryInterval. - r.reconcileDeleteCache.Add(cache.NewReconcileEntry(machine, time.Now().Add(drainRetryInterval))) + // Slow down the reconcile frequency, because Node drain is a slow process. + r.controller.DeferNextReconcileForObject(machine, time.Now().Add(drainRetryInterval)) conditionMessage := evictionResult.ConditionMessage(machine.Status.Deletion.NodeDrainStartTime) v1beta1conditions.MarkFalse(machine, clusterv1.DrainingSucceededV1Beta1Condition, clusterv1.DrainingV1Beta1Reason, clusterv1.ConditionSeverityInfo, "%s", conditionMessage) @@ -1034,8 +1002,8 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct return ctrl.Result{}, nil } - // Add entry to the reconcileDeleteCache so we won't retry shouldWaitForNodeVolumes again before waitForVolumeDetachRetryInterval. - r.reconcileDeleteCache.Add(cache.NewReconcileEntry(machine, time.Now().Add(waitForVolumeDetachRetryInterval))) + // Slow down the reconcile frequency, because volume detach is a slow process. 
+ r.controller.DeferNextReconcileForObject(machine, time.Now().Add(waitForVolumeDetachRetryInterval)) s.deletingReason = clusterv1.MachineDeletingWaitingForVolumeDetachReason s.deletingMessage = attachedVolumeInformation.conditionMessage(machine) diff --git a/internal/controllers/machine/machine_controller_inplace_update.go b/internal/controllers/machine/machine_controller_inplace_update.go index 3785ab56b6d0..48cbf64fd6c1 100644 --- a/internal/controllers/machine/machine_controller_inplace_update.go +++ b/internal/controllers/machine/machine_controller_inplace_update.go @@ -36,6 +36,7 @@ import ( runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/hooks" + "sigs.k8s.io/cluster-api/util/cache" ) // reconcileInPlaceUpdate handles the in-place update workflow for a Machine. @@ -150,6 +151,13 @@ func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl. return ctrl.Result{}, "", errors.Errorf("found multiple UpdateMachine hooks (%s): only one hook is supported", strings.Join(extensions, ",")) } + if cacheEntry, ok := r.hookCache.Has(cache.NewHookEntryKey(s.machine, runtimehooksv1.UpdateMachine)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling UpdateMachine hook, retry after %s", requeueAfter)) + return ctrl.Result{RequeueAfter: requeueAfter}, cacheEntry.ResponseMessage, nil + } + } + // Note: When building request message, dropping status; Runtime extension should treat UpdateMachine // requests as desired state; it is up to them to compare with current state and perform necessary actions. request := &runtimehooksv1.UpdateMachineRequest{ @@ -171,7 +179,9 @@ func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl. 
if response.GetRetryAfterSeconds() != 0 { log.Info(fmt.Sprintf("UpdateMachine hook requested retry after %d seconds", response.GetRetryAfterSeconds())) - return ctrl.Result{RequeueAfter: time.Duration(response.GetRetryAfterSeconds()) * time.Second}, response.GetMessage(), nil + requeueAfter := time.Duration(response.RetryAfterSeconds) * time.Second + r.hookCache.Add(cache.NewHookEntry(s.machine, runtimehooksv1.UpdateMachine, time.Now().Add(requeueAfter), response.GetMessage())) + return ctrl.Result{RequeueAfter: requeueAfter}, response.GetMessage(), nil } log.Info("UpdateMachine hook completed successfully") diff --git a/internal/controllers/machine/machine_controller_inplace_update_test.go b/internal/controllers/machine/machine_controller_inplace_update_test.go index 5f574eb41265..ed06eaf41e2d 100644 --- a/internal/controllers/machine/machine_controller_inplace_update_test.go +++ b/internal/controllers/machine/machine_controller_inplace_update_test.go @@ -37,6 +37,7 @@ import ( runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" "sigs.k8s.io/cluster-api/feature" fakeruntimeclient "sigs.k8s.io/cluster-api/internal/runtime/client/fake" + "sigs.k8s.io/cluster-api/util/cache" ) func TestReconcileInPlaceUpdate(t *testing.T) { @@ -82,7 +83,7 @@ func TestReconcileInPlaceUpdate(t *testing.T) { client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(machine, infra, bootstrap).Build() - return &Reconciler{Client: client}, &scope{ + return &Reconciler{Client: client, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{ machine: machine, infraMachine: infra, bootstrapConfig: bootstrap, @@ -190,6 +191,7 @@ func TestReconcileInPlaceUpdate(t *testing.T) { return &Reconciler{ Client: client, RuntimeClient: runtimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), }, &scope{ machine: machine, infraMachine: infra, @@ -254,6 +256,7 @@ func TestReconcileInPlaceUpdate(t *testing.T) { return &Reconciler{ Client: client, 
RuntimeClient: runtimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), }, &scope{ machine: machine, infraMachine: infra, @@ -335,12 +338,13 @@ func TestCallUpdateMachineHook(t *testing.T) { } tests := []struct { - name string - setup func(*testing.T) (*Reconciler, *scope) - wantResult ctrl.Result - wantMessage string - wantErr bool - wantErrSubstrings []string + name string + setup func(*testing.T) (*Reconciler, *scope) + wantResult ctrl.Result + wantMessage string + wantErr bool + wantErrSubstrings []string + wantHookCacheEntry *cache.HookEntry }{ { name: "fails if no extensions registered", @@ -350,7 +354,7 @@ func TestCallUpdateMachineHook(t *testing.T) { WithCatalog(catalog). WithGetAllExtensionResponses(map[runtimecatalog.GroupVersionHook][]string{}). Build() - return &Reconciler{RuntimeClient: runtimeClient}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} + return &Reconciler{RuntimeClient: runtimeClient, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} }, wantErr: true, wantErrSubstrings: []string{"no extensions registered for UpdateMachine hook"}, @@ -365,7 +369,7 @@ func TestCallUpdateMachineHook(t *testing.T) { updateGVH: {"ext-a", "ext-b"}, }). 
Build() - return &Reconciler{RuntimeClient: runtimeClient}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} + return &Reconciler{RuntimeClient: runtimeClient, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} }, wantErr: true, wantErrSubstrings: []string{"found multiple UpdateMachine hooks (ext-a,ext-b): only one hook is supported"}, @@ -387,7 +391,7 @@ func TestCallUpdateMachineHook(t *testing.T) { }, }). Build() - return &Reconciler{RuntimeClient: runtimeClient}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} + return &Reconciler{RuntimeClient: runtimeClient, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} }, wantErr: true, wantErrSubstrings: []string{"runtime hook", "UpdateMachine", "failed"}, @@ -413,10 +417,12 @@ func TestCallUpdateMachineHook(t *testing.T) { }, }). 
Build() - return &Reconciler{RuntimeClient: runtimeClient}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} + return &Reconciler{RuntimeClient: runtimeClient, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} }, wantResult: ctrl.Result{RequeueAfter: 45 * time.Second}, wantMessage: "processing", + wantHookCacheEntry: ptr.To(cache.NewHookEntry(newTestMachine(), runtimehooksv1.UpdateMachine, + time.Now().Add(45*time.Second), "processing")), }, { name: "returns message when hook succeeds", @@ -438,7 +444,7 @@ func TestCallUpdateMachineHook(t *testing.T) { }, }). Build() - return &Reconciler{RuntimeClient: runtimeClient}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} + return &Reconciler{RuntimeClient: runtimeClient, hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL)}, &scope{machine: newTestMachine(), infraMachine: newTestUnstructured("GenericInfrastructureMachine", "infrastructure.cluster.x-k8s.io/v1beta2", "infra")} }, wantResult: ctrl.Result{}, wantMessage: "done", @@ -462,6 +468,25 @@ func TestCallUpdateMachineHook(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) g.Expect(result).To(Equal(tt.wantResult)) g.Expect(message).To(Equal(tt.wantMessage)) + + if tt.wantHookCacheEntry != nil { + // Verify the cache entry. 
+ cacheEntry, ok := r.hookCache.Has(tt.wantHookCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry.ObjectKey).To(Equal(tt.wantHookCacheEntry.ObjectKey)) + g.Expect(cacheEntry.HookName).To(Equal(tt.wantHookCacheEntry.HookName)) + g.Expect(cacheEntry.ReconcileAfter).To(BeTemporally("~", tt.wantHookCacheEntry.ReconcileAfter, 5*time.Second)) + g.Expect(cacheEntry.ResponseMessage).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + + // Call callUpdateMachineHook again and verify the cache hit. + secondResult, message, err := r.callUpdateMachineHook(context.Background(), scope) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(message).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + // RequeueAfter should now be smaller than the previous RequeueAfter. + g.Expect(secondResult.RequeueAfter).To(BeNumerically("<", result.RequeueAfter)) + } else { + g.Expect(r.hookCache.Len()).To(Equal(0)) + } }) } } diff --git a/internal/controllers/machine/machine_controller_noderef_test.go b/internal/controllers/machine/machine_controller_noderef_test.go index 7319f6c6bb32..716c0f72c747 100644 --- a/internal/controllers/machine/machine_controller_noderef_test.go +++ b/internal/controllers/machine/machine_controller_noderef_test.go @@ -42,6 +42,7 @@ import ( "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/topology/ownerrefs" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/kubeconfig" "sigs.k8s.io/cluster-api/util/test/builder" @@ -458,7 +459,8 @@ func TestGetNode(t *testing.T) { Client: env, } - w, err := ctrl.NewControllerManagedBy(env.Manager).For(&corev1.Node{}).Build(r) + predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "node") + w, err := capicontrollerutil.NewControllerManagedBy(env.Manager, predicateLog).For(&corev1.Node{}).Build(r) g.Expect(err).ToNot(HaveOccurred()) // Retry because the ClusterCache might not 
have immediately created the clusterAccessor. diff --git a/internal/controllers/machine/machine_controller_phases.go b/internal/controllers/machine/machine_controller_phases.go index d4fe4608a6d1..6cedb6d2a80d 100644 --- a/internal/controllers/machine/machine_controller_phases.go +++ b/internal/controllers/machine/machine_controller_phases.go @@ -333,6 +333,8 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr log.Info(fmt.Sprintf("Waiting for infrastructure provider to set %s on %s", contract.InfrastructureMachine().ProviderID().Path().String(), s.infraMachine.GetKind()), s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) + // Slow down reconcile frequency, provisioning infrastructure takes some time. + r.controller.DeferNextReconcileForObject(s.machine, time.Now().Add(5*time.Second)) return ctrl.Result{}, nil // Note: Requeue is not needed, changes to InfraMachine trigger another reconcile. } diff --git a/internal/controllers/machine/machine_controller_phases_test.go b/internal/controllers/machine/machine_controller_phases_test.go index 10d30fe5b2ce..6dd81ce028aa 100644 --- a/internal/controllers/machine/machine_controller_phases_test.go +++ b/internal/controllers/machine/machine_controller_phases_test.go @@ -30,12 +30,15 @@ import ( "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache/informertest" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/controllers/external" externalfake "sigs.k8s.io/cluster-api/controllers/external/fake" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -404,14 +407,15 @@ func TestReconcileInfrastructure(t *testing.T) { } testCases := []struct { - name string - contract string 
- machine *clusterv1.Machine - infraMachine map[string]interface{} - infraMachineGetError error - expectResult ctrl.Result - expectError bool - expected func(g *WithT, m *clusterv1.Machine) + name string + contract string + machine *clusterv1.Machine + infraMachine map[string]interface{} + infraMachineGetError error + expectResult ctrl.Result + expectError bool + expected func(g *WithT, m *clusterv1.Machine) + expectDeferNextReconcile time.Duration }{ { name: "err reading infra machine (something different than not found), it should return error", @@ -695,9 +699,10 @@ func TestReconcileInfrastructure(t *testing.T) { "ready": true, }, }, - infraMachineGetError: nil, - expectResult: ctrl.Result{}, - expectError: false, + infraMachineGetError: nil, + expectResult: ctrl.Result{}, + expectError: false, + expectDeferNextReconcile: 5 * time.Second, }, { name: "should never revert back to infrastructure not ready", @@ -974,8 +979,11 @@ func TestReconcileInfrastructure(t *testing.T) { g.Expect(c.Create(ctx, infraMachine)).To(Succeed()) } + fc := capicontrollerutil.NewFakeController() + r := &Reconciler{ - Client: c, + Client: c, + controller: fc, externalTracker: external.ObjectTracker{ Controller: externalfake.Controller{}, Cache: &informertest.FakeInformers{}, @@ -995,6 +1003,15 @@ func TestReconcileInfrastructure(t *testing.T) { if tc.expected != nil { tc.expected(g, tc.machine) } + + if tc.expectDeferNextReconcile == 0 { + g.Expect(fc.Deferrals).To(BeEmpty()) + } else { + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(s.machine)}, + BeTemporally("~", time.Now().Add(tc.expectDeferNextReconcile), 1*time.Second)), + ) + } }) } } diff --git a/internal/controllers/machine/machine_controller_status.go b/internal/controllers/machine/machine_controller_status.go index 5b2970e6e44f..d311288934da 100644 --- a/internal/controllers/machine/machine_controller_status.go +++ 
b/internal/controllers/machine/machine_controller_status.go @@ -66,7 +66,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) ctrl.Result { // in reconcileDelete (e.g. status.Deletion nested fields), and also in the defer patch at the end of the main reconcile loop (status.ObservedGeneration) etc. // Note: also other controllers adds conditions to the machine object (machine's owner controller sets the UpToDate condition, // MHC controller sets HealthCheckSucceeded and OwnerRemediated conditions, KCP sets conditions about etcd and control plane pods). - setDeletingCondition(ctx, s.machine, s.reconcileDeleteExecuted, s.deletingReason, s.deletingMessage) + setDeletingCondition(ctx, s.machine, s.deletingReason, s.deletingMessage) setUpdatingCondition(ctx, s.machine, s.updatingReason, s.updatingMessage) setUpToDateCondition(ctx, s.machine, s.owningMachineSet, s.owningMachineDeployment) setReadyCondition(ctx, s.machine) @@ -613,7 +613,7 @@ func transformControlPlaneAndEtcdConditions(messages []string) []string { return out } -func setDeletingCondition(_ context.Context, machine *clusterv1.Machine, reconcileDeleteExecuted bool, deletingReason, deletingMessage string) { +func setDeletingCondition(_ context.Context, machine *clusterv1.Machine, deletingReason, deletingMessage string) { if machine.DeletionTimestamp.IsZero() { conditions.Set(machine, metav1.Condition{ Type: clusterv1.MachineDeletingCondition, @@ -623,12 +623,6 @@ func setDeletingCondition(_ context.Context, machine *clusterv1.Machine, reconci return } - if !reconcileDeleteExecuted { - // Don't update the Deleting condition if reconcileDelete was not executed (e.g. - // because of rate-limiting). 
- return - } - conditions.Set(machine, metav1.Condition{ Type: clusterv1.MachineDeletingCondition, Status: metav1.ConditionTrue, diff --git a/internal/controllers/machine/machine_controller_status_test.go b/internal/controllers/machine/machine_controller_status_test.go index fa8e7dc5a13c..d8d8ede25dcf 100644 --- a/internal/controllers/machine/machine_controller_status_test.go +++ b/internal/controllers/machine/machine_controller_status_test.go @@ -1171,12 +1171,11 @@ func TestSetNodeHealthyAndReadyConditions(t *testing.T) { func TestDeletingCondition(t *testing.T) { testCases := []struct { - name string - machine *clusterv1.Machine - reconcileDeleteExecuted bool - deletingReason string - deletingMessage string - expectCondition metav1.Condition + name string + machine *clusterv1.Machine + deletingReason string + deletingMessage string + expectCondition metav1.Condition }{ { name: "deletionTimestamp not set", @@ -1186,9 +1185,8 @@ func TestDeletingCondition(t *testing.T) { Namespace: metav1.NamespaceDefault, }, }, - reconcileDeleteExecuted: false, - deletingReason: "", - deletingMessage: "", + deletingReason: "", + deletingMessage: "", expectCondition: metav1.Condition{ Type: clusterv1.MachineDeletingCondition, Status: metav1.ConditionFalse, @@ -1204,9 +1202,8 @@ func TestDeletingCondition(t *testing.T) { DeletionTimestamp: &metav1.Time{Time: time.Now()}, }, }, - reconcileDeleteExecuted: true, - deletingReason: clusterv1.MachineDeletingWaitingForPreDrainHookReason, - deletingMessage: "Waiting for pre-drain hooks to succeed (hooks: test-hook)", + deletingReason: clusterv1.MachineDeletingWaitingForPreDrainHookReason, + deletingMessage: "Waiting for pre-drain hooks to succeed (hooks: test-hook)", expectCondition: metav1.Condition{ Type: clusterv1.MachineDeletingCondition, Status: metav1.ConditionTrue, @@ -1223,8 +1220,7 @@ func TestDeletingCondition(t *testing.T) { DeletionTimestamp: &metav1.Time{Time: time.Now()}, }, }, - reconcileDeleteExecuted: true, - 
deletingReason: clusterv1.MachineDeletingDrainingNodeReason, + deletingReason: clusterv1.MachineDeletingDrainingNodeReason, deletingMessage: `Drain not completed yet (started at 2024-10-09T16:13:59Z): * Pods with deletionTimestamp that still exist: pod-2-deletionTimestamp-set-1, pod-2-deletionTimestamp-set-2, pod-2-deletionTimestamp-set-3, pod-3-to-trigger-eviction-successfully-1, pod-3-to-trigger-eviction-successfully-2, ... (2 more) * Pods with eviction failed: @@ -1249,43 +1245,13 @@ func TestDeletingCondition(t *testing.T) { * ... (1 more error applying to 1 Pod)`, }, }, - { - name: "deletionTimestamp set, reconcileDelete not executed", - machine: &clusterv1.Machine{ - ObjectMeta: metav1.ObjectMeta{ - Name: "machine-test", - Namespace: metav1.NamespaceDefault, - DeletionTimestamp: &metav1.Time{Time: time.Now()}, - }, - Status: clusterv1.MachineStatus{ - Conditions: []metav1.Condition{ - { - Type: clusterv1.MachineDeletingCondition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeletingWaitingForPreDrainHookReason, - Message: "Waiting for pre-drain hooks to succeed (hooks: test-hook)", - }, - }, - }, - }, - reconcileDeleteExecuted: false, - deletingReason: "", - deletingMessage: "", - // Condition was not updated because reconcileDelete was not executed. 
- expectCondition: metav1.Condition{ - Type: clusterv1.MachineDeletingCondition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeletingWaitingForPreDrainHookReason, - Message: "Waiting for pre-drain hooks to succeed (hooks: test-hook)", - }, - }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - setDeletingCondition(ctx, tc.machine, tc.reconcileDeleteExecuted, tc.deletingReason, tc.deletingMessage) + setDeletingCondition(ctx, tc.machine, tc.deletingReason, tc.deletingMessage) deletingCondition := conditions.Get(tc.machine, clusterv1.MachineDeletingCondition) g.Expect(deletingCondition).ToNot(BeNil()) diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index 51aac8bbfdc8..b12bf4581d3f 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -50,8 +50,8 @@ import ( externalfake "sigs.k8s.io/cluster-api/controllers/external/fake" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/contract" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" - "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conditions" v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1" "sigs.k8s.io/cluster-api/util/patch" @@ -968,10 +968,9 @@ func TestReconcileRequest(t *testing.T) { ).WithStatusSubresource(&clusterv1.Machine{}).WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).Build() r := &Reconciler{ - Client: clientFake, - ClusterCache: clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), - recorder: record.NewFakeRecorder(10), - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), + Client: clientFake, + ClusterCache: clustercache.NewFakeClusterCache(clientFake, 
client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}), + recorder: record.NewFakeRecorder(10), externalTracker: external.ObjectTracker{ Controller: externalfake.Controller{}, Cache: &informertest.FakeInformers{}, @@ -1340,6 +1339,10 @@ func TestMachineV1Beta1Conditions(t *testing.T) { type fakeController struct{} +func (f fakeController) DeferNextReconcile(_ reconcile.Request, _ time.Time) {} + +func (f fakeController) DeferNextReconcileForObject(_ metav1.Object, _ time.Time) {} + func (f fakeController) Reconcile(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { panic("implement me") } @@ -1392,9 +1395,8 @@ func TestRemoveMachineFinalizerAfterDeleteReconcile(t *testing.T) { key := client.ObjectKey{Namespace: m.Namespace, Name: m.Name} c := fake.NewClientBuilder().WithObjects(testCluster, m, builder.GenericInfrastructureMachineCRD.DeepCopy()).WithStatusSubresource(&clusterv1.Machine{}).Build() mr := &Reconciler{ - Client: c, - ClusterCache: clustercache.NewFakeClusterCache(c, client.ObjectKeyFromObject(testCluster)), - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(c, client.ObjectKeyFromObject(testCluster)), } _, err := mr.Reconcile(ctx, reconcile.Request{NamespacedName: key}) g.Expect(err).ToNot(HaveOccurred()) @@ -1589,16 +1591,17 @@ func TestDrainNode(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) tests := []struct { - name string - nodeName string - node *corev1.Node - pods []*corev1.Pod - nodeDrainStartTime metav1.Time - wantV1Beta1Condition *clusterv1.Condition - wantResult ctrl.Result - wantErr string - wantDeletingReason string - wantDeletingMessage string + name string + nodeName string + node *corev1.Node + pods []*corev1.Pod + nodeDrainStartTime metav1.Time + wantV1Beta1Condition *clusterv1.Condition + wantResult ctrl.Result + wantErr string + wantDeletingReason string + wantDeletingMessage string + 
expectDeferNextReconcile time.Duration }{ { name: "Node does not exist, no-op", @@ -1714,6 +1717,7 @@ func TestDrainNode(t *testing.T) { wantDeletingReason: clusterv1.MachineDeletingDrainingNodeReason, wantDeletingMessage: `Drain not completed yet (started at 2024-10-09T16:13:59Z): * Pod test-namespace/pod-2-delete-running-deployment-pod: deletionTimestamp set, but still not removed from the Node`, + expectDeferNextReconcile: drainRetryInterval, }, { name: "Node does exist but is unreachable, no Pods have to be drained because they all have old deletionTimestamps", @@ -1789,10 +1793,12 @@ func TestDrainNode(t *testing.T) { WithObjects(remoteObjs...). Build() + fc := capicontrollerutil.NewFakeController() + r := &Reconciler{ - Client: c, - ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), + Client: c, + controller: fc, + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), } testMachine.Status.NodeRef = clusterv1.MachineNodeReference{ @@ -1837,6 +1843,15 @@ func TestDrainNode(t *testing.T) { g.Expect(remoteClient.Get(ctx, client.ObjectKeyFromObject(tt.node), gotNode)).To(Succeed()) g.Expect(gotNode.Spec.Unschedulable).To(BeTrue()) } + + if tt.expectDeferNextReconcile == 0 { + g.Expect(fc.Deferrals).To(BeEmpty()) + } else { + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(s.machine)}, + BeTemporally("~", time.Now().Add(tt.expectDeferNextReconcile), 1*time.Second)), + ) + } }) } } @@ -1923,11 +1938,12 @@ func TestDrainNode_withCaching(t *testing.T) { WithObjects(remoteObjs...). 
Build() - reconcileDeleteCache := cache.New[cache.ReconcileEntry](cache.DefaultTTL) + fc := capicontrollerutil.NewFakeController() + r := &Reconciler{ - Client: c, - ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), - reconcileDeleteCache: reconcileDeleteCache, + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)), + controller: fc, } s := &scope{ @@ -1962,10 +1978,10 @@ func TestDrainNode_withCaching(t *testing.T) { g.Expect(gotNode.Spec.Unschedulable).To(BeTrue()) // Drain cache should have an entry for the Machine - gotEntry, ok := reconcileDeleteCache.Has(cache.NewReconcileEntryKey(testMachine)) - g.Expect(ok).To(BeTrue()) - g.Expect(gotEntry.Request.Namespace).To(Equal(testMachine.Namespace)) - g.Expect(gotEntry.Request.Name).To(Equal(testMachine.Name)) + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(s.machine)}, + BeTemporally("~", time.Now().Add(drainRetryInterval), 1*time.Second)), + ) } func TestIsNodeVolumeDetachingAllowed(t *testing.T) { @@ -2202,13 +2218,14 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { } tests := []struct { - name string - node *corev1.Node - remoteObjects []client.Object - featureGateDisabled bool - expected ctrl.Result - expectedDeletingReason string - expectedDeletingMessage string + name string + node *corev1.Node + remoteObjects []client.Object + featureGateDisabled bool + expected ctrl.Result + expectedDeletingReason string + expectedDeletingMessage string + expectDeferNextReconcile time.Duration }{ { name: "Node has volumes attached according to node status", @@ -2233,6 +2250,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * PersistentVolumeClaims: default/test-pvc`, 
+ expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has volumes attached according to node status but the pv does not reference a PersistentVolumeClaim", @@ -2257,6 +2275,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * PersistentVolumes without a .spec.claimRef to a PersistentVolumeClaim: test-pv`, + expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has volumes attached according to node status but without a pv", @@ -2279,6 +2298,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * Node with .status.volumesAttached entries not matching a PersistentVolume: kubernetes.io/csi/dummy^foo`, + expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has volumes attached according to node status but its from a daemonset pod which gets ignored", @@ -2356,6 +2376,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * PersistentVolumeClaims: default/test-pvc`, + expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has volumes attached according to volumeattachments (but ignored because feature gate is disabled)", @@ -2401,6 +2422,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * VolumeAttachment with .spec.source.persistentVolumeName not matching a 
PersistentVolume: test-pv`, + expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has volumes attached according to volumeattachments but its from a daemonset pod which gets ignored", @@ -2502,6 +2524,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { expectedDeletingReason: clusterv1.MachineDeletingWaitingForVolumeDetachReason, expectedDeletingMessage: `Waiting for Node volumes to be detached (started at 2024-10-09T16:13:59Z) * PersistentVolumeClaims: default/test-pvc`, + expectDeferNextReconcile: waitForVolumeDetachRetryInterval, }, { name: "Node has no volumes attached", @@ -2580,10 +2603,12 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { remoteFakeClient := fake.NewClientBuilder().WithIndex(&corev1.Pod{}, "spec.nodeName", nodeNameIndex). WithObjects(remoteObjects...).Build() + fc := capicontrollerutil.NewFakeController() + r := &Reconciler{ - Client: fakeClient, - ClusterCache: clustercache.NewFakeClusterCache(remoteFakeClient, client.ObjectKeyFromObject(testCluster)), - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), + Client: fakeClient, + ClusterCache: clustercache.NewFakeClusterCache(remoteFakeClient, client.ObjectKeyFromObject(testCluster)), + controller: fc, } testMachine.Status.NodeRef = clusterv1.MachineNodeReference{ @@ -2600,6 +2625,15 @@ func TestShouldWaitForNodeVolumes(t *testing.T) { g.Expect(got).To(BeEquivalentTo(tt.expected)) g.Expect(s.deletingReason).To(BeEquivalentTo(tt.expectedDeletingReason)) g.Expect(s.deletingMessage).To(BeEquivalentTo(tt.expectedDeletingMessage)) + + if tt.expectDeferNextReconcile == 0 { + g.Expect(fc.Deferrals).To(BeEmpty()) + } else { + g.Expect(fc.Deferrals).To(HaveKeyWithValue( + reconcile.Request{NamespacedName: client.ObjectKeyFromObject(s.machine)}, + BeTemporally("~", time.Now().Add(tt.expectDeferNextReconcile), 1*time.Second)), + ) + } }) } } @@ -3474,7 +3508,6 @@ func TestNodeDeletion(t *testing.T) { ClusterCache: 
clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)), recorder: record.NewFakeRecorder(10), nodeDeletionRetryTimeout: 10 * time.Millisecond, - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), } cluster := testCluster.DeepCopy() @@ -3607,7 +3640,6 @@ func TestNodeDeletionWithoutNodeRefFallback(t *testing.T) { ClusterCache: clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)), recorder: record.NewFakeRecorder(10), nodeDeletionRetryTimeout: 10 * time.Millisecond, - reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL), } s := &scope{ diff --git a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go index bc3917afe377..f25a13ef83c0 100644 --- a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go +++ b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" @@ -37,6 +38,18 @@ import ( "sigs.k8s.io/cluster-api/internal/util/patch" ) +// CanUpdateMachineSetCacheEntry is an entry for the CanUpdateMachineSet hook cache. +type CanUpdateMachineSetCacheEntry struct { + OldMS client.ObjectKey + NewMS client.ObjectKey + CanUpdateMachineSet bool +} + +// Key returns the cache key of a CanUpdateMachineSetCacheEntry. 
+func (r CanUpdateMachineSetCacheEntry) Key() string { + return fmt.Sprintf("%s => %s", r.OldMS, r.NewMS) +} + func (p *rolloutPlanner) canUpdateMachineSetInPlace(ctx context.Context, oldMS, newMS *clusterv1.MachineSet) (bool, error) { if p.overrideCanUpdateMachineSetInPlace != nil { return p.overrideCanUpdateMachineSetInPlace(ctx, oldMS, newMS) @@ -72,10 +85,27 @@ func (p *rolloutPlanner) canUpdateMachineSetInPlace(ctx context.Context, oldMS, return false, errors.Errorf("found multiple CanUpdateMachineSet hooks (%s): only one hook is supported", strings.Join(extensionHandlers, ",")) } + entry := CanUpdateMachineSetCacheEntry{ + OldMS: client.ObjectKeyFromObject(oldMS), + NewMS: client.ObjectKeyFromObject(newMS), + } + + if cacheEntry, ok := p.canUpdateMachineSetCache.Has(entry.Key()); ok { + if cacheEntry.CanUpdateMachineSet { + log.V(5).Info(fmt.Sprintf("MachineSet %s can be updated in-place by extensions (cached)", oldMS.Name)) + } else { + log.V(5).Info(fmt.Sprintf("MachineSet %s cannot be updated in-place by extensions (cached)", oldMS.Name)) + } + return cacheEntry.CanUpdateMachineSet, nil + } + canUpdateMachineSet, reasons, err := p.canExtensionsUpdateMachineSet(ctx, oldMS, newMS, templateObjects, extensionHandlers) if err != nil { return false, err } + entry.CanUpdateMachineSet = canUpdateMachineSet + p.canUpdateMachineSetCache.Add(entry) + if !canUpdateMachineSet { log.Info(fmt.Sprintf("MachineSet %s cannot be updated in-place by extensions", oldMS.Name), "reason", strings.Join(reasons, ",")) return false, nil diff --git a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset_test.go b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset_test.go index 845b6ae424c9..c6ece5c9e4ff 100644 --- a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset_test.go +++ b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset_test.go @@ -38,6 +38,7 @@ import ( fakeruntimeclient 
"sigs.k8s.io/cluster-api/internal/runtime/client/fake" "sigs.k8s.io/cluster-api/internal/util/compare" "sigs.k8s.io/cluster-api/internal/util/patch" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -81,6 +82,7 @@ func Test_canUpdateMachineSetInPlace(t *testing.T) { wantCanUpdateMachineSet bool wantError bool wantErrorMessage string + wantCacheEntry *CanUpdateMachineSetCacheEntry }{ { name: "Return error if oldMS infrastructureRef is not set", @@ -159,6 +161,11 @@ func Test_canUpdateMachineSetInPlace(t *testing.T) { }, wantCanExtensionsUpdateMachineSetCalled: true, wantCanUpdateMachineSet: false, + wantCacheEntry: &CanUpdateMachineSetCacheEntry{ + OldMS: client.ObjectKeyFromObject(oldMS), + NewMS: client.ObjectKeyFromObject(newMS), + CanUpdateMachineSet: false, + }, }, { name: "Return true if canExtensionsUpdateMachineSet returns true", @@ -179,6 +186,11 @@ func Test_canUpdateMachineSetInPlace(t *testing.T) { }, wantCanExtensionsUpdateMachineSetCalled: true, wantCanUpdateMachineSet: true, + wantCacheEntry: &CanUpdateMachineSetCacheEntry{ + OldMS: client.ObjectKeyFromObject(oldMS), + NewMS: client.ObjectKeyFromObject(newMS), + CanUpdateMachineSet: true, + }, }, } for _, tt := range tests { @@ -216,10 +228,13 @@ func Test_canUpdateMachineSetInPlace(t *testing.T) { fakeClient := fake.NewClientBuilder().WithObjects(objs...).Build() + canUpdateMachineSetCache := cache.New[CanUpdateMachineSetCacheEntry](cache.HookCacheDefaultTTL) + var canExtensionsUpdateMachineSetCalled bool p := &rolloutPlanner{ - RuntimeClient: runtimeClient, - Client: fakeClient, + RuntimeClient: runtimeClient, + Client: fakeClient, + canUpdateMachineSetCache: canUpdateMachineSetCache, overrideCanExtensionsUpdateMachineSet: func(ctx context.Context, oldMS, newMS *clusterv1.MachineSet, templateObjects *templateObjects, extensionHandlers []string) (bool, []string, error) { canExtensionsUpdateMachineSetCalled = true return tt.canExtensionsUpdateMachineSetFunc(ctx, 
oldMS, newMS, templateObjects, extensionHandlers) @@ -236,6 +251,26 @@ func Test_canUpdateMachineSetInPlace(t *testing.T) { g.Expect(canUpdateMachineSet).To(Equal(tt.wantCanUpdateMachineSet)) g.Expect(canExtensionsUpdateMachineSetCalled).To(Equal(tt.wantCanExtensionsUpdateMachineSetCalled), "canExtensionsUpdateMachineSetCalled: actual: %t expected: %t", canExtensionsUpdateMachineSetCalled, tt.wantCanExtensionsUpdateMachineSetCalled) + + cacheEntry, ok := canUpdateMachineSetCache.Has(CanUpdateMachineSetCacheEntry{ + OldMS: client.ObjectKeyFromObject(oldMS), + NewMS: client.ObjectKeyFromObject(newMS), + }.Key()) + if tt.wantCacheEntry == nil { + g.Expect(ok).To(BeFalse()) + g.Expect(cacheEntry).To(Equal(CanUpdateMachineSetCacheEntry{})) + } else { + // Verify the cache entry. + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry).To(Equal(*tt.wantCacheEntry)) + + // Call canUpdateMachineSetInPlace again and verify the cache hit. + canExtensionsUpdateMachineSetCalled = false + canUpdateMachineSet, err := p.canUpdateMachineSetInPlace(ctx, oldMS, newMS) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(canUpdateMachineSet).To(Equal(tt.wantCanUpdateMachineSet)) + g.Expect(canExtensionsUpdateMachineSetCalled).To(BeFalse()) + } }) } } diff --git a/internal/controllers/machinedeployment/machinedeployment_controller.go b/internal/controllers/machinedeployment/machinedeployment_controller.go index 2e7d27b13ed6..2b6a0f349fba 100644 --- a/internal/controllers/machinedeployment/machinedeployment_controller.go +++ b/internal/controllers/machinedeployment/machinedeployment_controller.go @@ -32,7 +32,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -43,8 +42,10 @@ import ( runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client" 
"sigs.k8s.io/cluster-api/feature" clientutil "sigs.k8s.io/cluster-api/internal/util/client" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/collections" v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1" "sigs.k8s.io/cluster-api/util/finalizers" @@ -82,6 +83,8 @@ type Reconciler struct { recorder record.EventRecorder ssaCache ssa.Cache + + canUpdateMachineSetCache cache.Cache[CanUpdateMachineSetCacheEntry] } func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { @@ -98,30 +101,27 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return err } - err = ctrl.NewControllerManagedBy(mgr). + err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.MachineDeployment{}). - Owns(&clusterv1.MachineSet{}, builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog))). + Owns(&clusterv1.MachineSet{}). // Watches enqueues MachineDeployment for corresponding MachineSet resources, if no managed controller reference (owner) exists. Watches( &clusterv1.MachineSet{}, handler.EnqueueRequestsFromMapFunc(r.MachineSetToDeployments), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). 
Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachineDeployments), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), - )), + predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? ).Complete(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") } + r.canUpdateMachineSetCache = cache.New[CanUpdateMachineSetCacheEntry](cache.HookCacheDefaultTTL) r.recorder = mgr.GetEventRecorderFor("machinedeployment-controller") r.ssaCache = ssa.NewCache("machinedeployment") return nil @@ -195,10 +195,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct if err := patchMachineDeployment(ctx, patchHelper, deployment, patchOpts...); err != nil { reterr = kerrors.NewAggregate([]error{reterr, err}) } - - if reterr != nil { - retres = ctrl.Result{} - } }() // Handle deletion reconciliation loop. diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete.go b/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete.go index 8429a578b946..8a7d1468e0d6 100644 --- a/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete.go @@ -32,7 +32,7 @@ import ( // rolloutOnDelete reconcile machine sets controlled by a MachineDeployment that is using the OnDelete strategy. 
func (r *Reconciler) rolloutOnDelete(ctx context.Context, md *clusterv1.MachineDeployment, msList []*clusterv1.MachineSet, machines collections.Machines, templateExists bool) error { - planner := newRolloutPlanner(r.Client, r.RuntimeClient) + planner := newRolloutPlanner(r.Client, r.RuntimeClient, r.canUpdateMachineSetCache) if err := planner.init(ctx, md, msList, machines.UnsortedList(), true, templateExists); err != nil { return err } diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete_test.go b/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete_test.go index 749e30948265..be07f6f90994 100644 --- a/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete_test.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_ondelete_test.go @@ -338,7 +338,7 @@ func TestReconcileOldMachineSetsOnDelete(t *testing.T) { for _, tt := range testCases { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - planner := newRolloutPlanner(nil, nil) + planner := newRolloutPlanner(nil, nil, nil) planner.scaleIntents = tt.scaleIntent planner.md = tt.machineDeployment planner.newMS = tt.newMachineSet @@ -593,7 +593,7 @@ func runOnDeleteTestCase(ctx context.Context, t *testing.T, tt onDeleteSequenceT fLogger.Logf("[MD controller] Iteration %d, Reconcile md", i) // Running a small subset of MD reconcile (the rollout logic and a bit of setReplicas) - p := newRolloutPlanner(nil, nil) + p := newRolloutPlanner(nil, nil, nil) p.overrideComputeDesiredMS = func(ctx context.Context, deployment *clusterv1.MachineDeployment, currentNewMS *clusterv1.MachineSet) (*clusterv1.MachineSet, error) { log := ctrl.LoggerFrom(ctx) desiredNewMS := currentNewMS diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go b/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go index 5ef1a9d2ee59..eb75273f1b51 100644 --- 
a/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go @@ -33,11 +33,13 @@ import ( "sigs.k8s.io/cluster-api/internal/controllers/machinedeployment/mdutil" "sigs.k8s.io/cluster-api/internal/util/hash" "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/cache" ) type rolloutPlanner struct { - Client client.Client - RuntimeClient runtimeclient.Client + Client client.Client + RuntimeClient runtimeclient.Client + canUpdateMachineSetCache cache.Cache[CanUpdateMachineSetCacheEntry] md *clusterv1.MachineDeployment revision string @@ -61,12 +63,13 @@ type rolloutPlanner struct { overrideCanExtensionsUpdateMachineSet func(ctx context.Context, oldMS, newMS *clusterv1.MachineSet, templateObjects *templateObjects, extensionHandlers []string) (bool, []string, error) } -func newRolloutPlanner(c client.Client, runtimeClient runtimeclient.Client) *rolloutPlanner { +func newRolloutPlanner(c client.Client, runtimeClient runtimeclient.Client, canUpdateMachineSetCache cache.Cache[CanUpdateMachineSetCacheEntry]) *rolloutPlanner { return &rolloutPlanner{ - Client: c, - RuntimeClient: runtimeClient, - scaleIntents: make(map[string]int32), - notes: make(map[string][]string), + Client: c, + RuntimeClient: runtimeClient, + canUpdateMachineSetCache: canUpdateMachineSetCache, + scaleIntents: make(map[string]int32), + notes: make(map[string][]string), } } diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go b/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go index c78781ab5196..7b692963d67c 100644 --- a/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go @@ -38,7 +38,7 @@ import ( // rolloutRollingUpdate reconcile machine sets controlled by a MachineDeployment that is using the 
RolloutUpdate strategy. func (r *Reconciler) rolloutRollingUpdate(ctx context.Context, md *clusterv1.MachineDeployment, msList []*clusterv1.MachineSet, machines collections.Machines, templateExists bool) error { - planner := newRolloutPlanner(r.Client, r.RuntimeClient) + planner := newRolloutPlanner(r.Client, r.RuntimeClient, r.canUpdateMachineSetCache) if err := planner.init(ctx, md, msList, machines.UnsortedList(), true, templateExists); err != nil { return err } diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go b/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go index a40d90b344be..39c9dc7890cd 100644 --- a/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go @@ -124,7 +124,7 @@ func TestReconcileReplicasPendingAcknowledgeMove(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - planner := newRolloutPlanner(nil, nil) + planner := newRolloutPlanner(nil, nil, nil) planner.md = tc.md planner.newMS = tc.newMS if tc.originalNewMS != nil { @@ -355,7 +355,7 @@ func TestReconcileNewMachineSet(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - planner := newRolloutPlanner(nil, nil) + planner := newRolloutPlanner(nil, nil, nil) planner.md = tc.machineDeployment planner.newMS = tc.newMachineSet planner.oldMSs = tc.oldMachineSets @@ -1256,7 +1256,7 @@ func TestReconcileInPlaceUpdateIntent(t *testing.T) { canUpdateCalls := make(map[string]bool) - planner := newRolloutPlanner(nil, nil) + planner := newRolloutPlanner(nil, nil, nil) planner.md = tc.md planner.newMS = tc.newMS planner.oldMSs = tc.oldMS @@ -1864,7 +1864,7 @@ func runRollingUpdateTestCase(ctx context.Context, t *testing.T, tt rollingUpdat fLogger.Logf("[MD controller] Iteration %d, Reconcile md", i) // Running a small subset of MD reconcile (the rollout logic and a 
bit of setReplicas) - p := newRolloutPlanner(nil, nil) + p := newRolloutPlanner(nil, nil, nil) p.overrideCanUpdateMachineSetInPlace = func(_ context.Context, _, _ *clusterv1.MachineSet) (bool, error) { return false, nil } if tt.overrideCanUpdateMachineSetInPlaceFunc != nil { p.overrideCanUpdateMachineSetInPlace = tt.overrideCanUpdateMachineSetInPlaceFunc diff --git a/internal/controllers/machinedeployment/machinedeployment_sync.go b/internal/controllers/machinedeployment/machinedeployment_sync.go index 4c67959e5e69..c5a08c40fc0c 100644 --- a/internal/controllers/machinedeployment/machinedeployment_sync.go +++ b/internal/controllers/machinedeployment/machinedeployment_sync.go @@ -45,7 +45,7 @@ func (r *Reconciler) sync(ctx context.Context, md *clusterv1.MachineDeployment, // - identifying newMS and OldMS when necessary // - computing desired state for newMS and OldMS, including managing rollout related annotations and // in-place propagation of labels, annotations and other fields. - planner := newRolloutPlanner(r.Client, r.RuntimeClient) + planner := newRolloutPlanner(r.Client, r.RuntimeClient, r.canUpdateMachineSetCache) if err := planner.init(ctx, md, msList, machines.UnsortedList(), false, templateExists); err != nil { return err } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index a190ba799c7a..738dcef64799 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -37,7 +37,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -49,6 +48,7 @@ import ( "sigs.k8s.io/cluster-api/controllers/clustercache" 
"sigs.k8s.io/cluster-api/controllers/external" "sigs.k8s.io/cluster-api/internal/controllers/machine" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" @@ -100,26 +100,20 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } r.predicateLog = ptr.To(ctrl.LoggerFrom(ctx).WithValues("controller", "machinehealthcheck")) - c, err := ctrl.NewControllerManagedBy(mgr). + c, err := capicontrollerutil.NewControllerManagedBy(mgr, *r.predicateLog). For(&clusterv1.MachineHealthCheck{}). Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(r.machineToMachineHealthCheck), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog)), ). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(r.clusterToMachineHealthCheck), - builder.WithPredicates( - // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? - predicates.All(mgr.GetScheme(), *r.predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), *r.predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), - ), - ), + // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? + predicates.ClusterPausedTransitions(mgr.GetScheme(), *r.predicateLog), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), ). WatchesRawSource(r.ClusterCache.GetClusterSource("machinehealthcheck", r.clusterToMachineHealthCheck)). 
Build(r) diff --git a/internal/controllers/machinepool/machinepool_controller.go b/internal/controllers/machinepool/machinepool_controller.go index c499fd8a3ef7..5eb6f6dc5386 100644 --- a/internal/controllers/machinepool/machinepool_controller.go +++ b/internal/controllers/machinepool/machinepool_controller.go @@ -32,7 +32,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -44,6 +43,7 @@ import ( "sigs.k8s.io/cluster-api/api/core/v1beta2/index" "sigs.k8s.io/cluster-api/controllers/clustercache" "sigs.k8s.io/cluster-api/controllers/external" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -99,7 +99,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return err } - c, err := ctrl.NewControllerManagedBy(mgr). + c, err := capicontrollerutil.NewControllerManagedBy(mgr, *r.predicateLog). For(&clusterv1.MachinePool{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue)). @@ -107,13 +107,8 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachinePools), // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? 
- builder.WithPredicates( - predicates.All(mgr.GetScheme(), *r.predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), *r.predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), *r.predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), - ), - ), + predicates.ClusterPausedTransitions(mgr.GetScheme(), *r.predicateLog), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue), ). WatchesRawSource(r.ClusterCache.GetClusterSource("machinepool", clusterToMachinePools)). Build(r) diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index aea83cff1d73..92b94a7efa45 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -38,7 +38,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -54,6 +53,7 @@ import ( "sigs.k8s.io/cluster-api/internal/hooks" topologynames "sigs.k8s.io/cluster-api/internal/topology/names" clientutil "sigs.k8s.io/cluster-api/internal/util/client" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/inplace" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/util" @@ -126,33 +126,26 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return err } - err = ctrl.NewControllerManagedBy(mgr). + err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.MachineSet{}). - Owns(&clusterv1.Machine{}, builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog))). + Owns(&clusterv1.Machine{}). 
// Watches enqueues MachineSet for corresponding Machine resources, if no managed controller reference (owner) exists. Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(r.MachineToMachineSets), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &clusterv1.MachineDeployment{}, handler.EnqueueRequestsFromMapFunc(mdToMachineSets), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachineSets), - builder.WithPredicates( - // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), - predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), - ), - ), + // TODO: should this wait for Cluster.Status.InfrastructureReady similar to Infra Machine resources? + predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), + predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue), ). WatchesRawSource(r.ClusterCache.GetClusterSource("machineset", clusterToMachineSets)). 
Complete(r) @@ -230,11 +223,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ct reterr = kerrors.NewAggregate([]error{reterr, err}) } - if reterr != nil { - retres = ctrl.Result{} - return - } - // Adjust requeue when scaling up if s.machineSet.DeletionTimestamp.IsZero() && reterr == nil { retres = util.LowestNonZeroResult(retres, shouldRequeueForReplicaCountersRefresh(s)) diff --git a/internal/controllers/topology/cluster/cluster_controller.go b/internal/controllers/topology/cluster/cluster_controller.go index ebb6b9591e63..620df6cdbdf4 100644 --- a/internal/controllers/topology/cluster/cluster_controller.go +++ b/internal/controllers/topology/cluster/cluster_controller.go @@ -55,10 +55,12 @@ import ( "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/hooks" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/internal/webhooks" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/patch" @@ -90,6 +92,8 @@ type Reconciler struct { externalTracker external.ObjectTracker recorder record.EventRecorder + hookCache cache.Cache[cache.HookEntry] + // desiredStateGenerator is used to generate the desired state. desiredStateGenerator desiredstate.Generator @@ -106,13 +110,11 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "topology/cluster") - c, err := ctrl.NewControllerManagedBy(mgr). + c, err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.Cluster{}, builder.WithPredicates( // Only reconcile Cluster with topology and with changes relevant for this controller. 
- predicates.All(mgr.GetScheme(), predicateLog, - predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), - clusterChangeIsRelevant(mgr.GetScheme(), predicateLog), - ), + predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), + clusterChangeIsRelevant(mgr.GetScheme(), predicateLog), )). Named("topology/cluster"). WatchesRawSource(r.ClusterCache.GetClusterSource("topology/cluster", func(_ context.Context, o client.Object) []ctrl.Request { @@ -121,26 +123,19 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Watches( &clusterv1.ClusterClass{}, handler.EnqueueRequestsFromMapFunc(r.clusterClassToCluster), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &clusterv1.MachineDeployment{}, handler.EnqueueRequestsFromMapFunc(r.machineDeploymentToCluster), // Only trigger Cluster reconciliation if the MachineDeployment is topology owned, the resource is changed, and the change is relevant. - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), - machineDeploymentChangeIsRelevant(mgr.GetScheme(), predicateLog), - )), + predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), + machineDeploymentChangeIsRelevant(mgr.GetScheme(), predicateLog), ). Watches( &clusterv1.MachinePool{}, handler.EnqueueRequestsFromMapFunc(r.machinePoolToCluster), // Only trigger Cluster reconciliation if the MachinePool is topology owned, the resource is changed. - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), - )), + predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), ). WithOptions(options). 
WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). @@ -156,7 +151,16 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Scheme: mgr.GetScheme(), PredicateLogger: &predicateLog, } - r.desiredStateGenerator, err = desiredstate.NewGenerator(r.Client, r.ClusterCache, r.RuntimeClient) + r.hookCache = cache.New[cache.HookEntry](cache.HookCacheDefaultTTL) + r.desiredStateGenerator, err = desiredstate.NewGenerator( + r.Client, + r.ClusterCache, + r.RuntimeClient, + r.hookCache, + // Note: We are using 10m so that we are able to relatively quickly pick up changes to the + // upgrade plan from the extension if necessary. + cache.New[desiredstate.GenerateUpgradePlanCacheEntry](10*time.Minute), + ) if err != nil { return errors.Wrap(err, "failed creating desired state generator") } @@ -408,10 +412,9 @@ func (r *Reconciler) setupDynamicWatches(ctx context.Context, s *scope.Scope) er if err := r.externalTracker.Watch(ctrl.LoggerFrom(ctx), s.Current.InfrastructureCluster, handler.EnqueueRequestForOwner(scheme, r.Client.RESTMapper(), &clusterv1.Cluster{}), // Only trigger Cluster reconciliation if the InfrastructureCluster is topology owned. 
- predicates.All(scheme, *r.externalTracker.PredicateLogger, - predicates.ResourceIsChanged(scheme, *r.externalTracker.PredicateLogger), - predicates.ResourceIsTopologyOwned(scheme, *r.externalTracker.PredicateLogger), - )); err != nil { + predicates.ResourceIsChanged(scheme, *r.externalTracker.PredicateLogger), + predicates.ResourceIsTopologyOwned(scheme, *r.externalTracker.PredicateLogger), + ); err != nil { return errors.Wrap(err, "error watching Infrastructure CR") } } @@ -419,10 +422,9 @@ func (r *Reconciler) setupDynamicWatches(ctx context.Context, s *scope.Scope) er if err := r.externalTracker.Watch(ctrl.LoggerFrom(ctx), s.Current.ControlPlane.Object, handler.EnqueueRequestForOwner(scheme, r.Client.RESTMapper(), &clusterv1.Cluster{}), // Only trigger Cluster reconciliation if the ControlPlane is topology owned. - predicates.All(scheme, *r.externalTracker.PredicateLogger, - predicates.ResourceIsChanged(scheme, *r.externalTracker.PredicateLogger), - predicates.ResourceIsTopologyOwned(scheme, *r.externalTracker.PredicateLogger), - )); err != nil { + predicates.ResourceIsChanged(scheme, *r.externalTracker.PredicateLogger), + predicates.ResourceIsTopologyOwned(scheme, *r.externalTracker.PredicateLogger), + ); err != nil { return errors.Wrap(err, "error watching ControlPlane CR") } } @@ -444,6 +446,14 @@ func (r *Reconciler) callBeforeClusterCreateHook(ctx context.Context, s *scope.S return ctrl.Result{}, nil } + if cacheEntry, ok := r.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.BeforeClusterCreate)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling BeforeClusterCreate hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterCreate, cacheEntry.ToResponse(&runtimehooksv1.BeforeClusterCreateResponse{}, requeueAfter)) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + } + // DeepCopy cluster because ConvertFrom has side 
effects like adding the conversion annotation. v1beta1Cluster := &clusterv1beta1.Cluster{} if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -460,7 +470,8 @@ func (r *Reconciler) callBeforeClusterCreateHook(ctx context.Context, s *scope.S s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterCreate, hookResponse) if hookResponse.RetryAfterSeconds != 0 { - log.Info(fmt.Sprintf("Creation of Cluster topology is blocked by %s hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterCreate))) + r.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.BeforeClusterCreate, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Creation of Cluster topology is blocked by %s hook, retry after %ds", runtimecatalog.HookName(runtimehooksv1.BeforeClusterCreate), hookResponse.RetryAfterSeconds)) return ctrl.Result{RequeueAfter: time.Duration(hookResponse.RetryAfterSeconds) * time.Second}, nil } @@ -555,6 +566,14 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope.Scope) (ctrl. return ctrl.Result{}, nil } + if cacheEntry, ok := r.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.BeforeClusterDelete)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling BeforeClusterDelete hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterDelete, cacheEntry.ToResponse(&runtimehooksv1.BeforeClusterDeleteResponse{}, requeueAfter)) + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + } + v1beta1Cluster := &clusterv1beta1.Cluster{} // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. if err := v1beta1Cluster.ConvertFrom(cluster.DeepCopy()); err != nil { @@ -572,7 +591,8 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope.Scope) (ctrl. 
s.HookResponseTracker.Add(runtimehooksv1.BeforeClusterDelete, hookResponse) if hookResponse.RetryAfterSeconds != 0 { - log.Info(fmt.Sprintf("Cluster deletion is blocked by %q hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterDelete))) + r.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.BeforeClusterDelete, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Cluster deletion is blocked by %q hook, retry after %ds", runtimecatalog.HookName(runtimehooksv1.BeforeClusterDelete), hookResponse.RetryAfterSeconds)) return ctrl.Result{RequeueAfter: time.Duration(hookResponse.RetryAfterSeconds) * time.Second}, nil } // The BeforeClusterDelete hook returned a non-blocking response. Now the cluster is ready to be deleted. diff --git a/internal/controllers/topology/cluster/cluster_controller_test.go b/internal/controllers/topology/cluster/cluster_controller_test.go index 93d1f9ad784d..51766812116f 100644 --- a/internal/controllers/topology/cluster/cluster_controller_test.go +++ b/internal/controllers/topology/cluster/cluster_controller_test.go @@ -48,6 +48,7 @@ import ( "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/hooks" fakeruntimeclient "sigs.k8s.io/cluster-api/internal/runtime/client/fake" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/kubeconfig" @@ -483,6 +484,7 @@ func TestClusterReconciler_reconcileDelete(t *testing.T) { wantResult ctrl.Result wantOkToDelete bool wantErr bool + wantHookCacheEntry *cache.HookEntry }{ { name: "should apply the ok-to-delete annotation if the BeforeClusterDelete hook returns a non-blocking response", @@ -517,6 +519,13 @@ func TestClusterReconciler_reconcileDelete(t *testing.T) { wantHookToBeCalled: true, wantOkToDelete: false, wantErr: false, + wantHookCacheEntry: 
ptr.To(cache.NewHookEntry(&clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-cluster", + }, + }, runtimehooksv1.BeforeClusterDelete, + time.Now().Add(time.Duration(blockingResponse.RetryAfterSeconds)*time.Second), blockingResponse.Message)), }, { name: "should fail if the BeforeClusterDelete hook returns a failure response", @@ -596,6 +605,7 @@ func TestClusterReconciler_reconcileDelete(t *testing.T) { Client: fakeClient, APIReader: fakeClient, RuntimeClient: fakeRuntimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), } s := scope.New(tt.cluster) @@ -618,6 +628,28 @@ func TestClusterReconciler_reconcileDelete(t *testing.T) { g.Expect(fakeRuntimeClient.CallAllCount(runtimehooksv1.BeforeClusterDelete)).To(Equal(0), "Did not expect hook to be called") } } + + if tt.wantHookCacheEntry != nil { + // Verify the cache entry. + cacheEntry, ok := r.hookCache.Has(tt.wantHookCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry.ObjectKey).To(Equal(tt.wantHookCacheEntry.ObjectKey)) + g.Expect(cacheEntry.HookName).To(Equal(tt.wantHookCacheEntry.HookName)) + g.Expect(cacheEntry.ReconcileAfter).To(BeTemporally("~", tt.wantHookCacheEntry.ReconcileAfter, 5*time.Second)) + g.Expect(cacheEntry.ResponseMessage).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + + // Call reconcileDelete again and verify the cache hit. + g.Expect(fakeRuntimeClient.CallAllCount(runtimehooksv1.BeforeClusterDelete)).To(Equal(1)) + secondResult, err := r.reconcileDelete(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(fakeRuntimeClient.CallAllCount(runtimehooksv1.BeforeClusterDelete)).To(Equal(1)) + g.Expect(s.HookResponseTracker.AggregateMessage("delete")).To(Equal( + fmt.Sprintf("Following hooks are blocking delete: BeforeClusterDelete: %s", tt.wantHookCacheEntry.ResponseMessage))) + // RequeueAfter should be now < then the previous RequeueAfter. 
+ g.Expect(secondResult.RequeueAfter).To(BeNumerically("<", res.RequeueAfter)) + } else { + g.Expect(r.hookCache.Len()).To(Equal(0)) + } }) } } @@ -689,7 +721,8 @@ func TestReconciler_callBeforeClusterCreateHook(t *testing.T) { blockingResponse := &runtimehooksv1.BeforeClusterCreateResponse{ CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ - Status: runtimehooksv1.ResponseStatusSuccess, + Status: runtimehooksv1.ResponseStatusSuccess, + Message: "processing", }, RetryAfterSeconds: int32(10), }, @@ -711,16 +744,24 @@ func TestReconciler_callBeforeClusterCreateHook(t *testing.T) { } tests := []struct { - name string - hookResponse *runtimehooksv1.BeforeClusterCreateResponse - wantResult reconcile.Result - wantErr bool + name string + hookResponse *runtimehooksv1.BeforeClusterCreateResponse + wantResult reconcile.Result + wantErr bool + wantHookCacheEntry *cache.HookEntry }{ { name: "should return a requeue response when the BeforeClusterCreate hook is blocking", hookResponse: blockingResponse, wantResult: ctrl.Result{RequeueAfter: time.Duration(10) * time.Second}, wantErr: false, + wantHookCacheEntry: ptr.To(cache.NewHookEntry(&clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "cluster-1", + }, + }, runtimehooksv1.BeforeClusterCreate, + time.Now().Add(time.Duration(blockingResponse.RetryAfterSeconds)*time.Second), blockingResponse.Message)), }, { name: "should return an empty response when the BeforeClusterCreate hook is not blocking", @@ -743,6 +784,8 @@ func TestReconciler_callBeforeClusterCreateHook(t *testing.T) { Current: &scope.ClusterState{ Cluster: &clusterv1.Cluster{ ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "cluster-1", // Add managedFields and annotations that should be cleaned up before the Cluster is sent to the RuntimeExtension. 
ManagedFields: []metav1.ManagedFieldsEntry{ { @@ -778,6 +821,7 @@ func TestReconciler_callBeforeClusterCreateHook(t *testing.T) { r := &Reconciler{ RuntimeClient: runtimeClient, Client: fake.NewClientBuilder().WithScheme(fakeScheme).WithObjects(s.Current.Cluster).Build(), + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), } res, err := r.callBeforeClusterCreateHook(ctx, s) if tt.wantErr { @@ -786,6 +830,28 @@ func TestReconciler_callBeforeClusterCreateHook(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) g.Expect(res).To(BeComparableTo(tt.wantResult)) } + + if tt.wantHookCacheEntry != nil { + // Verify the cache entry. + cacheEntry, ok := r.hookCache.Has(tt.wantHookCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry.ObjectKey).To(Equal(tt.wantHookCacheEntry.ObjectKey)) + g.Expect(cacheEntry.HookName).To(Equal(tt.wantHookCacheEntry.HookName)) + g.Expect(cacheEntry.ReconcileAfter).To(BeTemporally("~", tt.wantHookCacheEntry.ReconcileAfter, 5*time.Second)) + g.Expect(cacheEntry.ResponseMessage).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + + // Call callBeforeClusterCreateHook again and verify the cache hit. + g.Expect(runtimeClient.CallAllCount(runtimehooksv1.BeforeClusterCreate)).To(Equal(1)) + secondResult, err := r.callBeforeClusterCreateHook(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(runtimeClient.CallAllCount(runtimehooksv1.BeforeClusterCreate)).To(Equal(1)) + g.Expect(s.HookResponseTracker.AggregateMessage("Cluster topology creation")).To(Equal( + fmt.Sprintf("Following hooks are blocking Cluster topology creation: BeforeClusterCreate: %s", tt.wantHookCacheEntry.ResponseMessage))) + // RequeueAfter should be now < then the previous RequeueAfter. 
+ g.Expect(secondResult.RequeueAfter).To(BeNumerically("<", res.RequeueAfter)) + } else { + g.Expect(r.hookCache.Len()).To(Equal(0)) + } }) } } diff --git a/internal/controllers/topology/cluster/reconcile_state.go b/internal/controllers/topology/cluster/reconcile_state.go index 9e2d5f812431..9c1accce86f0 100644 --- a/internal/controllers/topology/cluster/reconcile_state.go +++ b/internal/controllers/topology/cluster/reconcile_state.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strings" + "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -49,6 +50,7 @@ import ( "sigs.k8s.io/cluster-api/internal/topology/ownerrefs" clientutil "sigs.k8s.io/cluster-api/internal/util/client" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/cache" ) const ( @@ -262,6 +264,14 @@ func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope return hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, false, runtimehooksv1.AfterClusterUpgrade) } + if cacheEntry, ok := r.hookCache.Has(cache.NewHookEntryKey(s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade)); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(time.Now()); requeue { + log.V(5).Info(fmt.Sprintf("Skip calling AfterClusterUpgrade hook, retry after %s", requeueAfter)) + s.HookResponseTracker.Add(runtimehooksv1.AfterClusterUpgrade, cacheEntry.ToResponse(&runtimehooksv1.AfterClusterUpgradeResponse{}, requeueAfter)) + return nil + } + } + // DeepCopy cluster because ConvertFrom has side effects like adding the conversion annotation. 
v1beta1Cluster := &clusterv1beta1.Cluster{} if err := v1beta1Cluster.ConvertFrom(s.Current.Cluster.DeepCopy()); err != nil { @@ -280,7 +290,8 @@ func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope s.HookResponseTracker.Add(runtimehooksv1.AfterClusterUpgrade, hookResponse) if hookResponse.RetryAfterSeconds != 0 { - log.Info(fmt.Sprintf("Cluster upgrade to version %s completed but next upgrades are blocked by %s hook", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterClusterUpgrade))) + r.hookCache.Add(cache.NewHookEntry(s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade, time.Now().Add(time.Duration(hookResponse.RetryAfterSeconds)*time.Second), hookResponse.GetMessage())) + log.Info(fmt.Sprintf("Cluster upgrade to version %s completed but next upgrades are blocked by %s hook, retry after %ds", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterClusterUpgrade), hookResponse.RetryAfterSeconds)) return nil } diff --git a/internal/controllers/topology/cluster/reconcile_state_test.go b/internal/controllers/topology/cluster/reconcile_state_test.go index 3cd547d0235f..4a8f3f472803 100644 --- a/internal/controllers/topology/cluster/reconcile_state_test.go +++ b/internal/controllers/topology/cluster/reconcile_state_test.go @@ -65,6 +65,7 @@ import ( "sigs.k8s.io/cluster-api/internal/topology/selectors" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/internal/webhooks" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conversion" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -576,7 +577,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ CommonResponse: runtimehooksv1.CommonResponse{ Status: runtimehooksv1.ResponseStatusSuccess, - Message: "foo", + Message: "processing", }, RetryAfterSeconds: 60, }, @@ -594,6 +595,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { 
wantHookToBeCalled bool wantRetryAfter time.Duration wantError bool + wantHookCacheEntry *cache.HookEntry }{ { name: "hook should not be called if it is not marked", @@ -1224,6 +1226,13 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { wantHookToBeCalled: true, wantRetryAfter: time.Second * time.Duration(blockingResponse.RetryAfterSeconds), wantError: false, + wantHookCacheEntry: ptr.To(cache.NewHookEntry(&clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: "test-cluster", + }, + }, runtimehooksv1.AfterClusterUpgrade, + time.Now().Add(time.Duration(blockingResponse.RetryAfterSeconds)*time.Second), blockingResponse.Message)), }, { name: "hook should be called if the control plane, MDs, and MPs are stable at the topology version - failure response should leave the hook marked", @@ -1301,6 +1310,8 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { fakeClient, clustercache.NewFakeEmptyClusterCache(), fakeRuntimeClient, + cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), + cache.New[desiredstate.GenerateUpgradePlanCacheEntry](10*time.Minute), ) g.Expect(err).ToNot(HaveOccurred()) @@ -1308,6 +1319,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { Client: fakeClient, APIReader: fakeClient, RuntimeClient: fakeRuntimeClient, + hookCache: cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), desiredStateGenerator: desiredStateGenerator, } @@ -1326,6 +1338,26 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) { } g.Expect(hooks.IsPending(runtimehooksv1.AfterClusterUpgrade, tt.s.Current.Cluster)).To(Equal(tt.wantMarked)) + + if tt.wantHookCacheEntry != nil { + // Verify the cache entry. 
+ cacheEntry, ok := r.hookCache.Has(tt.wantHookCacheEntry.Key()) + g.Expect(ok).To(BeTrue()) + g.Expect(cacheEntry.ObjectKey).To(Equal(tt.wantHookCacheEntry.ObjectKey)) + g.Expect(cacheEntry.HookName).To(Equal(tt.wantHookCacheEntry.HookName)) + g.Expect(cacheEntry.ReconcileAfter).To(BeTemporally("~", tt.wantHookCacheEntry.ReconcileAfter, 5*time.Second)) + g.Expect(cacheEntry.ResponseMessage).To(Equal(tt.wantHookCacheEntry.ResponseMessage)) + + // Call callAfterClusterUpgrade again and verify the cache hit. + g.Expect(r.callAfterClusterUpgrade(ctx, tt.s)).To(Succeed()) + g.Expect(fakeRuntimeClient.CallAllCount(runtimehooksv1.AfterClusterUpgrade)).To(Equal(1)) + g.Expect(tt.s.HookResponseTracker.AggregateMessage("upgrade")).To(Equal( + fmt.Sprintf("Following hooks are blocking upgrade: AfterClusterUpgrade: %s", tt.wantHookCacheEntry.ResponseMessage))) + // RequeueAfter should be now < then the previous RequeueAfter. + g.Expect(tt.s.HookResponseTracker.AggregateRetryAfter()).To(BeNumerically("<=", tt.wantRetryAfter)) + } else { + g.Expect(r.hookCache.Len()).To(Equal(0)) + } }) } } diff --git a/internal/controllers/topology/machinedeployment/machinedeployment_controller.go b/internal/controllers/topology/machinedeployment/machinedeployment_controller.go index 23875c76d6a0..2ab4dd985196 100644 --- a/internal/controllers/topology/machinedeployment/machinedeployment_controller.go +++ b/internal/controllers/topology/machinedeployment/machinedeployment_controller.go @@ -33,6 +33,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/internal/controllers/topology/machineset" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/finalizers" @@ -69,12 +70,11 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return err } - err = ctrl.NewControllerManagedBy(mgr). 
+ err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&clusterv1.MachineDeployment{}, builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), - predicates.ResourceNotPaused(mgr.GetScheme(), predicateLog)), + predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), + predicates.ResourceNotPaused(mgr.GetScheme(), predicateLog), ), ). Named("topology/machinedeployment"). @@ -83,13 +83,8 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachineDeployments), - builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterUnpaused(mgr.GetScheme(), predicateLog), - predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), - ), - ), + predicates.ClusterUnpaused(mgr.GetScheme(), predicateLog), + predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), ). Complete(r) if err != nil { diff --git a/internal/controllers/topology/machineset/machineset_controller.go b/internal/controllers/topology/machineset/machineset_controller.go index af80c79d024f..2d17236e2d81 100644 --- a/internal/controllers/topology/machineset/machineset_controller.go +++ b/internal/controllers/topology/machineset/machineset_controller.go @@ -36,6 +36,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/finalizers" @@ -72,13 +73,11 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return err } - err = ctrl.NewControllerManagedBy(mgr). + err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). 
For(&clusterv1.MachineSet{}, builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), - predicates.ResourceNotPaused(mgr.GetScheme(), predicateLog)), - ), + predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog), + predicates.ResourceNotPaused(mgr.GetScheme(), predicateLog)), ). Named("topology/machineset"). WithOptions(options). @@ -86,13 +85,8 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToMachineSets), - builder.WithPredicates( - predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterUnpaused(mgr.GetScheme(), predicateLog), - predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), - ), - ), + predicates.ClusterUnpaused(mgr.GetScheme(), predicateLog), + predicates.ClusterHasTopology(mgr.GetScheme(), predicateLog), ). Complete(r) if err != nil { diff --git a/internal/runtime/client/fake/fake_client.go b/internal/runtime/client/fake/fake_client.go index 02cb17a3c677..7f44b2184e5c 100644 --- a/internal/runtime/client/fake/fake_client.go +++ b/internal/runtime/client/fake/fake_client.go @@ -99,6 +99,7 @@ func (f *RuntimeClientBuilder) Build() *RuntimeClient { callValidations: f.callValidations, catalog: f.catalog, callAllTracker: map[string]int{}, + callTracker: map[string]int{}, } } @@ -114,6 +115,7 @@ type RuntimeClient struct { callResponses map[string]runtimehooksv1.ResponseObject callValidations func(name string, object runtimehooksv1.RequestObject) error + callTracker map[string]int callAllTracker map[string]int } @@ -164,7 +166,11 @@ func (fc *RuntimeClient) CallAllExtensions(ctx context.Context, hook runtimecata } // CallExtension implements Client. 
-func (fc *RuntimeClient) CallExtension(ctx context.Context, _ runtimecatalog.Hook, _ client.Object, name string, req runtimehooksv1.RequestObject, response runtimehooksv1.ResponseObject, _ ...runtimeclient.CallExtensionOption) error { +func (fc *RuntimeClient) CallExtension(ctx context.Context, hook runtimecatalog.Hook, _ client.Object, name string, req runtimehooksv1.RequestObject, response runtimehooksv1.ResponseObject, _ ...runtimeclient.CallExtensionOption) error { + defer func() { + fc.callTracker[runtimecatalog.HookName(hook)]++ + }() + if fc.callValidations != nil { if err := fc.callValidations(name, req); err != nil { return err @@ -217,7 +223,12 @@ func (fc *RuntimeClient) WarmUp(_ *runtimev1.ExtensionConfigList) error { panic("unimplemented") } -// CallAllCount return the number of times a hook was called. +// CallAllCount returns the number of times a hook was called with CallAllExtensions. func (fc *RuntimeClient) CallAllCount(hook runtimecatalog.Hook) int { return fc.callAllTracker[runtimecatalog.HookName(hook)] } + +// CallCount returns the number of times a hook was called with CallExtension. +func (fc *RuntimeClient) CallCount(hook runtimecatalog.Hook) int { + return fc.callTracker[runtimecatalog.HookName(hook)] +} diff --git a/internal/util/controller/builder.go b/internal/util/controller/builder.go new file mode 100644 index 000000000000..a6aa186e52b4 --- /dev/null +++ b/internal/util/controller/builder.go @@ -0,0 +1,230 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "strings" + "time" + + "github.com/go-logr/logr" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" + + "sigs.k8s.io/cluster-api/util/cache" + predicatesutil "sigs.k8s.io/cluster-api/util/predicates" +) + +const ( + // defaultReconciliationTimeout is the default ReconciliationTimeout that is set. + // This means that the context of a Reconcile will time out after 1m. + defaultReconciliationTimeout = 1 * time.Minute +) + +// Builder is a wrapper around controller-runtime's builder.Builder. +type Builder struct { + builder *builder.Builder + mgr manager.Manager + predicateLog logr.Logger + options controller.TypedOptions[reconcile.Request] + forObject client.Object + controllerName string +} + +// NewControllerManagedBy returns a new controller builder that will be started by the provided Manager. +func NewControllerManagedBy(m manager.Manager, predicateLog logr.Logger) *Builder { + return &Builder{ + builder: builder.ControllerManagedBy(m), + mgr: m, + predicateLog: predicateLog, + } +} + +// For defines the type of Object being *reconciled*, and configures the ControllerManagedBy to respond to create / delete / +// update events by *reconciling the object*. +func (blder *Builder) For(object client.Object, opts ...builder.ForOption) *Builder { + blder.forObject = object + blder.builder.For(object, opts...) 
+ return blder +} + +// Owns defines types of Objects being *generated* by the ControllerManagedBy, and configures the ControllerManagedBy to respond to +// create / delete / update events by *reconciling the owner object*. +func (blder *Builder) Owns(object client.Object, predicates ...predicate.Predicate) *Builder { + // Note: Prepend a ResourceIsChanged predicate to all "secondary" watches, this will filter out resync events, + // because resync from the primary object is enough. + // Note: Prepending it to avoid calling (potentially) more resource intensive predicates for resyncs. + predicates = append([]predicate.Predicate{predicatesutil.ResourceIsChanged(blder.mgr.GetScheme(), blder.predicateLog)}, predicates...) + blder.builder.Owns(object, builder.WithPredicates(predicates...)) + return blder +} + +// Watches defines the type of Object to watch, and configures the ControllerManagedBy to respond to create / delete / +// update events by *reconciling the object* with the given EventHandler. +func (blder *Builder) Watches(object client.Object, eventHandler handler.TypedEventHandler[client.Object, reconcile.Request], predicates ...predicate.Predicate) *Builder { + // Note: Prepend a ResourceIsChanged predicate to all "secondary" watches, this will filter out resync events, + // because resync from the primary object is enough. + // Note: Prepending it to avoid calling (potentially) more resource intensive predicates for resyncs. + predicates = append([]predicate.Predicate{predicatesutil.ResourceIsChanged(blder.mgr.GetScheme(), blder.predicateLog)}, predicates...) + blder.builder.Watches(object, eventHandler, builder.WithPredicates(predicates...)) + return blder +} + +// WatchesMetadata is the same as Watches, but forces the internal cache to only watch PartialObjectMetadata. 
+func (blder *Builder) WatchesMetadata(object client.Object, eventHandler handler.TypedEventHandler[client.Object, reconcile.Request], + predicates ...predicate.Predicate) *Builder { + // Note: Prepend a ResourceIsChanged predicate to all "secondary" watches, this will filter out resync events, + // because resync from the primary object is enough. + // Note: Prepending it to avoid calling (potentially) more resource intensive predicates for resyncs. + predicates = append([]predicate.Predicate{predicatesutil.ResourceIsChanged(blder.mgr.GetScheme(), blder.predicateLog)}, predicates...) + blder.builder.Watches(object, eventHandler, builder.WithPredicates(predicates...), builder.OnlyMetadata) + return blder +} + +// WatchesRawSource exposes the lower-level ControllerManagedBy Watches functions through the builder. +func (blder *Builder) WatchesRawSource(src source.TypedSource[reconcile.Request]) *Builder { + blder.builder.WatchesRawSource(src) + return blder +} + +// WithOptions overrides the controller options used in doController. Defaults to empty. +func (blder *Builder) WithOptions(options controller.TypedOptions[reconcile.Request]) *Builder { + blder.options = options + return blder +} + +// WithEventFilter sets the event filters, to filter which create/update/delete/generic events eventually +// trigger reconciliations. For example, filtering on whether the resource version has changed. +func (blder *Builder) WithEventFilter(p predicate.Predicate) *Builder { + blder.builder.WithEventFilter(p) + return blder +} + +// Named sets the name of the controller to the given name. The name shows up +// in metrics, among other things, and thus should be a prometheus compatible name +// (underscores and alphanumeric characters only). 
+func (blder *Builder) Named(name string) *Builder { + blder.controllerName = name + blder.builder.Named(name) + return blder +} + +// Controller is the controller-runtime Controller interface with +// additional methods to defer the next reconcile for a request / object. +type Controller interface { + controller.Controller + DeferNextReconcile(req reconcile.Request, reconcileAfter time.Time) + DeferNextReconcileForObject(obj metav1.Object, reconcileAfter time.Time) +} + +// Complete builds the Application Controller. +func (blder *Builder) Complete(r reconcile.TypedReconciler[reconcile.Request]) error { + _, err := blder.Build(r) + return err +} + +// Build builds the Application Controller and returns the Controller it created. +func (blder *Builder) Build(r reconcile.TypedReconciler[reconcile.Request]) (Controller, error) { + // Get GVK of the for object. + var gvk schema.GroupVersionKind + hasGVK := blder.forObject != nil + if hasGVK { + var err error + gvk, err = apiutil.GVKForObject(blder.forObject, blder.mgr.GetScheme()) + if err != nil { + return nil, err + } + } + + // Get controllerName. + controllerName := blder.controllerName + if controllerName == "" { + controllerName = strings.ToLower(gvk.Kind) + } + + // Default ReconciliationTimeout. + // This means that the context of a Reconcile will time out after 1m. + if blder.options.ReconciliationTimeout == 0 { + blder.options.ReconciliationTimeout = defaultReconciliationTimeout + } + + // Default LogConstructor. + // This overwrites the LogConstructor defaulting in controller-runtime, because we do not want + // to add additional & redundant name & namespace k/v pairs like controller-runtime. 
+ if blder.options.LogConstructor == nil { + log := blder.mgr.GetLogger().WithValues( + "controller", controllerName, + ) + if hasGVK { + log = log.WithValues( + "controllerGroup", gvk.Group, + "controllerKind", gvk.Kind, + ) + } + + blder.options.LogConstructor = func(req *reconcile.Request) logr.Logger { + // Note: This logic has to be inside the LogConstructor as this k/v pair + // is different for every single reconcile.Request + log := log + if req != nil { + if hasGVK { + log = log.WithValues(gvk.Kind, klog.KRef(req.Namespace, req.Name)) + } + // Note: Not setting additional name & namespace k/v pairs like controller-runtime + // as they are redundant. + } + return log + } + } + + // Passing the options to the underlying builder here because we modified them above. + blder.builder.WithOptions(blder.options) + + // Create reconcileCache. + reconcileCache := cache.New[reconcileCacheEntry](cache.DefaultTTL) + + c, err := blder.builder.Build(reconcilerWrapper{ + name: controllerName, + reconciler: r, + reconcileCache: reconcileCache, + }) + if err != nil { + return nil, err + } + + // Initialize metrics to align to what controller-runtime is doing for its metrics. + // Note: This is not done for reconcileTime because we cannot add data to this metric + // without skewing the data. + reconcileTotal.WithLabelValues(controllerName, labelError).Add(0) + reconcileTotal.WithLabelValues(controllerName, labelRequeueAfter).Add(0) + reconcileTotal.WithLabelValues(controllerName, labelRequeue).Add(0) + reconcileTotal.WithLabelValues(controllerName, labelSuccess).Add(0) + + return &controllerWrapper{ + TypedController: c, + reconcileCache: reconcileCache, + }, nil +} diff --git a/internal/util/controller/builder_test.go b/internal/util/controller/builder_test.go new file mode 100644 index 000000000000..a2e4c5d3c9f1 --- /dev/null +++ b/internal/util/controller/builder_test.go @@ -0,0 +1,132 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "reflect" + "testing" + "time" + "unsafe" + + "github.com/go-logr/logr" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/config" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" + + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/cluster-api/util/predicates" +) + +func TestBuilder(t *testing.T) { + g := NewWithT(t) + + predicateLog := ctrl.LoggerFrom(t.Context()).WithValues("controller", "test/cluster") + + scheme := runtime.NewScheme() + g.Expect(clusterv1.AddToScheme(scheme)).To(Succeed()) + + realManager, err := manager.New(&rest.Config{}, manager.Options{}) + g.Expect(err).ToNot(HaveOccurred()) + fakeSink := &fakeSink{} + fakeManager := &fakeManager{ + Manager: realManager, + Scheme: scheme, + Sink: fakeSink, + } + + fakeSource := source.Channel(nil, handler.EnqueueRequestsFromMapFunc(nil)) + + reconciler := reconcile.Func(func(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + ctrl.LoggerFrom(ctx).Info("Reconciling", "request", req) + return reconcile.Result{}, nil + }) + + c, err := NewControllerManagedBy(fakeManager, predicateLog). 
+ For(&clusterv1.Cluster{}). + Named("test/cluster"). + WatchesRawSource(fakeSource). + Watches(&clusterv1.ClusterClass{}, handler.EnqueueRequestsFromMapFunc(nil)). + Watches(&clusterv1.MachineDeployment{}, handler.EnqueueRequestsFromMapFunc(nil), predicates.ResourceIsTopologyOwned(fakeManager.GetScheme(), predicateLog)). + Watches(&clusterv1.MachinePool{}, handler.EnqueueRequestsFromMapFunc(nil)). + WithOptions(controller.Options{}). + WithEventFilter(predicates.ResourceHasFilterLabel(fakeManager.GetScheme(), predicateLog, "labelValue")). + Build(reconciler) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(c).ToNot(BeNil()) + + typedController := (c.(*controllerWrapper)).TypedController + + // Verify ReconciliationTimeout. + var timeDurationType time.Duration + reconciliationTimeout := reflect.NewAt( + reflect.TypeOf(timeDurationType), + unsafe.Pointer(reflect.ValueOf(typedController).Elem().FieldByName("ReconciliationTimeout").UnsafeAddr()), //nolint:gosec // Using unsafe here is ~ fine. + ).Elem().Interface().(time.Duration) + g.Expect(reconciliationTimeout).To(Equal(defaultReconciliationTimeout)) + + // Verify LogConstructor. 
+ g.Expect(fakeSink.keysAndValues).To(Equal([]string{ + "controller", "test/cluster", + "controllerGroup", "cluster.x-k8s.io", + "controllerKind", "Cluster", + })) +} + +type fakeManager struct { + manager.Manager + Scheme *runtime.Scheme + Sink *fakeSink +} + +func (m *fakeManager) Add(_ manager.Runnable) error { + return nil +} + +func (m *fakeManager) GetScheme() *runtime.Scheme { + return m.Scheme +} + +func (m *fakeManager) GetLogger() logr.Logger { + return logr.New(m.Sink) +} + +func (m *fakeManager) GetControllerOptions() config.Controller { + return config.Controller{} +} + +type fakeSink struct { + logr.LogSink + keysAndValues []string +} + +func (s *fakeSink) Init(_ logr.RuntimeInfo) { +} + +func (s *fakeSink) WithValues(keysAndValues ...any) logr.LogSink { + for _, item := range keysAndValues { + s.keysAndValues = append(s.keysAndValues, item.(string)) + } + return s +} diff --git a/internal/util/controller/controller.go b/internal/util/controller/controller.go new file mode 100644 index 000000000000..041f188dba0f --- /dev/null +++ b/internal/util/controller/controller.go @@ -0,0 +1,130 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package controller provides utils for controller-runtime. 
+package controller + +import ( + "context" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/util/cache" +) + +type reconcilerWrapper struct { + name string + reconcileCache cache.Cache[reconcileCacheEntry] + reconciler reconcile.Reconciler +} + +func (r reconcilerWrapper) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + reconcileStartTime := time.Now() + + // Check reconcileCache to ensure we won't run reconcile too frequently. + if cacheEntry, ok := r.reconcileCache.Has(reconcileCacheEntry{Request: req}.Key()); ok { + if requeueAfter, requeue := cacheEntry.ShouldRequeue(reconcileStartTime); requeue { + return ctrl.Result{RequeueAfter: requeueAfter}, nil + } + } + + if feature.Gates.Enabled(feature.ReconcilerRateLimiting) { + // Add entry to the reconcileCache so we won't run Reconcile more than once per second. + // Under certain circumstances the ReconcileAfter time will be set to a later time via DeferNextReconcile / + // DeferNextReconcileForObject, e.g. when we're waiting for Pods to terminate during node drain or + // volumes to detach. This is done to ensure we're not spamming the workload cluster API server. + r.reconcileCache.Add(reconcileCacheEntry{Request: req, ReconcileAfter: reconcileStartTime.Add(1 * time.Second)}) + } + + // Update metrics after processing each item + defer func() { + reconcileTime.WithLabelValues(r.name).Observe(time.Since(reconcileStartTime).Seconds()) + }() + + result, err := r.reconciler.Reconcile(ctx, req) + if err != nil { + // Note: controller-runtime logs a warning if an error is returned in combination with + // RequeueAfter / Requeue. Dropping RequeueAfter and Requeue here to avoid this warning + // (while preserving Priority). 
+ result.RequeueAfter = 0 + result.Requeue = false //nolint:staticcheck // We have to handle Requeue until it is removed + } + switch { + case err != nil: + reconcileTotal.WithLabelValues(r.name, labelError).Inc() + case result.RequeueAfter > 0: + reconcileTotal.WithLabelValues(r.name, labelRequeueAfter).Inc() + case result.Requeue: //nolint: staticcheck // We have to handle Requeue until it is removed + reconcileTotal.WithLabelValues(r.name, labelRequeue).Inc() + default: + reconcileTotal.WithLabelValues(r.name, labelSuccess).Inc() + } + + return result, err +} + +type controllerWrapper struct { + controller.TypedController[reconcile.Request] + reconcileCache cache.Cache[reconcileCacheEntry] +} + +func (c controllerWrapper) DeferNextReconcile(req reconcile.Request, reconcileAfter time.Time) { + c.reconcileCache.Add(reconcileCacheEntry{ + Request: req, + ReconcileAfter: reconcileAfter, + }) +} + +func (c controllerWrapper) DeferNextReconcileForObject(obj metav1.Object, reconcileAfter time.Time) { + c.DeferNextReconcile(reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: obj.GetNamespace(), + Name: obj.GetName(), + }}, reconcileAfter) +} + +// reconcileCacheEntry is an Entry for the Cache that stores the +// earliest time after which the next Reconcile should be executed. +type reconcileCacheEntry struct { + Request reconcile.Request + ReconcileAfter time.Time +} + +var _ cache.Entry = &reconcileCacheEntry{} + +// Key returns the cache key of a reconcileCacheEntry. +func (r reconcileCacheEntry) Key() string { + return r.Request.String() +} + +// ShouldRequeue returns if the current Reconcile should be requeued. 
+func (r reconcileCacheEntry) ShouldRequeue(now time.Time) (requeueAfter time.Duration, requeue bool) { + if r.ReconcileAfter.IsZero() { + return time.Duration(0), false + } + + if r.ReconcileAfter.After(now) { + return r.ReconcileAfter.Sub(now), true + } + + return time.Duration(0), false +} diff --git a/internal/util/controller/controller_fake.go b/internal/util/controller/controller_fake.go new file mode 100644 index 000000000000..e2cd9780fd68 --- /dev/null +++ b/internal/util/controller/controller_fake.go @@ -0,0 +1,47 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func NewFakeController() *FakeController { + return &FakeController{ + Deferrals: map[reconcile.Request]time.Time{}, + } +} + +type FakeController struct { + controller.Controller + Deferrals map[reconcile.Request]time.Time +} + +func (f *FakeController) DeferNextReconcile(req reconcile.Request, reconcileAfter time.Time) { + f.Deferrals[req] = reconcileAfter +} + +func (f *FakeController) DeferNextReconcileForObject(obj metav1.Object, reconcileAfter time.Time) { + f.Deferrals[reconcile.Request{ + NamespacedName: types.NamespacedName{Namespace: obj.GetNamespace(), Name: obj.GetName()}, + }] = reconcileAfter +} diff --git a/internal/util/controller/controller_test.go b/internal/util/controller/controller_test.go new file mode 100644 index 000000000000..3edaf4eb4322 --- /dev/null +++ b/internal/util/controller/controller_test.go @@ -0,0 +1,289 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "testing" + "testing/synctest" + "time" + + . 
"github.com/onsi/gomega" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilfeature "k8s.io/component-base/featuregate/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/util/cache" +) + +func TestReconcile(t *testing.T) { + utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.ReconcilerRateLimiting, true) + + // reconcileCache has to be created outside synctest.Run, otherwise + // the test would fail because of the cleanup goroutine in the cache. + reconcileCache := cache.New[reconcileCacheEntry](cache.DefaultTTL) + + // Using synctest with Go 1.24 requires setting the GOEXPERIMENT env var to synctest. + // In IntelliJ this can be done via Settings > Language & Frameworks > Go > Build Tags > Experiments + synctest.Run(func() { + g := NewWithT(t) + + var reconcileCounter int + r := reconcilerWrapper{ + name: "cluster", + reconcileCache: reconcileCache, + reconciler: reconcile.Func(func(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { + reconcileCounter++ + return reconcile.Result{}, nil + }), + } + c := controllerWrapper{ + reconcileCache: reconcileCache, + } + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: metav1.NamespaceDefault, + Name: "cluster-1", + }, + } + req := reconcile.Request{ + NamespacedName: client.ObjectKeyFromObject(cluster), + } + + // Reconcile will reconcile and defer next reconcile by 1s. 
+ res, err := r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + g.Expect(reconcileCounter).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + + // Reconcile will not reconcile and return RequeueAfter 1s. + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(1 * time.Second)) + g.Expect(reconcileCounter).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + + time.Sleep(1 * time.Second) + + // Reconcile will reconcile and defer next reconcile by 1s. + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + g.Expect(reconcileCounter).To(Equal(2)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(2)) + + // Defer next Reconcile by 11s. + c.DeferNextReconcile(req, time.Now().Add(11*time.Second)) + + // Reconcile will not reconcile and return RequeueAfter 11s. + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(11 * time.Second)) + g.Expect(reconcileCounter).To(Equal(2)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(2)) + + time.Sleep(4 * time.Second) + + // Reconcile will not reconcile and return RequeueAfter 7s (== 11s-4s). + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(7 * time.Second)) + g.Expect(reconcileCounter).To(Equal(2)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(2)) + + time.Sleep(7 * time.Second) + + // Reconcile will reconcile and defer next reconcile by 1s. 
+ res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + g.Expect(reconcileCounter).To(Equal(3)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(3)) + + // Defer next Reconcile by 55s. + c.DeferNextReconcileForObject(cluster, time.Now().Add(55*time.Second)) + + // Reconcile will not reconcile and return RequeueAfter 55s. + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(55 * time.Second)) + g.Expect(reconcileCounter).To(Equal(3)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(3)) + }) +} + +func TestReconcileMetrics(t *testing.T) { + // Note: Feature gate is intentionally turned off for additional test coverage and to avoid + // having to move the clock forward by 1s after every Reconcile call. + utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.ReconcilerRateLimiting, false) + + // reconcileCache has to be created outside synctest.Run, otherwise + // the test would fail because of the cleanup goroutine in the cache. + reconcileCache := cache.New[reconcileCacheEntry](cache.DefaultTTL) + + // Using synctest with Go 1.24 requires setting the GOEXPERIMENT env var to synctest. + // In IntelliJ this can be done via Settings > Language & Frameworks > Go > Build Tags > Experiments + synctest.Run(func() { + g := NewWithT(t) + + r := reconcilerWrapper{ + name: "cluster", + reconcileCache: reconcileCache, + } + + req := reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: metav1.NamespaceDefault, + Name: "cluster-1", + }, + } + + // Reset the metrics (otherwise they have data from other tests). 
+ reconcileTotal.Reset() + reconcileTime.Reset() + + // Success + r.reconciler = reconcile.Func(func(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{}, nil + }) + res, err := r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.IsZero()).To(BeTrue()) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelError))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeueAfter))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeue))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + g.Expect(histogramMetricValue(reconcileTime.WithLabelValues(r.name))).To(Equal(1)) + + // Error + r.reconciler = reconcile.Func(func(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{RequeueAfter: 5 * time.Second}, errors.New("error") // RequeueAfter should be dropped + }) + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).To(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(time.Duration(0))) + g.Expect(res.Requeue).To(BeFalse()) //nolint:staticcheck // We have to handle Requeue until it is removed + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelError))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeueAfter))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeue))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + g.Expect(histogramMetricValue(reconcileTime.WithLabelValues(r.name))).To(Equal(2)) + + // RequeueAfter + r.reconciler = reconcile.Func(func(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + }) + res, err = r.Reconcile(t.Context(), req) 
+ g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(5 * time.Second)) + g.Expect(res.Requeue).To(BeFalse()) //nolint:staticcheck // We have to handle Requeue until it is removed + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelError))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeueAfter))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeue))).To(Equal(0)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + g.Expect(histogramMetricValue(reconcileTime.WithLabelValues(r.name))).To(Equal(3)) + + // Requeue + r.reconciler = reconcile.Func(func(_ context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{Requeue: true}, nil + }) + res, err = r.Reconcile(t.Context(), req) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(time.Duration(0))) + g.Expect(res.Requeue).To(BeTrue()) //nolint:staticcheck // We have to handle Requeue until it is removed + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelError))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeueAfter))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelRequeue))).To(Equal(1)) + g.Expect(counterMetricValue(reconcileTotal.WithLabelValues(r.name, labelSuccess))).To(Equal(1)) + g.Expect(histogramMetricValue(reconcileTime.WithLabelValues(r.name))).To(Equal(4)) + }) +} + +func counterMetricValue(values prometheus.Counter) int { + var dto dto.Metric + if err := values.Write(&dto); err != nil { + panic(err) + } + return int(*dto.GetCounter().Value) +} + +func histogramMetricValue(values prometheus.Observer) int { + var dto dto.Metric + if err := values.(prometheus.Histogram).Write(&dto); err != nil { + panic(err) + } + return int(dto.GetHistogram().GetSampleCount()) +} + +func 
TestShouldRequeue(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + now time.Time + reconcileAfter time.Time + wantRequeue bool + wantRequeueAfter time.Duration + }{ + { + name: "Don't requeue, reconcileAfter is zero", + now: now, + reconcileAfter: time.Time{}, + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + { + name: "Requeue after 15s", + now: now, + reconcileAfter: now.Add(time.Duration(15) * time.Second), + wantRequeue: true, + wantRequeueAfter: time.Duration(15) * time.Second, + }, + { + name: "Don't requeue, reconcileAfter is now", + now: now, + reconcileAfter: now, + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + { + name: "Don't requeue, reconcileAfter is before now", + now: now, + reconcileAfter: now.Add(-time.Duration(60) * time.Second), + wantRequeue: false, + wantRequeueAfter: time.Duration(0), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + gotRequeueAfter, gotRequeue := reconcileCacheEntry{ReconcileAfter: tt.reconcileAfter}.ShouldRequeue(tt.now) + g.Expect(gotRequeue).To(Equal(tt.wantRequeue)) + g.Expect(gotRequeueAfter).To(Equal(tt.wantRequeueAfter)) + }) + } +} diff --git a/internal/util/controller/metrics.go b/internal/util/controller/metrics.go new file mode 100644 index 000000000000..348b69f4a8c1 --- /dev/null +++ b/internal/util/controller/metrics.go @@ -0,0 +1,67 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controller
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// Note: Additional metrics have been added because the ReconcileTotal & ReconcileTime
+// controller-runtime metrics are also counting the Requeues for rate-limiting.
+// Note: The following controller-runtime metrics are correct even with rate-limiting:
+// ReconcileErrors, TerminalReconcileErrors, ReconcilePanics, WorkerCount, ActiveWorkers.
+
+var (
+	// reconcileTotal is a prometheus counter metric which holds the total
+	// number of reconciliations per controller. It has two labels: the controller label refers
+	// to the controller name and the result label refers to the reconcile result, i.e.
+	// success, error, requeue, requeue_after.
+	// Note: The difference between the CAPI and the controller-runtime metric is that the
+	// CAPI metric excludes rate-limiting related reconciles.
+	reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: "capi_reconcile_total",
+		Help: "Total number of reconciliations per controller",
+	}, []string{"controller", "result"})
+
+	// reconcileTime is a prometheus metric which keeps track of the duration
+	// of reconciliations.
+	// Note: The difference between the CAPI and the controller-runtime metric is that the
+	// CAPI metric excludes rate-limiting related reconciles.
+ reconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "capi_reconcile_time_seconds", + Help: "Length of time per reconciliation per controller", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"controller"}) +) + +const ( + labelError = "error" + labelRequeueAfter = "requeue_after" + labelRequeue = "requeue" + labelSuccess = "success" +) + +func init() { + metrics.Registry.MustRegister(reconcileTotal, reconcileTime) +} diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml index a6fe66837f96..1306ac4ad066 100644 --- a/test/e2e/config/docker.yaml +++ b/test/e2e/config/docker.yaml @@ -400,6 +400,7 @@ variables: EXP_RUNTIME_SDK: "true" EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true" EXP_PRIORITY_QUEUE: "false" + EXP_RECONCILER_RATE_LIMITING: "true" EXP_IN_PLACE_UPDATES: "true" EXP_MACHINE_TAINT_PROPAGATION: "true" CAPI_DIAGNOSTICS_ADDRESS: ":8080" diff --git a/test/extension/handlers/topologymutation/handler_integration_test.go b/test/extension/handlers/topologymutation/handler_integration_test.go index fb6c1edba0ba..acb0ead8c20f 100644 --- a/test/extension/handlers/topologymutation/handler_integration_test.go +++ b/test/extension/handlers/topologymutation/handler_integration_test.go @@ -25,6 +25,7 @@ import ( "io" "os" "testing" + "time" . 
"github.com/onsi/gomega" "github.com/pkg/errors" @@ -56,6 +57,7 @@ import ( "sigs.k8s.io/cluster-api/exp/topology/desiredstate" "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/util/cache" "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/contract" "sigs.k8s.io/cluster-api/webhooks" @@ -119,6 +121,8 @@ func TestHandler(t *testing.T) { clientWithV1Beta2ContractCRD, clustercache.NewFakeEmptyClusterCache(), runtimeClient, + cache.New[cache.HookEntry](cache.HookCacheDefaultTTL), + cache.New[desiredstate.GenerateUpgradePlanCacheEntry](10*time.Minute), ) g.Expect(err).ToNot(HaveOccurred()) diff --git a/test/infrastructure/docker/config/manager/manager.yaml b/test/infrastructure/docker/config/manager/manager.yaml index 95be516f73e1..5c515148d0b9 100644 --- a/test/infrastructure/docker/config/manager/manager.yaml +++ b/test/infrastructure/docker/config/manager/manager.yaml @@ -20,7 +20,7 @@ spec: - "--leader-elect" - "--diagnostics-address=${CAPI_DIAGNOSTICS_ADDRESS:=:8443}" - "--insecure-diagnostics=${CAPI_INSECURE_DIAGNOSTICS:=false}" - - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false}" + - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=true},ClusterTopology=${CLUSTER_TOPOLOGY:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false},ReconcilerRateLimiting=${EXP_RECONCILER_RATE_LIMITING:=false}" image: controller:latest name: manager env: diff --git a/test/infrastructure/docker/internal/controllers/devcluster_controller.go b/test/infrastructure/docker/internal/controllers/devcluster_controller.go index 27e455ec7e7a..c5f7e69cd431 100644 --- a/test/infrastructure/docker/internal/controllers/devcluster_controller.go +++ b/test/infrastructure/docker/internal/controllers/devcluster_controller.go @@ -26,12 +26,12 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" ctrl 
"sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" "sigs.k8s.io/cluster-api/test/infrastructure/docker/internal/controllers/backends" @@ -68,17 +68,14 @@ func (r *DevClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Ma } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "devcluster") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DevCluster{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(util.ClusterToInfrastructureMapFunc(ctx, infrav1.GroupVersion.WithKind("DevCluster"), mgr.GetClient(), &infrav1.DevCluster{})), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), - )), + predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), ).Complete(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") diff --git a/test/infrastructure/docker/internal/controllers/devmachine_controller.go b/test/infrastructure/docker/internal/controllers/devmachine_controller.go index 3187ab35de13..3beb5866af0a 100644 --- a/test/infrastructure/docker/internal/controllers/devmachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/devmachine_controller.go @@ -25,7 +25,6 @@ import ( kerrors 
"k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -33,6 +32,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/controllers/clustercache" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" "sigs.k8s.io/cluster-api/test/infrastructure/docker/internal/controllers/backends" @@ -73,27 +73,22 @@ func (r *DevMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Ma return err } - err = ctrl.NewControllerManagedBy(mgr). + err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DevMachine{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("DevMachine"))), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &infrav1.DevCluster{}, handler.EnqueueRequestsFromMapFunc(r.DevClusterToDevMachines), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToDevMachines), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), - )), + predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), ). 
WatchesRawSource(r.ClusterCache.GetClusterSource("devmachine", clusterToDevMachines)). Complete(r) diff --git a/test/infrastructure/docker/internal/controllers/devmachinetemplate_controller.go b/test/infrastructure/docker/internal/controllers/devmachinetemplate_controller.go index 5bb6d5d92d2c..e7ea35d6f6d0 100644 --- a/test/infrastructure/docker/internal/controllers/devmachinetemplate_controller.go +++ b/test/infrastructure/docker/internal/controllers/devmachinetemplate_controller.go @@ -27,6 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" "sigs.k8s.io/cluster-api/util/patch" @@ -92,7 +93,7 @@ func (r *DevMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr return errors.New("Client and ContainerRuntime must not be nil") } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "devmachinetemplate") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DevMachineTemplate{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). 
diff --git a/test/infrastructure/docker/internal/controllers/dockercluster_controller.go b/test/infrastructure/docker/internal/controllers/dockercluster_controller.go index 641e069467a0..af41a772afb6 100644 --- a/test/infrastructure/docker/internal/controllers/dockercluster_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockercluster_controller.go @@ -24,12 +24,12 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" dockerbackend "sigs.k8s.io/cluster-api/test/infrastructure/docker/internal/controllers/backends/docker" @@ -126,17 +126,14 @@ func (r *DockerClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl return errors.New("Client and ContainerRuntime must not be nil") } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "dockercluster") - err := ctrl.NewControllerManagedBy(mgr). + err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DockerCluster{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). 
Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(util.ClusterToInfrastructureMapFunc(ctx, infrav1.GroupVersion.WithKind("DockerCluster"), mgr.GetClient(), &infrav1.DockerCluster{})), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), - )), + predicates.ClusterPausedTransitions(mgr.GetScheme(), predicateLog), ).Complete(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") diff --git a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go index edc9a3ddaf8b..4773abd043e3 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachine_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachine_controller.go @@ -25,7 +25,6 @@ import ( kerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -33,6 +32,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/controllers/clustercache" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" dockerbackend "sigs.k8s.io/cluster-api/test/infrastructure/docker/internal/controllers/backends/docker" @@ -188,27 +188,22 @@ func (r *DockerMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl return err } - err = ctrl.NewControllerManagedBy(mgr). + err = capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DockerMachine{}). WithOptions(options). 
WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("DockerMachine"))), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &infrav1.DockerCluster{}, handler.EnqueueRequestsFromMapFunc(r.dockerClusterToDockerMachines), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToDockerMachines), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), - )), + predicates.ClusterPausedTransitionsOrInfrastructureProvisioned(mgr.GetScheme(), predicateLog), ). WatchesRawSource(r.ClusterCache.GetClusterSource("dockermachine", clusterToDockerMachines)). 
Complete(r) diff --git a/test/infrastructure/docker/internal/controllers/dockermachinepool_controller.go b/test/infrastructure/docker/internal/controllers/dockermachinepool_controller.go index cd2fbbac758b..65c378964c4f 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachinepool_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachinepool_controller.go @@ -31,7 +31,6 @@ import ( "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -39,6 +38,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/controllers/external" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/internal/util/ssa" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" @@ -174,7 +174,7 @@ func (r *DockerMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr return err } - c, err := ctrl.NewControllerManagedBy(mgr). + c, err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DockerMachinePool{}). WithOptions(options). WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). @@ -182,21 +182,16 @@ func (r *DockerMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr &clusterv1.MachinePool{}, handler.EnqueueRequestsFromMapFunc(util.MachinePoolToInfrastructureMapFunc(ctx, infrav1.GroupVersion.WithKind("DockerMachinePool"))), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). 
Watches( &infrav1.DockerMachine{}, handler.EnqueueRequestsFromMapFunc(dockerMachineToDockerMachinePool), - builder.WithPredicates(predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog)), ). Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(clusterToDockerMachinePools), - builder.WithPredicates(predicates.All(mgr.GetScheme(), predicateLog, - predicates.ResourceIsChanged(mgr.GetScheme(), predicateLog), - //nolint:staticcheck // This usage will be removed when adding v1beta2 status and implementing the Paused condition. - predicates.ClusterUnpausedAndInfrastructureProvisioned(mgr.GetScheme(), predicateLog), - )), + //nolint:staticcheck // This usage will be removed when adding v1beta2 status and implementing the Paused condition. + predicates.ClusterUnpausedAndInfrastructureProvisioned(mgr.GetScheme(), predicateLog), ).Build(r) if err != nil { return errors.Wrap(err, "failed setting up with a controller manager") diff --git a/test/infrastructure/docker/internal/controllers/dockermachinetemplate_controller.go b/test/infrastructure/docker/internal/controllers/dockermachinetemplate_controller.go index f41e92957aa2..4f741a249ac0 100644 --- a/test/infrastructure/docker/internal/controllers/dockermachinetemplate_controller.go +++ b/test/infrastructure/docker/internal/controllers/dockermachinetemplate_controller.go @@ -24,6 +24,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" + capicontrollerutil "sigs.k8s.io/cluster-api/internal/util/controller" "sigs.k8s.io/cluster-api/test/infrastructure/container" infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta2" "sigs.k8s.io/cluster-api/util/patch" @@ -77,7 +78,7 @@ func (r *DockerMachineTemplateReconciler) SetupWithManager(ctx context.Context, return errors.New("Client and ContainerRuntime must not be nil") } predicateLog := ctrl.LoggerFrom(ctx).WithValues("controller", "dockermachinetemplate") - err := ctrl.NewControllerManagedBy(mgr). 
+ err := capicontrollerutil.NewControllerManagedBy(mgr, predicateLog). For(&infrav1.DockerMachineTemplate{}). WithOptions(options). WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)). diff --git a/util/cache/cache.go b/util/cache/cache.go index 353ea170d3dd..f26839c0d3e4 100644 --- a/util/cache/cache.go +++ b/util/cache/cache.go @@ -17,18 +17,23 @@ limitations under the License. package cache import ( + "fmt" "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" kcache "k8s.io/client-go/tools/cache" - ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" + runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" ) const ( // DefaultTTL is the duration for which we keep entries in the cache. DefaultTTL = 10 * time.Minute + // HookCacheDefaultTTL is the duration for which we keep entries in the hook cache. + HookCacheDefaultTTL = 24 * time.Hour + // expirationInterval is the interval in which we will remove expired entries // from the cache. expirationInterval = 10 * time.Hour @@ -49,6 +54,9 @@ type Cache[E Entry] interface { // Has checks if the given key (still) exists in the Cache. // Note: entries expire after the ttl. Has(key string) (E, bool) + + // Len returns the number of entries in the cache. + Len() int } // New creates a new cache. @@ -96,47 +104,45 @@ func (r *cache[E]) Has(key string) (E, bool) { return *new(E), false } -// ReconcileEntry is an Entry for the Cache that stores the -// earliest time after which the next Reconcile should be executed. -type ReconcileEntry struct { - Request ctrl.Request - ReconcileAfter time.Time +func (r *cache[E]) Len() int { + return len(r.ListKeys()) +} + +// HookEntry is an Entry for the hook Cache. 
+type HookEntry struct { + ObjectKey client.ObjectKey + HookName string + ReconcileAfter time.Time + ResponseMessage string } -// NewReconcileEntry creates a new ReconcileEntry based on an object and a reconcileAfter time. -func NewReconcileEntry(obj metav1.Object, reconcileAfter time.Time) ReconcileEntry { - return ReconcileEntry{ - Request: ctrl.Request{ - NamespacedName: types.NamespacedName{ - Namespace: obj.GetNamespace(), - Name: obj.GetName(), - }, - }, - ReconcileAfter: reconcileAfter, +// NewHookEntry creates a new HookEntry based on an object and a reconcileAfter time. +func NewHookEntry(obj client.Object, hook runtimecatalog.Hook, reconcileAfter time.Time, responseMessage string) HookEntry { + return HookEntry{ + ObjectKey: client.ObjectKeyFromObject(obj), + HookName: runtimecatalog.HookName(hook), + ReconcileAfter: reconcileAfter, + ResponseMessage: responseMessage, } } -// NewReconcileEntryKey returns the key of a ReconcileEntry based on an object. -func NewReconcileEntryKey(obj metav1.Object) string { - return ReconcileEntry{ - Request: ctrl.Request{ - NamespacedName: types.NamespacedName{ - Namespace: obj.GetNamespace(), - Name: obj.GetName(), - }, - }, +// NewHookEntryKey returns the key of a HookEntry based on an object. +func NewHookEntryKey(obj client.Object, hook runtimecatalog.Hook) string { + return HookEntry{ + ObjectKey: client.ObjectKeyFromObject(obj), + HookName: runtimecatalog.HookName(hook), }.Key() } -var _ Entry = &ReconcileEntry{} +var _ Entry = &HookEntry{} -// Key returns the cache key of a ReconcileEntry. -func (r ReconcileEntry) Key() string { - return r.Request.String() +// Key returns the cache key of a HookEntry. +func (r HookEntry) Key() string { + return fmt.Sprintf("%s: %s", r.HookName, r.ObjectKey.String()) } // ShouldRequeue returns if the current Reconcile should be requeued. 
-func (r ReconcileEntry) ShouldRequeue(now time.Time) (requeueAfter time.Duration, requeue bool) { +func (r HookEntry) ShouldRequeue(now time.Time) (requeueAfter time.Duration, requeue bool) { if r.ReconcileAfter.IsZero() { return time.Duration(0), false } @@ -147,3 +153,16 @@ func (r ReconcileEntry) ShouldRequeue(now time.Time) (requeueAfter time.Duration return time.Duration(0), false } + +// ToResponse uses the values from a HookEntry to set status, message and retryAfterSeconds on a RetryResponseObject. +// requeueAfter is passed in to make it possible to use durations computed from previous ShouldRequeue calls. +func (r HookEntry) ToResponse(resp runtimehooksv1.RetryResponseObject, requeueAfter time.Duration) runtimehooksv1.RetryResponseObject { + resp.SetStatus(runtimehooksv1.ResponseStatusSuccess) + resp.SetMessage(r.ResponseMessage) + if requeueAfter > 0 { + // Note: We have to set at least 1 if requeueAfter > 0 otherwise components like + // the HookResponseTracker won't detect this correctly as a blocking response. + resp.SetRetryAfterSeconds(max(int32(requeueAfter.Round(time.Second).Seconds()), 1)) + } + return resp +} diff --git a/util/cache/cache_test.go b/util/cache/cache_test.go index 8ef9bcf7fba1..4f16a62b793f 100644 --- a/util/cache/cache_test.go +++ b/util/cache/cache_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package cache import ( + "fmt" "testing" "time" @@ -24,12 +25,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" ) func TestCache(t *testing.T) { g := NewWithT(t) - c := New[ReconcileEntry](DefaultTTL) + c := New[HookEntry](DefaultTTL) machine := &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ @@ -37,16 +39,63 @@ func TestCache(t *testing.T) { Name: "test-machine", }, } - entry := NewReconcileEntry(machine, time.Now()) - + entry := NewHookEntry(machine, runtimehooksv1.UpdateMachine, time.Now(), "response message") _, ok := c.Has(entry.Key()) g.Expect(ok).To(BeFalse()) + entryKey := NewHookEntryKey(machine, runtimehooksv1.UpdateMachine) + _, ok = c.Has(entryKey) + g.Expect(ok).To(BeFalse()) + c.Add(entry) entryFromCache, ok := c.Has(entry.Key()) g.Expect(ok).To(BeTrue()) g.Expect(entryFromCache).To(Equal(entry)) + + entryFromCache, ok = c.Has(entryKey) + g.Expect(ok).To(BeTrue()) + g.Expect(entryFromCache).To(Equal(entry)) + + tests := []struct { + requeueAfter time.Duration + expectedRetryAfterSeconds int32 + }{ + { + requeueAfter: 0, + expectedRetryAfterSeconds: 0, + }, + { + requeueAfter: 1 * time.Millisecond, + expectedRetryAfterSeconds: 1, // rounded up. + }, + { + requeueAfter: 1500 * time.Millisecond, + expectedRetryAfterSeconds: 2, // rounded up. 
+ }, + { + requeueAfter: 3 * time.Second, + expectedRetryAfterSeconds: 3, + }, + } + for i := range tests { + tt := tests[i] + t.Run(fmt.Sprintf("requeueAfter: %s", tt.requeueAfter), func(t *testing.T) { + g := NewWithT(t) + + g.Expect(entryFromCache.ToResponse(&runtimehooksv1.UpdateMachineResponse{}, tt.requeueAfter)).To(Equal( + &runtimehooksv1.UpdateMachineResponse{ + CommonRetryResponse: runtimehooksv1.CommonRetryResponse{ + CommonResponse: runtimehooksv1.CommonResponse{ + Status: runtimehooksv1.ResponseStatusSuccess, + Message: entry.ResponseMessage, + }, + RetryAfterSeconds: tt.expectedRetryAfterSeconds, + }, + }, + )) + }) + } } func TestShouldRequeueDrain(t *testing.T) { @@ -92,7 +141,7 @@ func TestShouldRequeueDrain(t *testing.T) { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - gotRequeueAfter, gotRequeue := ReconcileEntry{ReconcileAfter: tt.reconcileAfter}.ShouldRequeue(tt.now) + gotRequeueAfter, gotRequeue := HookEntry{ReconcileAfter: tt.reconcileAfter}.ShouldRequeue(tt.now) g.Expect(gotRequeue).To(Equal(tt.wantRequeue)) g.Expect(gotRequeueAfter).To(Equal(tt.wantRequeueAfter)) })