From f5cd6816aa15aec023ec794c6d50b9f417d6acca Mon Sep 17 00:00:00 2001 From: Moritz Johner Date: Tue, 7 Nov 2023 09:51:27 +0100 Subject: [PATCH] feat: fix cert-controller readiness probe (#2857) readiness probes are being executed independently from the leader election status. The current implementation depends on leader election (client cache etc.) to run properly. This commit fixes that by short-circuiting the readiness probes when the mgr is not the leader. This bug surfaces when `leader-election=true` and cert-controller `replicas>=2`. Signed-off-by: Moritz Johner --- cmd/certcontroller.go | 5 +- pkg/controllers/crds/crds_controller.go | 69 +++++++++++++------ pkg/controllers/crds/suite_test.go | 4 +- pkg/controllers/webhookconfig/suite_test.go | 5 +- .../webhookconfig/webhookconfig.go | 39 ++++++++--- 5 files changed, 84 insertions(+), 38 deletions(-) diff --git a/cmd/certcontroller.go b/cmd/certcontroller.go index 161c886e0..d670ccaeb 100644 --- a/cmd/certcontroller.go +++ b/cmd/certcontroller.go @@ -87,7 +87,8 @@ var certcontrollerCmd = &cobra.Command{ setupLog.Error(err, "unable to start manager") os.Exit(1) } - crdctrl := crds.New(mgr.GetClient(), mgr.GetScheme(), + + crdctrl := crds.New(mgr.GetClient(), mgr.GetScheme(), mgr.Elected(), ctrl.Log.WithName("controllers").WithName("webhook-certs-updater"), crdRequeueInterval, serviceName, serviceNamespace, secretName, secretNamespace, crdNames) if err := crdctrl.SetupWithManager(mgr, controller.Options{ @@ -97,7 +98,7 @@ var certcontrollerCmd = &cobra.Command{ os.Exit(1) } - whc := webhookconfig.New(mgr.GetClient(), mgr.GetScheme(), + whc := webhookconfig.New(mgr.GetClient(), mgr.GetScheme(), mgr.Elected(), ctrl.Log.WithName("controllers").WithName("webhook-certs-updater"), serviceName, serviceNamespace, secretName, secretNamespace, crdRequeueInterval) diff --git a/pkg/controllers/crds/crds_controller.go b/pkg/controllers/crds/crds_controller.go index 04bb835bb..ac06f03a9 100644 --- a/pkg/controllers/crds/crds_controller.go +++ b/pkg/controllers/crds/crds_controller.go @@ -72,26 +72,30 @@ type Reconciler struct { RequeueInterval time.Duration // the controller is ready when all crds are injected - rdyMu *sync.Mutex - readyStatusMap map[string]bool + // and the controller is elected as leader + leaderChan <-chan struct{} + leaderElected bool + readyStatusMapMu *sync.Mutex + readyStatusMap map[string]bool } -func New(k8sClient client.Client, scheme *runtime.Scheme, logger logr.Logger, +func New(k8sClient client.Client, scheme *runtime.Scheme, leaderChan <-chan struct{}, logger logr.Logger, interval time.Duration, svcName, svcNamespace, secretName, secretNamespace string, resources []string) *Reconciler { return &Reconciler{ - Client: k8sClient, - Log: logger, - Scheme: scheme, - SvcName: svcName, - SvcNamespace: svcNamespace, - SecretName: secretName, - SecretNamespace: secretNamespace, - RequeueInterval: interval, - CrdResources: resources, - CAName: "external-secrets", - CAOrganization: "external-secrets", - rdyMu: &sync.Mutex{}, - readyStatusMap: map[string]bool{}, + Client: k8sClient, + Log: logger, + Scheme: scheme, + SvcName: svcName, + SvcNamespace: svcNamespace, + SecretName: secretName, + SecretNamespace: secretNamespace, + RequeueInterval: interval, + CrdResources: resources, + CAName: "external-secrets", + CAOrganization: "external-secrets", + leaderChan: leaderChan, + readyStatusMapMu: &sync.Mutex{}, + readyStatusMap: map[string]bool{}, } } @@ -117,14 +121,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu err := r.updateCRD(ctx, req) if err != nil { log.Error(err, "failed to inject conversion webhook") - r.rdyMu.Lock() + r.readyStatusMapMu.Lock() r.readyStatusMap[req.NamespacedName.Name] = false - r.rdyMu.Unlock() + r.readyStatusMapMu.Unlock() return ctrl.Result{}, err } - r.rdyMu.Lock() + r.readyStatusMapMu.Lock() r.readyStatusMap[req.NamespacedName.Name] = true - r.rdyMu.Unlock() + r.readyStatusMapMu.Unlock() } return ctrl.Result{RequeueAfter: r.RequeueInterval}, nil } @@ -132,14 +136,35 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu // ReadyCheck reviews if all webhook configs have been injected into the CRDs // and if the referenced webhook service is ready. func (r *Reconciler) ReadyCheck(_ *http.Request) error { + // skip readiness check if we're not leader + // as we depend on caches and being able to reconcile Webhooks + if !r.leaderElected { + select { + case <-r.leaderChan: + r.leaderElected = true + default: + return nil + } + } + if err := r.checkCRDs(); err != nil { + return err + } + return r.checkEndpoints() +} + +func (r Reconciler) checkCRDs() error { for _, res := range r.CrdResources { - r.rdyMu.Lock() + r.readyStatusMapMu.Lock() rdy := r.readyStatusMap[res] - r.rdyMu.Unlock() + r.readyStatusMapMu.Unlock() if !rdy { return fmt.Errorf(errResNotReady, res) } } + return nil +} + +func (r Reconciler) checkEndpoints() error { var eps corev1.Endpoints err := r.Get(context.TODO(), types.NamespacedName{ Name: r.SvcName, diff --git a/pkg/controllers/crds/suite_test.go b/pkg/controllers/crds/suite_test.go index d4b31da0b..a489242f3 100644 --- a/pkg/controllers/crds/suite_test.go +++ b/pkg/controllers/crds/suite_test.go @@ -77,7 +77,9 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) - rec := New(k8sClient, k8sManager.GetScheme(), log, time.Second*1, + leaderChan := make(chan struct{}) + close(leaderChan) + rec := New(k8sClient, k8sManager.GetScheme(), leaderChan, log, time.Second*1, "foo", "default", "foo", "default", []string{ "secretstores.test.io", }) diff --git a/pkg/controllers/webhookconfig/suite_test.go b/pkg/controllers/webhookconfig/suite_test.go index 1e7e9d676..7d7b3b071 100644 --- a/pkg/controllers/webhookconfig/suite_test.go +++ b/pkg/controllers/webhookconfig/suite_test.go @@ -84,8 +84,9 @@ var _ = BeforeSuite(func() { k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) - - reconciler = New(k8sClient, k8sManager.GetScheme(), ctrl.Log, ctrlSvcName, ctrlSvcNamespace, ctrlSecretName, ctrlSecretNamespace, time.Second) + leaderChan := make(chan struct{}) + close(leaderChan) + reconciler = New(k8sClient, k8sManager.GetScheme(), leaderChan, ctrl.Log, ctrlSvcName, ctrlSvcNamespace, ctrlSecretName, ctrlSecretNamespace, time.Second) reconciler.SetupWithManager(k8sManager, controller.Options{}) Expect(err).ToNot(HaveOccurred()) diff --git a/pkg/controllers/webhookconfig/webhookconfig.go b/pkg/controllers/webhookconfig/webhookconfig.go index 39c124f10..f1e42923a 100644 --- a/pkg/controllers/webhookconfig/webhookconfig.go +++ b/pkg/controllers/webhookconfig/webhookconfig.go @@ -46,11 +46,16 @@ type Reconciler struct { SecretName string SecretNamespace string - rdyMu *sync.Mutex - ready bool + // store state for the readiness probe. + // we're ready when we're not the leader or + // if we've reconciled the webhook config when we're the leader. + leaderChan <-chan struct{} + leaderElected bool + webhookReadyMu *sync.Mutex + webhookReady bool } -func New(k8sClient client.Client, scheme *runtime.Scheme, +func New(k8sClient client.Client, scheme *runtime.Scheme, leaderChan <-chan struct{}, log logr.Logger, svcName, svcNamespace, secretName, secretNamespace string, requeueInterval time.Duration) *Reconciler { return &Reconciler{ @@ -62,8 +67,10 @@ func New(k8sClient client.Client, scheme *runtime.Scheme, SvcNamespace: svcNamespace, SecretName: secretName, SecretNamespace: secretNamespace, - rdyMu: &sync.Mutex{}, - ready: false, + leaderChan: leaderChan, + leaderElected: false, + webhookReadyMu: &sync.Mutex{}, + webhookReady: false, } } @@ -109,9 +116,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu // right now we only have one single // webhook config we care about - r.rdyMu.Lock() - defer r.rdyMu.Unlock() - r.ready = true + r.webhookReadyMu.Lock() + defer r.webhookReadyMu.Unlock() + r.webhookReady = true return ctrl.Result{ RequeueAfter: r.RequeueDuration, }, nil @@ -126,9 +133,19 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) } func (r *Reconciler) ReadyCheck(_ *http.Request) error { - r.rdyMu.Lock() - defer r.rdyMu.Unlock() - if !r.ready { + // skip readiness check if we're not leader + // as we depend on caches and being able to reconcile Webhooks + if !r.leaderElected { + select { + case <-r.leaderChan: + r.leaderElected = true + default: + return nil + } + } + r.webhookReadyMu.Lock() + defer r.webhookReadyMu.Unlock() + if !r.webhookReady { return fmt.Errorf(errWebhookNotReady) } var eps v1.Endpoints