From a9849f20ff8bc5955768fe4887295b841ec3bd3a Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Fri, 20 Oct 2023 16:03:10 +0300 Subject: [PATCH] nfd-master: fix retry of node updates This patch addresses issues with slow node status (extended resources) updates. Previously we did just a few retries in quick succession which could result in the node update failing, just because node status was updated slower than our retry window. The patch mitigates the issue by increasing the number of retries to 15. In addition, it creates a rate limiter with a longer per-item (per-node) base delay. The patch also fixes the e2e-tests to expose the issue. --- pkg/nfd-master/node-updater-pool.go | 13 +++++++++++-- test/e2e/gomega.go | 2 +- test/e2e/node_feature_discovery_test.go | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pkg/nfd-master/node-updater-pool.go b/pkg/nfd-master/node-updater-pool.go index 55ce2a712..39eb2234b 100644 --- a/pkg/nfd-master/node-updater-pool.go +++ b/pkg/nfd-master/node-updater-pool.go @@ -18,7 +18,9 @@ package nfdmaster import ( "sync" + "time" + "golang.org/x/time/rate" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" ) @@ -48,7 +50,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI nodeUpdateRequests.Inc() if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil { - if queue.NumRequeues(nodeName) < 5 { + if queue.NumRequeues(nodeName) < 15 { klog.InfoS("retrying node update", "nodeName", nodeName) queue.AddRateLimited(nodeName) return true @@ -77,7 +79,14 @@ func (u *nodeUpdaterPool) start(parallelism int) { } klog.InfoS("starting the NFD master node updater pool", "parallelism", parallelism) - u.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + + // Create ratelimiter. Mimic workqueue.DefaultControllerRateLimiter() but + // with modified per-item (node) rate limiting parameters.
+ rl := workqueue.NewMaxOfRateLimiter( + workqueue.NewItemExponentialFailureRateLimiter(50*time.Millisecond, 100*time.Second), + &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)}, + ) + u.queue = workqueue.NewRateLimitingQueue(rl) for i := 0; i < parallelism; i++ { u.wg.Add(1) diff --git a/test/e2e/gomega.go b/test/e2e/gomega.go index c68363638..2465083b7 100644 --- a/test/e2e/gomega.go +++ b/test/e2e/gomega.go @@ -39,7 +39,7 @@ type k8sAnnotations map[string]string func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion { return Eventually(func(g Gomega, ctx context.Context) ([]corev1.Node, error) { return getNonControlPlaneNodes(ctx, cli) - }).WithPolling(1 * time.Second).WithTimeout(10 * time.Second).WithContext(ctx) + }).WithPolling(1 * time.Second).WithTimeout(20 * time.Second).WithContext(ctx) } // MatchLabels returns a specialized Gomega matcher for checking if a list of diff --git a/test/e2e/node_feature_discovery_test.go b/test/e2e/node_feature_discovery_test.go index 1a744ecd6..6b9adc6ea 100644 --- a/test/e2e/node_feature_discovery_test.go +++ b/test/e2e/node_feature_discovery_test.go @@ -793,6 +793,7 @@ core: Expect(err).NotTo(HaveOccurred()) By("Verfiying node status capacity from NodeFeatureRules #4") + expectedCapacity = map[string]corev1.ResourceList{"*": {}} eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchCapacity(expectedCapacity, nodes, false)) By("Deleting nfd-worker daemonset")