1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

nfd-master: fix retry of node updates

This patch addresses issues with slow node status (extended resources)
updates. Previously we retried at most 5 times in quick succession, which
could result in the node update failing simply because the node status was
updated more slowly than our retry window allowed. The patch mitigates the
issue by increasing the maximum number of retries to 15. In addition, it
creates a rate limiter with a longer per-item (per-node) base delay.

The patch also fixes the e2e-tests to expose the issue.
This commit is contained in:
Markus Lehtonen 2023-10-20 16:03:10 +03:00
parent b6231b60fc
commit a9849f20ff
3 changed files with 13 additions and 3 deletions

View file

@ -18,7 +18,9 @@ package nfdmaster
import (
"sync"
"time"
"golang.org/x/time/rate"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
)
@ -48,7 +50,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
nodeUpdateRequests.Inc()
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
if queue.NumRequeues(nodeName) < 5 {
if queue.NumRequeues(nodeName) < 15 {
klog.InfoS("retrying node update", "nodeName", nodeName)
queue.AddRateLimited(nodeName)
return true
@ -77,7 +79,14 @@ func (u *nodeUpdaterPool) start(parallelism int) {
}
klog.InfoS("starting the NFD master node updater pool", "parallelism", parallelism)
u.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
// Create ratelimiter. Mimic workqueue.DefaultControllerRateLimiter() but
// with modified per-item (node) rate limiting parameters.
rl := workqueue.NewMaxOfRateLimiter(
workqueue.NewItemExponentialFailureRateLimiter(50*time.Millisecond, 100*time.Second),
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
)
u.queue = workqueue.NewRateLimitingQueue(rl)
for i := 0; i < parallelism; i++ {
u.wg.Add(1)

View file

@ -39,7 +39,7 @@ type k8sAnnotations map[string]string
func eventuallyNonControlPlaneNodes(ctx context.Context, cli clientset.Interface) AsyncAssertion {
return Eventually(func(g Gomega, ctx context.Context) ([]corev1.Node, error) {
return getNonControlPlaneNodes(ctx, cli)
}).WithPolling(1 * time.Second).WithTimeout(10 * time.Second).WithContext(ctx)
}).WithPolling(1 * time.Second).WithTimeout(20 * time.Second).WithContext(ctx)
}
// MatchLabels returns a specialized Gomega matcher for checking if a list of

View file

@ -793,6 +793,7 @@ core:
Expect(err).NotTo(HaveOccurred())
By("Verfiying node status capacity from NodeFeatureRules #4")
expectedCapacity = map[string]corev1.ResourceList{"*": {}}
eventuallyNonControlPlaneNodes(ctx, f.ClientSet).Should(MatchCapacity(expectedCapacity, nodes, false))
By("Deleting nfd-worker daemonset")