From 6b2d10753f3d45337748a3c37a21632ba0a08b02 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen <markus.lehtonen@intel.com> Date: Thu, 13 Apr 2023 16:19:22 +0300 Subject: [PATCH] nfd-master: re-try on node update failures Change the NFD API handler to re-try on node update failures. This works around transient failures, making sure that failed nodes (i.e. nodes that we failed to update) don't need to wait for the 1-hour resync period before being tried again. --- pkg/nfd-master/nfd-master.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index a82541bef..811ad576d 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() { case <-rateLimit: // Check what we need to do // TODO: we might want to update multiple nodes in parallel + errUpdateAll := false + errNodes := make(map[string]struct{}) if updateAll { if err := m.nfdAPIUpdateAllNodes(); err != nil { klog.Error(err) + errUpdateAll = true } } else { for nodeName := range updateNodes { if err := m.nfdAPIUpdateOneNode(nodeName); err != nil { klog.Error(err) + errNodes[nodeName] = struct{}{} } } } - // Reset "work queue" and timer - updateAll = false - updateNodes = make(map[string]struct{}) + // Reset "work queue" and timer, will cause re-try if errors happened + updateAll = errUpdateAll + updateNodes = errNodes rateLimit = time.After(time.Second) } }