From 6b2d10753f3d45337748a3c37a21632ba0a08b02 Mon Sep 17 00:00:00 2001
From: Markus Lehtonen <markus.lehtonen@intel.com>
Date: Thu, 13 Apr 2023 16:19:22 +0300
Subject: [PATCH] nfd-master: re-try on node update failures

Change the NFD API handler to retry on node update failures. This works
around transient failures by keeping failed nodes (i.e. nodes that we
failed to update) in the work queue, so that they are retried after the
rate-limit interval instead of waiting for the 1 hour resync period.
---
 pkg/nfd-master/nfd-master.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go
index a82541bef..811ad576d 100644
--- a/pkg/nfd-master/nfd-master.go
+++ b/pkg/nfd-master/nfd-master.go
@@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
 		case <-rateLimit:
 			// Check what we need to do
 			// TODO: we might want to update multiple nodes in parallel
+			errUpdateAll := false
+			errNodes := make(map[string]struct{})
 			if updateAll {
 				if err := m.nfdAPIUpdateAllNodes(); err != nil {
 					klog.Error(err)
+					errUpdateAll = true
 				}
 			} else {
 				for nodeName := range updateNodes {
 					if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
 						klog.Error(err)
+						errNodes[nodeName] = struct{}{}
 					}
 				}
 			}
 
-			// Reset "work queue" and timer
-			updateAll = false
-			updateNodes = make(map[string]struct{})
+			// Reset "work queue" and timer, will cause re-try if errors happened
+			updateAll = errUpdateAll
+			updateNodes = errNodes
 			rateLimit = time.After(time.Second)
 		}
 	}