1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2025-03-31 04:04:51 +00:00

nfd-master: re-try on node update failures

Change the NFD API handler to re-try on node update failures. This works
around transient failures, making sure that failed nodes (i.e. nodes
that we failed to update) don't need to wait for the 1 hour resync
period before being tried again.
This commit is contained in:
Markus Lehtonen 2023-04-13 16:19:22 +03:00
parent e75be0b257
commit 6b2d10753f

View file

@@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
case <-rateLimit: case <-rateLimit:
// Check what we need to do // Check what we need to do
// TODO: we might want to update multiple nodes in parallel // TODO: we might want to update multiple nodes in parallel
errUpdateAll := false
errNodes := make(map[string]struct{})
if updateAll { if updateAll {
if err := m.nfdAPIUpdateAllNodes(); err != nil { if err := m.nfdAPIUpdateAllNodes(); err != nil {
klog.Error(err) klog.Error(err)
errUpdateAll = true
} }
} else { } else {
for nodeName := range updateNodes { for nodeName := range updateNodes {
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil { if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
klog.Error(err) klog.Error(err)
errNodes[nodeName] = struct{}{}
} }
} }
} }
// Reset "work queue" and timer // Reset "work queue" and timer, will cause re-try if errors happened
updateAll = false updateAll = errUpdateAll
updateNodes = make(map[string]struct{}) updateNodes = errNodes
rateLimit = time.After(time.Second) rateLimit = time.After(time.Second)
} }
} }