1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2025-03-28 02:37:11 +00:00

nfd-master: re-try on node update failures

Change the NFD API handler to retry on node update failures. This works
around transient failures, ensuring that failed nodes (i.e. nodes
that we failed to update) don't need to wait for the 1-hour resync
period before being tried again.
This commit is contained in:
Markus Lehtonen 2023-04-13 16:19:22 +03:00
parent e75be0b257
commit 6b2d10753f

View file

@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
case <-rateLimit:
// Check what we need to do
// TODO: we might want to update multiple nodes in parallel
errUpdateAll := false
errNodes := make(map[string]struct{})
if updateAll {
if err := m.nfdAPIUpdateAllNodes(); err != nil {
klog.Error(err)
errUpdateAll = true
}
} else {
for nodeName := range updateNodes {
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
klog.Error(err)
errNodes[nodeName] = struct{}{}
}
}
}
// Reset "work queue" and timer
updateAll = false
updateNodes = make(map[string]struct{})
// Reset "work queue" and timer, will cause re-try if errors happened
updateAll = errUpdateAll
updateNodes = errNodes
rateLimit = time.After(time.Second)
}
}