mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-28 02:37:11 +00:00
nfd-master: re-try on node update failures
Change the NFD API handler to re-try on node update failures. This works around transient failures and ensures that failed nodes (i.e. nodes that we failed to update) do not have to wait for the 1-hour resync period before being tried again.
This commit is contained in:
parent
e75be0b257
commit
6b2d10753f
1 changed file with 7 additions and 3 deletions
|
@@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
|
|||
case <-rateLimit:
|
||||
// Check what we need to do
|
||||
// TODO: we might want to update multiple nodes in parallel
|
||||
errUpdateAll := false
|
||||
errNodes := make(map[string]struct{})
|
||||
if updateAll {
|
||||
if err := m.nfdAPIUpdateAllNodes(); err != nil {
|
||||
klog.Error(err)
|
||||
errUpdateAll = true
|
||||
}
|
||||
} else {
|
||||
for nodeName := range updateNodes {
|
||||
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
|
||||
klog.Error(err)
|
||||
errNodes[nodeName] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reset "work queue" and timer
|
||||
updateAll = false
|
||||
updateNodes = make(map[string]struct{})
|
||||
// Reset "work queue" and timer, will cause re-try if errors happened
|
||||
updateAll = errUpdateAll
|
||||
updateNodes = errNodes
|
||||
rateLimit = time.After(time.Second)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue