mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-31 04:04:51 +00:00
nfd-master: re-try on node update failures
Change the NFD API handler to re-try on node update failures. This works around transient failures, ensuring that failed nodes (i.e. nodes that we failed to update) don't have to wait for the 1-hour resync period before being tried again.
This commit is contained in:
parent
e75be0b257
commit
6b2d10753f
1 changed files with 7 additions and 3 deletions
|
@@ -329,21 +329,25 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
|
||||||
case <-rateLimit:
|
case <-rateLimit:
|
||||||
// Check what we need to do
|
// Check what we need to do
|
||||||
// TODO: we might want to update multiple nodes in parallel
|
// TODO: we might want to update multiple nodes in parallel
|
||||||
|
errUpdateAll := false
|
||||||
|
errNodes := make(map[string]struct{})
|
||||||
if updateAll {
|
if updateAll {
|
||||||
if err := m.nfdAPIUpdateAllNodes(); err != nil {
|
if err := m.nfdAPIUpdateAllNodes(); err != nil {
|
||||||
klog.Error(err)
|
klog.Error(err)
|
||||||
|
errUpdateAll = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for nodeName := range updateNodes {
|
for nodeName := range updateNodes {
|
||||||
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
|
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
|
||||||
klog.Error(err)
|
klog.Error(err)
|
||||||
|
errNodes[nodeName] = struct{}{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset "work queue" and timer
|
// Reset "work queue" and timer, will cause re-try if errors happened
|
||||||
updateAll = false
|
updateAll = errUpdateAll
|
||||||
updateNodes = make(map[string]struct{})
|
updateNodes = errNodes
|
||||||
rateLimit = time.After(time.Second)
|
rateLimit = time.After(time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue