mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
nfd-master: tweak list options for NodeFeature informer
Fix cache syncing problems on big clusters with thousands of NodeFeature objects. On the initial list (sync) the client-go cache reflector sets the ResourceVersion to "0" (instead of leaving it empty). This causes problems in the API server, with apiserver logs like: E writers.go:122] apiserver was unable to write a JSON response: http: Handler timeout E status.go:71] apiserver received an error that is not an metav1.Status: &errors.errorString{s:"http: Handler timeout"}: http: Handler timeout On the nfd-master side we see corresponding log snippets like: W reflector.go:547] failed to list *v1alpha1.NodeFeature: stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer I trace.go:236] "Reflector ListAndWatch" name:*** (***) (total time: 61126ms): ---"Objects listed" error:stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer 61126ms (***) Decreasing the page size (opts.Limit) does not have any effect on the timeouts. However, setting ResourceVersion to an empty value seems to get paging back on track, eliminating the timeouts. TODO: investigate in Kubernetes upstream the root cause of the timeouts with ResourceVersion="0".
This commit is contained in:
parent
bd8d74d6f2
commit
a2068f7ce3
1 changed files with 12 additions and 1 deletions
|
@ -29,6 +29,7 @@ import (
|
|||
nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned"
|
||||
nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme"
|
||||
nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions"
|
||||
nfdinformersv1alpha1 "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions/nfd/v1alpha1"
|
||||
nfdlisters "sigs.k8s.io/node-feature-discovery/api/generated/listers/nfd/v1alpha1"
|
||||
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/utils"
|
||||
|
@ -67,13 +68,23 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC
|
|||
}
|
||||
|
||||
nfdClient := nfdclientset.NewForConfigOrDie(config)
|
||||
|
||||
klog.V(2).InfoS("initializing new NFD API controller", "options", utils.DelayedDumper(nfdApiControllerOptions))
|
||||
|
||||
informerFactory := nfdinformers.NewSharedInformerFactory(nfdClient, nfdApiControllerOptions.ResyncPeriod)
|
||||
|
||||
// Add informer for NodeFeature objects
|
||||
if !nfdApiControllerOptions.DisableNodeFeature {
|
||||
featureInformer := informerFactory.Nfd().V1alpha1().NodeFeatures()
|
||||
tweakListOpts := func(opts *metav1.ListOptions) {
|
||||
// Tweak list opts on initial sync to avoid timeouts on the apiserver.
|
||||
// NodeFeature objects are huge and the Kubernetes apiserver
|
||||
// (v1.30) experiences http handler timeouts when the resource
|
||||
// version is set to some non-empty value (TODO: find out why).
|
||||
if opts.ResourceVersion == "0" {
|
||||
opts.ResourceVersion = ""
|
||||
}
|
||||
}
|
||||
featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures()
|
||||
if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
|
||||
AddFunc: func(obj interface{}) {
|
||||
nfr := obj.(*nfdv1alpha1.NodeFeature)
|
||||
|
|
Loading…
Reference in a new issue