From a2068f7ce3391093a6e6bbdec1ba0fa7840125ba Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Thu, 25 Jul 2024 13:56:14 +0300 Subject: [PATCH] nfd-master: tweak list options for NodeFeature informer Fix cache syncing problems on big clusters with thousands of NodeFeature objects. On the initial list (sync) the client-go cache reflector sets the ResourceVersion to "0" (instead of leaving it empty). This causes problems in the API server, which logs errors like: E writers.go:122] apiserver was unable to write a JSON response: http: Handler timeout E status.go:71] apiserver received an error that is not an metav1.Status: &errors.errorString{s:"http: Handler timeout"}: http: Handler timeout On the nfd-master side we see corresponding log snippets like: W reflector.go:547] failed to list *v1alpha1.NodeFeature: stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer I trace.go:236] "Reflector ListAndWatch" name:*** (***) (total time: 61126ms): ---"Objects listed" error:stream error when reading response body, may be caused by closed connection. Please retry. Original error: stream error: stream ID 1521; INTERNAL_ERROR; received from peer 61126ms (***) Decreasing the page size (opts.Limit) does not have any effect on the timeouts. However, setting ResourceVersion to an empty value seems to get paging back on track, eliminating the timeouts. TODO: investigate in Kubernetes upstream the root cause of the timeouts with ResourceVersion="0". 
--- pkg/nfd-master/nfd-api-controller.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go index fa9ac738f..81b5bd35e 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -29,6 +29,7 @@ import ( nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme" nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions" + nfdinformersv1alpha1 "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions/nfd/v1alpha1" nfdlisters "sigs.k8s.io/node-feature-discovery/api/generated/listers/nfd/v1alpha1" nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1" "sigs.k8s.io/node-feature-discovery/pkg/utils" @@ -67,13 +68,23 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC } nfdClient := nfdclientset.NewForConfigOrDie(config) + klog.V(2).InfoS("initializing new NFD API controller", "options", utils.DelayedDumper(nfdApiControllerOptions)) informerFactory := nfdinformers.NewSharedInformerFactory(nfdClient, nfdApiControllerOptions.ResyncPeriod) // Add informer for NodeFeature objects if !nfdApiControllerOptions.DisableNodeFeature { - featureInformer := informerFactory.Nfd().V1alpha1().NodeFeatures() + tweakListOpts := func(opts *metav1.ListOptions) { + // Tweak list opts on initial sync to avoid timeouts on the apiserver. + // NodeFeature objects are huge and the Kubernetes apiserver + // (v1.30) experiences http handler timeouts when the resource + // version is set to some non-empty value (TODO: find out why). 
+ if opts.ResourceVersion == "0" { + opts.ResourceVersion = "" + } + } + featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures() if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { nfr := obj.(*nfdv1alpha1.NodeFeature)