From 4c4db8be407e59abafd1af4d374020bf7bb58b90 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 11:25:56 -0800 Subject: [PATCH 01/23] add configurable pagination to nfd-master --- api/nfd/go.mod | 5 +++-- cmd/nfd-master/main.go | 2 ++ pkg/nfd-master/nfd-api-controller.go | 4 ++++ pkg/nfd-master/nfd-master.go | 4 +++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/api/nfd/go.mod b/api/nfd/go.mod index cf24a9355..1f4ba3960 100644 --- a/api/nfd/go.mod +++ b/api/nfd/go.mod @@ -1,7 +1,8 @@ module sigs.k8s.io/node-feature-discovery/api/nfd -go 1.22.2 -toolchain go1.23.7 +go 1.23.0 + +toolchain go1.24.0 require ( github.com/stretchr/testify v1.8.4 diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 774b47335..ce7e2789d 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -117,6 +117,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "in the same format as in the config file (i.e. json or yaml). These options") flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. 
Enable this when running more than one replica on nfd master.") + flagset.Int64Var(&args.ListSize, "node-feature-informer-list-size", 0, + "the list size to use when listing node features to sync informer cache") args.Klog = klogutils.InitKlogFlags(flagset) diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go index a3870f489..9a795fe57 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -57,6 +57,7 @@ type nfdApiControllerOptions struct { ResyncPeriod time.Duration K8sClient k8sclient.Interface NodeFeatureNamespaceSelector *metav1.LabelSelector + ListSize int64 } func init() { @@ -101,6 +102,9 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC if opts.ResourceVersion == "0" { opts.ResourceVersion = "" } + if nfdApiControllerOptions.ListSize != 0 { + opts.Limit = nfdApiControllerOptions.ListSize + } } featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures() if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index fbe12e040..0de556ea2 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -45,7 +45,6 @@ import ( restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" - "k8s.io/klog/v2" controller "k8s.io/kubernetes/pkg/controller" taintutils "k8s.io/kubernetes/pkg/util/taints" "sigs.k8s.io/yaml" @@ -122,6 +121,8 @@ type Args struct { Prune bool Options string EnableLeaderElection bool + MetricsPort int + ListSize int64 Overrides ConfigOverrideArgs } @@ -1295,6 +1296,7 @@ func (m *nfdMaster) startNfdApiController() error { ResyncPeriod: m.config.ResyncPeriod.Duration, K8sClient: m.k8sClient, NodeFeatureNamespaceSelector: m.config.Restrictions.NodeFeatureNamespaceSelector, + ListSize: m.args.ListSize, }) if err != 
nil { return fmt.Errorf("failed to initialize CRD controller: %w", err) From 96d47a4a9b412f3409e7855e70c38bcc22b69bc9 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 17:39:13 -0800 Subject: [PATCH 02/23] add comments and better arg description --- cmd/nfd-master/main.go | 2 +- pkg/nfd-master/nfd-api-controller.go | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index ce7e2789d..a4f08ed1c 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -118,7 +118,7 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. Enable this when running more than one replica on nfd master.") flagset.Int64Var(&args.ListSize, "node-feature-informer-list-size", 0, - "the list size to use when listing node features to sync informer cache") + "The list size to use when listing node features to sync informer cache. 
Size of zero disables pagination.") args.Klog = klogutils.InitKlogFlags(flagset) diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go index 9a795fe57..5820f67d3 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -102,9 +102,7 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC if opts.ResourceVersion == "0" { opts.ResourceVersion = "" } - if nfdApiControllerOptions.ListSize != 0 { - opts.Limit = nfdApiControllerOptions.ListSize - } + opts.Limit = nfdApiControllerOptions.ListSize // value of 0 disables pagination } featureInformer := nfdinformersv1alpha1.New(informerFactory, "", tweakListOpts).NodeFeatures() if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ From 4bb892fd4d4b2cc0c412997af8e09101ee6e6dc9 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 17:40:08 -0800 Subject: [PATCH 03/23] shorten flag name --- cmd/nfd-master/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index a4f08ed1c..835583dea 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -117,7 +117,7 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "in the same format as in the config file (i.e. json or yaml). These options") flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. Enable this when running more than one replica on nfd master.") - flagset.Int64Var(&args.ListSize, "node-feature-informer-list-size", 0, + flagset.Int64Var(&args.ListSize, "informer-list-size", 0, "The list size to use when listing node features to sync informer cache. 
Size of zero disables pagination.") args.Klog = klogutils.InitKlogFlags(flagset) From b23893c1eee6d40ad9e28772429ca63d5f988dbc Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 17:43:47 -0800 Subject: [PATCH 04/23] add documentation --- docs/usage/nfd-master.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/usage/nfd-master.md b/docs/usage/nfd-master.md index 2c4ba201d..cec99430d 100644 --- a/docs/usage/nfd-master.md +++ b/docs/usage/nfd-master.md @@ -84,3 +84,10 @@ If you have RBAC authorization enabled (as is the default e.g. with clusters initialized with kubeadm) you need to configure the appropriate ClusterRoles, ClusterRoleBindings and a ServiceAccount for NFD to create node labels. The provided template will configure these for you. + +## Informer List Pagination + +When NFD Master starts up it starts an informer on the nodefeatures resources. +These resources can be large and in a large cluster this initial list call to sync the informer cache can be +expensive and heavy on api-server/etcd. You can use the `informer-list-size` argument to NFD master to control pagination size +to help control the load during NFD-Master restart. From 6c12e4d8b6135473980b9aaea691fdb9c89bc3e9 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 17:51:36 -0800 Subject: [PATCH 05/23] add documentation --- docs/reference/master-commandline-reference.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index e485c7841..82a8e06d3 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -11,7 +11,7 @@ sort: 1 {: .no_toc .text-delta} 1. 
TOC -{:toc} + {:toc} --- @@ -173,6 +173,19 @@ Example: nfd-master -deny-label-ns=*.vendor.com,vendor-2.io ``` +### -informer-list-size + +The `-informer-list-size` flag is used to control pagination during informer cache sync on nfd-master startup. +This is useful to control load on api-server/etcd as listing `nodefeatures` can be expensive, especially in large clusters. + +Default: 0 (no pagination) + +Example: + +```bash +nfd-master -informer-list-size=200 +``` + ### -config The `-config` flag specifies the path of the nfd-master configuration file to From 931b61afac79b6d7d35d2dc50f7451ef94891b05 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 17:52:04 -0800 Subject: [PATCH 06/23] clean --- docs/reference/master-commandline-reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index 82a8e06d3..3f2204c35 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -11,7 +11,7 @@ sort: 1 {: .no_toc .text-delta} 1. 
TOC - {:toc} +{:toc} --- From a2f73944ea3610555a7543f492ec27653c11807f Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 19:27:32 -0800 Subject: [PATCH 07/23] fix listing to paginate continue --- .../nfd/v1alpha1/nodefeature.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go index 4341709a7..09f6c9700 100644 --- a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go +++ b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go @@ -20,6 +20,7 @@ package v1alpha1 import ( context "context" + "go.uber.org/zap" time "time" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -62,7 +63,23 @@ func NewFilteredNodeFeatureInformer(client versioned.Interface, namespace string if tweakListOptions != nil { tweakListOptions(&options) } - return client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) + if options.Limit == 0 { + return client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) + } + featureList := &apinfdv1alpha1.NodeFeatureList{} + // do paginated list + for { + features, err := client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) + if err != nil { + return nil, err + } + featureList.Items = append(featureList.Items, features.Items...) 
+ if features.Continue == "" { + break + } + options.Continue = features.Continue + } + return featureList, nil }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { From f7344ca2f90b4f1ca75e78cb17f2c244aec8e22f Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 19:39:41 -0800 Subject: [PATCH 08/23] fix dep --- .../informers/externalversions/nfd/v1alpha1/nodefeature.go | 1 - 1 file changed, 1 deletion(-) diff --git a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go index 09f6c9700..d1db396d0 100644 --- a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go +++ b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go @@ -20,7 +20,6 @@ package v1alpha1 import ( context "context" - "go.uber.org/zap" time "time" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" From 00d8fd662e6eebd21ecc14d28ee78e0aaef75ca7 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 19:52:22 -0800 Subject: [PATCH 09/23] limit is automatically handled for you --- .../nfd/v1alpha1/nodefeature.go | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go index d1db396d0..4341709a7 100644 --- a/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go +++ b/api/generated/informers/externalversions/nfd/v1alpha1/nodefeature.go @@ -62,23 +62,7 @@ func NewFilteredNodeFeatureInformer(client versioned.Interface, namespace string if tweakListOptions != nil { tweakListOptions(&options) } - if options.Limit == 0 { - return client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) - } - featureList := &apinfdv1alpha1.NodeFeatureList{} - // do paginated list - for { - features, err := 
client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) - if err != nil { - return nil, err - } - featureList.Items = append(featureList.Items, features.Items...) - if features.Continue == "" { - break - } - options.Continue = features.Continue - } - return featureList, nil + return client.NfdV1alpha1().NodeFeatures(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { From ea99b73decfe3bc15691db31a39afed7e65fca22 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Sat, 4 Jan 2025 20:04:43 -0800 Subject: [PATCH 10/23] update default --- cmd/nfd-master/main.go | 4 ++-- docs/reference/master-commandline-reference.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 835583dea..76db46cef 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -117,8 +117,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "in the same format as in the config file (i.e. json or yaml). These options") flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. Enable this when running more than one replica on nfd master.") - flagset.Int64Var(&args.ListSize, "informer-list-size", 0, - "The list size to use when listing node features to sync informer cache. 
Size of zero disables pagination.") + flagset.Int64Var(&args.ListSize, "informer-list-size", 200, + "The list size to use when listing node features to sync informer cache.") args.Klog = klogutils.InitKlogFlags(flagset) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index 3f2204c35..64f4cb841 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -178,7 +178,7 @@ nfd-master -deny-label-ns=*.vendor.com,vendor-2.io The `-informer-list-size` flag is used to control pagination during informer cache sync on nfd-master startup. This is useful to control load on api-server/etcd as listing `nodefeatures` can be expensive, especially in large clusters. -Default: 0 (no pagination) +Default: 200 Example: From f37bbad8e8661bb004f64454b88df10e0ddef8cd Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Wed, 8 Jan 2025 09:33:47 -0800 Subject: [PATCH 11/23] Update cmd/nfd-master/main.go Co-authored-by: Markus Lehtonen --- cmd/nfd-master/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 76db46cef..621c2cf42 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -118,7 +118,7 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. 
Enable this when running more than one replica on nfd master.") flagset.Int64Var(&args.ListSize, "informer-list-size", 200, - "The list size to use when listing node features to sync informer cache.") + "The list size to use when listing NodeFeature objects to sync informer cache.") args.Klog = klogutils.InitKlogFlags(flagset) From 1f3b2b7f99bb7bf2dc9c06c7b1007268808b8285 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Wed, 8 Jan 2025 09:33:58 -0800 Subject: [PATCH 12/23] Update docs/reference/master-commandline-reference.md Co-authored-by: Markus Lehtonen --- docs/reference/master-commandline-reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index 64f4cb841..aa45c05d1 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -176,7 +176,7 @@ nfd-master -deny-label-ns=*.vendor.com,vendor-2.io ### -informer-list-size The `-informer-list-size` flag is used to control pagination during informer cache sync on nfd-master startup. -This is useful to control load on api-server/etcd as listing `nodefeatures` can be expensive, especially in large clusters. +This is useful to control load on api-server/etcd as listing NodeFeature objects can be expensive, especially in large clusters. 
Default: 200 From a0e601b069d4bb1912e4fab57483bf73442fffd9 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Thu, 9 Jan 2025 11:36:32 -0800 Subject: [PATCH 13/23] update flag name --- cmd/nfd-master/main.go | 2 +- docs/reference/master-commandline-reference.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 621c2cf42..8de28337f 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -117,7 +117,7 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "in the same format as in the config file (i.e. json or yaml). These options") flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. Enable this when running more than one replica on nfd master.") - flagset.Int64Var(&args.ListSize, "informer-list-size", 200, + flagset.Int64Var(&args.ListSize, "informer-page-size", 200, "The list size to use when listing NodeFeature objects to sync informer cache.") args.Klog = klogutils.InitKlogFlags(flagset) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index aa45c05d1..5a202ffdf 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -173,9 +173,9 @@ Example: nfd-master -deny-label-ns=*.vendor.com,vendor-2.io ``` -### -informer-list-size +### -informer-page-size -The `-informer-list-size` flag is used to control pagination during informer cache sync on nfd-master startup. +The `-informer-page-size` flag is used to control pagination during informer cache sync on nfd-master startup. This is useful to control load on api-server/etcd as listing NodeFeature objects can be expensive, especially in large clusters. 
Default: 200 @@ -183,7 +183,7 @@ Default: 200 Example: ```bash -nfd-master -informer-list-size=200 +nfd-master -informer-page-size=20 ``` ### -config From 33968a321d1dc5090bda21bad03b324930c7baad Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Thu, 9 Jan 2025 11:39:16 -0800 Subject: [PATCH 14/23] line length --- docs/reference/master-commandline-reference.md | 6 ++++-- docs/usage/nfd-master.md | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index 5a202ffdf..aa4ac0d96 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -175,8 +175,10 @@ nfd-master -deny-label-ns=*.vendor.com,vendor-2.io ### -informer-page-size -The `-informer-page-size` flag is used to control pagination during informer cache sync on nfd-master startup. -This is useful to control load on api-server/etcd as listing NodeFeature objects can be expensive, especially in large clusters. +The `-informer-page-size` flag is used to control pagination +during informer cache sync on nfd-master startup. +This is useful to control load on api-server/etcd as listing +NodeFeature objects can be expensive, especially in large clusters. Default: 200 diff --git a/docs/usage/nfd-master.md b/docs/usage/nfd-master.md index cec99430d..03fa97134 100644 --- a/docs/usage/nfd-master.md +++ b/docs/usage/nfd-master.md @@ -88,6 +88,7 @@ labels. The provided template will configure these for you. ## Informer List Pagination When NFD Master starts up it starts an informer on the nodefeatures resources. -These resources can be large and in a large cluster this initial list call to sync the informer cache can be -expensive and heavy on api-server/etcd. You can use the `informer-list-size` argument to NFD master to control pagination size -to help control the load during NFD-Master restart. 
+These resources can be large and in a large cluster this initial list call +to sync the informer cache can be expensive and heavy on api-server/etcd. +You can use the `informer-list-size` argument to NFD master to +control pagination size to help control the load during NFD-Master restart. From aaf456aa1773eac95c9d5dac1306da427959265c Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Thu, 9 Jan 2025 11:40:52 -0800 Subject: [PATCH 15/23] trailing spaces --- docs/reference/master-commandline-reference.md | 4 ++-- docs/usage/nfd-master.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/reference/master-commandline-reference.md b/docs/reference/master-commandline-reference.md index aa4ac0d96..441f3d769 100644 --- a/docs/reference/master-commandline-reference.md +++ b/docs/reference/master-commandline-reference.md @@ -175,9 +175,9 @@ nfd-master -deny-label-ns=*.vendor.com,vendor-2.io ### -informer-page-size -The `-informer-page-size` flag is used to control pagination +The `-informer-page-size` flag is used to control pagination during informer cache sync on nfd-master startup. -This is useful to control load on api-server/etcd as listing +This is useful to control load on api-server/etcd as listing NodeFeature objects can be expensive, especially in large clusters. Default: 200 diff --git a/docs/usage/nfd-master.md b/docs/usage/nfd-master.md index 03fa97134..eb2823cab 100644 --- a/docs/usage/nfd-master.md +++ b/docs/usage/nfd-master.md @@ -88,7 +88,7 @@ labels. The provided template will configure these for you. ## Informer List Pagination When NFD Master starts up it starts an informer on the nodefeatures resources. -These resources can be large and in a large cluster this initial list call -to sync the informer cache can be expensive and heavy on api-server/etcd. 
-You can use the `informer-list-size` argument to NFD master to +These resources can be large and in a large cluster this initial list call +to sync the informer cache can be expensive and heavy on api-server/etcd. +You can use the `informer-page-size` argument to NFD master to control pagination size to help control the load during NFD-Master restart. From a3712554fe0ad16b063d1d2e0d943336b6995557 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Thu, 13 Feb 2025 10:21:57 -0800 Subject: [PATCH 16/23] config file for informerPageSize --- cmd/nfd-master/main.go | 4 ++-- .../helm/node-feature-discovery/values.yaml | 1 + docs/reference/master-configuration-reference.md | 15 +++++++++++++++ pkg/nfd-master/nfd-master.go | 9 +++++++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 8de28337f..e55211bba 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -117,8 +117,6 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "in the same format as in the config file (i.e. json or yaml). These options") flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false, "Enables a leader election. Enable this when running more than one replica on nfd master.") - flagset.Int64Var(&args.ListSize, "informer-page-size", 200, - "The list size to use when listing NodeFeature objects to sync informer cache.") args.Klog = klogutils.InitKlogFlags(flagset) @@ -142,6 +140,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) flagset.Var(overrides.ResyncPeriod, "resync-period", "Specify the NFD API controller resync period.") overrides.NfdApiParallelism = flagset.Int("nfd-api-parallelism", 10, "Defines the maximum number of goroutines responsible of updating nodes. 
"+ "Can be used for the throttling mechanism.") + overrides.InformerPageSize = flagset.Int64("informer-page-size", 200, + "The list size to use when listing NodeFeature objects to sync informer cache.") return args, overrides } diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index d71b7bb5b..93abcc715 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -66,6 +66,7 @@ master: # # this value has to be greater than 0 # retryPeriod: 2s # nfdApiParallelism: 10 + # informerPageSize: 50 ### port: 8080 instance: diff --git a/docs/reference/master-configuration-reference.md b/docs/reference/master-configuration-reference.md index 5ade9a3ea..0fa791153 100644 --- a/docs/reference/master-configuration-reference.md +++ b/docs/reference/master-configuration-reference.md @@ -216,6 +216,21 @@ Example: nfdApiParallelism: 1 ``` +## informerPageSize + +The `informerPageSize` option is used to control pagination +during informer cache sync on nfd-master startup. +This is useful to control load on api-server/etcd as listing +NodeFeature objects can be expensive, especially in large clusters. + +Default: 200 + +Example: + +```yaml +informerPageSize: 50 +``` + ## klog The following options specify the logger configuration. 
Most of which can be diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 0de556ea2..767cc18b8 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -91,6 +91,7 @@ type NFDConfig struct { NfdApiParallelism int Klog klogutils.KlogConfigOpts Restrictions Restrictions + InformerPageSize int64 } // LeaderElectionConfig contains the configuration for leader election @@ -109,6 +110,7 @@ type ConfigOverrideArgs struct { NoPublish *bool ResyncPeriod *utils.DurationVal NfdApiParallelism *int + InformerPageSize *int64 } // Args holds command line arguments @@ -122,7 +124,6 @@ type Args struct { Options string EnableLeaderElection bool MetricsPort int - ListSize int64 Overrides ConfigOverrideArgs } @@ -243,6 +244,7 @@ func newDefaultConfig() *NFDConfig { NfdApiParallelism: 10, EnableTaints: false, ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour}, + InformerPageSize: 200, LeaderElection: LeaderElectionConfig{ LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second}, RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second}, @@ -1191,6 +1193,9 @@ func (m *nfdMaster) configure(filepath string, overrides string) error { if m.args.Overrides.NfdApiParallelism != nil { c.NfdApiParallelism = *m.args.Overrides.NfdApiParallelism } + if m.args.Overrides.InformerPageSize != nil { + c.InformerPageSize = *m.args.Overrides.InformerPageSize + } if c.NfdApiParallelism <= 0 { return fmt.Errorf("the maximum number of concurrent labelers should be a non-zero positive number") @@ -1296,7 +1301,7 @@ func (m *nfdMaster) startNfdApiController() error { ResyncPeriod: m.config.ResyncPeriod.Duration, K8sClient: m.k8sClient, NodeFeatureNamespaceSelector: m.config.Restrictions.NodeFeatureNamespaceSelector, - ListSize: m.args.ListSize, + ListSize: m.config.InformerPageSize, }) if err != nil { return fmt.Errorf("failed to initialize CRD controller: %w", err) From 
a8e862373c9565cf5bb2e572ae3d08d43641f4a8 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Thu, 13 Feb 2025 10:26:45 -0800 Subject: [PATCH 17/23] update comment --- pkg/nfd-master/nfd-api-controller.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go index 5820f67d3..231b88ac4 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -25,8 +25,6 @@ import ( k8sclient "k8s.io/client-go/kubernetes" restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme" nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions" @@ -98,7 +96,13 @@ func newNfdController(config *restclient.Config, nfdApiControllerOptions nfdApiC // Tweak list opts on initial sync to avoid timeouts on the apiserver. // NodeFeature objects are huge and the Kubernetes apiserver // (v1.30) experiences http handler timeouts when the resource - // version is set to some non-empty value (TODO: find out why). 
+ // version is set to some non-empty value. + // See https://github.com/kubernetes/kubernetes/blob/ace55542575fb098b3e413692bbe2bc20d2348ba/staging/src/k8s.io/apiserver/pkg/storage/cacher/cacher.go#L600-L616: if the resource version is set to "0", + // the apiserver serves the request from its cache and does not use pagination; otherwise pagination defaults to 500, + // which is why this is required on large clusters. + // By clearing the resource version we make the request go to etcd instead of the apiserver cache. There is some WIP in k/k + // that seems to imply they are working on improving this behavior so that paginated lists can be served from the apiserver cache; + // it is not supported yet (2/2025), but it would be good to track: kubernetes/kubernetes#108003 if opts.ResourceVersion == "0" { opts.ResourceVersion = "" } From 5a7d1fb36481cda89801d2dc9031f58fc70177ce Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Tue, 18 Feb 2025 20:50:29 -0800 Subject: [PATCH 18/23] goland removed dep --- pkg/nfd-master/nfd-api-controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/nfd-master/nfd-api-controller.go b/pkg/nfd-master/nfd-api-controller.go index 231b88ac4..2a32dc531 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -25,6 +25,8 @@ import ( k8sclient "k8s.io/client-go/kubernetes" restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" + nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme" nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions" From 47139d8b250319da25ea12743e47a133dac797ee Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Tue, 18 Feb 2025 21:01:39 -0800 Subject: [PATCH 19/23] format --- pkg/nfd-master/nfd-api-controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/nfd-master/nfd-api-controller.go 
b/pkg/nfd-master/nfd-api-controller.go index 2a32dc531..eb9dd2433 100644 --- a/pkg/nfd-master/nfd-api-controller.go +++ b/pkg/nfd-master/nfd-api-controller.go @@ -26,7 +26,7 @@ import ( restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" - + nfdclientset "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned" nfdscheme "sigs.k8s.io/node-feature-discovery/api/generated/clientset/versioned/scheme" nfdinformers "sigs.k8s.io/node-feature-discovery/api/generated/informers/externalversions" From dec49598cc3d692d9c31015d41c7dfd4baf7a1aa Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Wed, 26 Mar 2025 12:16:34 -0700 Subject: [PATCH 20/23] make templates --- deployment/components/master-config/nfd-master.conf.example | 1 + deployment/helm/node-feature-discovery/values.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/deployment/components/master-config/nfd-master.conf.example b/deployment/components/master-config/nfd-master.conf.example index 124848552..7dc990cd0 100644 --- a/deployment/components/master-config/nfd-master.conf.example +++ b/deployment/components/master-config/nfd-master.conf.example @@ -3,6 +3,7 @@ # extraLabelNs: ["added.ns.io","added.kubernets.io"] # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"] # enableTaints: false +# informerPageSize: 200 # labelWhiteList: "foo" # resyncPeriod: "2h" # restrictions: diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index 93abcc715..2c3c591e6 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -27,6 +27,7 @@ master: # extraLabelNs: ["added.ns.io","added.kubernets.io"] # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"] # enableTaints: false + # informerPageSize: 200 # labelWhiteList: "foo" # resyncPeriod: "2h" # restrictions: From c376ea2f6367b81b13fbca9764bec3da5763353b Mon Sep 17 00:00:00 2001 From: Igor 
Velichkovich Date: Wed, 26 Mar 2025 12:17:54 -0700 Subject: [PATCH 21/23] fix go mod --- api/nfd/go.mod | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/api/nfd/go.mod b/api/nfd/go.mod index 1f4ba3960..cf24a9355 100644 --- a/api/nfd/go.mod +++ b/api/nfd/go.mod @@ -1,8 +1,7 @@ module sigs.k8s.io/node-feature-discovery/api/nfd -go 1.23.0 - -toolchain go1.24.0 +go 1.22.2 +toolchain go1.23.7 require ( github.com/stretchr/testify v1.8.4 From 154f1ddff2923c55a1c8d40d0efca1df33a608e4 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Wed, 26 Mar 2025 12:20:57 -0700 Subject: [PATCH 22/23] make templates --- deployment/helm/node-feature-discovery/values.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index 2c3c591e6..5e76747f9 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -67,7 +67,6 @@ master: # # this value has to be greater than 0 # retryPeriod: 2s # nfdApiParallelism: 10 - # informerPageSize: 50 ### port: 8080 instance: From b05a8443e774029dbd069f33c80f77572aa5e6d7 Mon Sep 17 00:00:00 2001 From: Igor Velichkovich Date: Wed, 26 Mar 2025 12:26:59 -0700 Subject: [PATCH 23/23] fix klog --- pkg/nfd-master/nfd-master.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 767cc18b8..8d620ec26 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -45,6 +45,7 @@ import ( restclient "k8s.io/client-go/rest" "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/klog/v2" controller "k8s.io/kubernetes/pkg/controller" taintutils "k8s.io/kubernetes/pkg/util/taints" "sigs.k8s.io/yaml"