1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

feat: parallelize nodes update

This PR aims to optimize the process of updating nodes with
corresponding features. In fact, previously, we were updating nodes
sequentially even though they are independent from each other.
Therefore, we integrated new components: LabelersNodePool which is
responsible for spininng a goroutine whenever there's a request for
updating nodes, and a Workqueue which is responsible for holding nodes names
that should be updated.

Signed-off-by: AhmedGrati <ahmedgrati1999@gmail.com>
This commit is contained in:
AhmedGrati 2023-05-13 20:03:53 +01:00
parent d64398f85e
commit b3cfe17392
11 changed files with 305 additions and 43 deletions

View file

@ -239,3 +239,6 @@ site-build:
site-serve:
@mkdir -p docs/vendor/bundle
$(SITE_BUILD_CMD) sh -c "bundle install && jekyll serve $(JEKYLL_OPTS) -H 127.0.0.1"
benchmark:
go test -bench=./pkg/nfd-master -run=^# ./pkg/nfd-master

View file

@ -70,6 +70,8 @@ func main() {
args.Overrides.NoPublish = overrides.NoPublish
case "resync-period":
args.Overrides.ResyncPeriod = overrides.ResyncPeriod
case "nfd-api-parallelism":
args.Overrides.NfdApiParallelism = overrides.NfdApiParallelism
}
})
@ -156,5 +158,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
flagset.Var(overrides.ResyncPeriod, "resync-period",
"Specify the NFD API controller resync period."+
"It has an effect when the NodeFeature API has been enabled (with -enable-nodefeature-api).")
overrides.NfdApiParallelism = flagset.Int("nfd-api-parallelism", 10, "Defines the maximum number of goroutines responsible of updating nodes. "+
"Can be used for the throttling mechanism. It has effect only when -enable-nodefeature-api has been set.")
return args, overrides
}

View file

@ -11,3 +11,4 @@
# renewDeadline: 10s
# # this value has to be greater than 0
# retryPeriod: 2s
# nfdApiParallelism: 10

View file

@ -110,6 +110,9 @@ spec:
{{- if .Values.master.resyncPeriod }}
- "-resync-period={{ .Values.master.resyncPeriod }}"
{{- end }}
{{- if .Values.master.nfdApiParallelism | empty | not }}
- "-nfd-api-parallelism={{ .Values.master.nfdApiParallelism }}"
{{- end }}
{{- if .Values.tls.enable }}
- "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt"
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"

View file

@ -27,6 +27,7 @@ master:
# renewDeadline: 10s
# # this value has to be greater than 0
# retryPeriod: 2s
# nfdApiParallelism: 10
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
# The TCP port that nfd-master listens for incoming requests. Default: 8080
port: 8080
@ -39,6 +40,7 @@ master:
enableTaints: false
crdController: null
featureRulesController: null
nfdApiParallelism: null
deploymentAnnotations: {}
replicaCount: 1

View file

@ -131,6 +131,7 @@ We have introduced the following Chart parameters.
| `master.annotations` | dict | {} | NFD master pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) |
| `master.affinity` | dict | | NFD master pod required [node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/) |
| `master.deploymentAnnotations` | dict | {} | NFD master deployment [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) |
| `master.nfdApiParallelism` | integer | 10 | Specifies the maximum number of concurrent node updates. |
| `master.config` | dict | | NFD master [configuration](../reference/master-configuration-reference) |
### Worker pod parameters

View file

@ -313,6 +313,21 @@ Example:
nfd-master -options='{"noPublish": true}'
```
### -nfd-api-parallelism
The `-nfd-api-parallelism` flag can be used to specify the maximum
number of concurrent node updates.
It takes effect only when `-enable-nodefeature-api` has been set.
Default: 10
Example:
```bash
nfd-master -nfd-api-parallelism=1
```
### Logging
The following logging-related flags are inherited from the

View file

@ -189,3 +189,18 @@ Example:
leaderElection:
retryPeriod: 2s
```
### nfdApiParallelism
The `nfdApiParallelism` option can be used to specify the maximum
number of concurrent node updates.
It takes effect only when `-enable-nodefeature-api` has been set.
Default: 10
Example:
```yaml
nfdApiParallelism: 1
```

View file

@ -34,9 +34,14 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
k8sclient "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
"sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned/fake"
nfdscheme "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned/scheme"
nfdinformers "sigs.k8s.io/node-feature-discovery/pkg/generated/informers/externalversions"
"sigs.k8s.io/node-feature-discovery/pkg/labeler"
"sigs.k8s.io/node-feature-discovery/pkg/utils"
"sigs.k8s.io/node-feature-discovery/pkg/version"
@ -56,6 +61,60 @@ func newMockNode() *corev1.Node {
return &n
}
func mockNodeList() *corev1.NodeList {
l := corev1.NodeList{}
for i := 0; i < 1000; i++ {
n := corev1.Node{}
n.Name = fmt.Sprintf("node %v", i)
n.Labels = map[string]string{}
n.Annotations = map[string]string{}
n.Status.Capacity = corev1.ResourceList{}
l.Items = append(l.Items, n)
}
return &l
}
func newMockNfdAPIController(client *fake.Clientset) *nfdController {
c := &nfdController{
stopChan: make(chan struct{}, 1),
updateAllNodesChan: make(chan struct{}, 1),
updateOneNodeChan: make(chan string),
}
informerFactory := nfdinformers.NewSharedInformerFactory(client, 1*time.Hour)
// Add informer for NodeFeature objects
featureInformer := informerFactory.Nfd().V1alpha1().NodeFeatures()
if _, err := featureInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {},
UpdateFunc: func(oldObj, newObj interface{}) {},
DeleteFunc: func(obj interface{}) {},
}); err != nil {
return nil
}
c.featureLister = featureInformer.Lister()
// Add informer for NodeFeatureRule objects
ruleInformer := informerFactory.Nfd().V1alpha1().NodeFeatureRules()
if _, err := ruleInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(object interface{}) {},
UpdateFunc: func(oldObject, newObject interface{}) {},
DeleteFunc: func(object interface{}) {},
}); err != nil {
return nil
}
c.ruleLister = ruleInformer.Lister()
// Start informers
informerFactory.Start(c.stopChan)
utilruntime.Must(nfdv1alpha1.AddToScheme(nfdscheme.Scheme))
return c
}
func newMockMaster(apihelper apihelper.APIHelpers) *nfdMaster {
return &nfdMaster{
nodeName: mockNodeName,
@ -647,11 +706,14 @@ extraLabelNs: ["added.ns.io"]
writeConfig(`
extraLabelNs: ["override.ns.io"]
resyncPeriod: '2h'
nfdApiParallelism: 300
`)
So(func() interface{} { return master.config.ExtraLabelNs },
withTimeout, 2*time.Second, ShouldResemble, utils.StringSetVal{"override.ns.io": struct{}{}})
So(func() interface{} { return master.config.ResyncPeriod.Duration },
withTimeout, 2*time.Second, ShouldResemble, time.Duration(2)*time.Hour)
So(func() interface{} { return master.config.NfdApiParallelism },
withTimeout, 2*time.Second, ShouldResemble, 300)
// Removing config file should get back our defaults
err = os.RemoveAll(tmpDir)
@ -660,6 +722,8 @@ resyncPeriod: '2h'
withTimeout, 2*time.Second, ShouldResemble, utils.StringSetVal{})
So(func() interface{} { return master.config.ResyncPeriod.Duration },
withTimeout, 2*time.Second, ShouldResemble, time.Duration(1)*time.Hour)
So(func() interface{} { return master.config.NfdApiParallelism },
withTimeout, 2*time.Second, ShouldResemble, 10)
// Re-creating config dir and file should change the config
err = os.MkdirAll(configDir, 0755)
@ -667,15 +731,56 @@ resyncPeriod: '2h'
writeConfig(`
extraLabelNs: ["another.override.ns"]
resyncPeriod: '3m'
nfdApiParallelism: 100
`)
So(func() interface{} { return master.config.ExtraLabelNs },
withTimeout, 2*time.Second, ShouldResemble, utils.StringSetVal{"another.override.ns": struct{}{}})
So(func() interface{} { return master.config.ResyncPeriod.Duration },
withTimeout, 2*time.Second, ShouldResemble, time.Duration(3)*time.Minute)
So(func() interface{} { return master.config.NfdApiParallelism },
withTimeout, 2*time.Second, ShouldResemble, 100)
})
})
}
func BenchmarkNfdAPIUpdateAllNodes(b *testing.B) {
mockAPIHelper := new(apihelper.MockAPIHelpers)
mockMaster := newMockMaster(mockAPIHelper)
mockMaster.nfdController = newMockNfdAPIController(fake.NewSimpleClientset())
mockMaster.config.NoPublish = true
mockNodeUpdaterPool := newNodeUpdaterPool(mockMaster)
mockMaster.nodeUpdaterPool = mockNodeUpdaterPool
mockClient := &k8sclient.Clientset{}
statusPatches := []apihelper.JsonPatch{}
metadataPatches := []apihelper.JsonPatch{
{Op: "add", Path: "/metadata/annotations/nfd.node.kubernetes.io~1feature-labels", Value: ""}, {Op: "add", Path: "/metadata/annotations/nfd.node.kubernetes.io~1extended-resources", Value: ""},
}
mockAPIHelper.On("GetClient").Return(mockClient, nil)
mockAPIHelper.On("GetNodes", mockClient).Return(mockNodeList(), nil)
mockNodeUpdaterPool.start(10)
for i := 0; i < 1000; i++ {
nodeName := fmt.Sprintf("node %v", i)
node := corev1.Node{}
node.Name = nodeName
mockAPIHelper.On("GetNode", mockClient, nodeName).Return(&node, nil)
mockAPIHelper.On("PatchNodeStatus", mockClient, nodeName, mock.MatchedBy(jsonPatchMatcher(statusPatches))).Return(nil)
mockAPIHelper.On("PatchNode", mockClient, nodeName, mock.MatchedBy(jsonPatchMatcher(metadataPatches))).Return(nil)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = mockMaster.nfdAPIUpdateAllNodes()
}
fmt.Println(b.Elapsed())
}
// withTimeout is a custom assertion for polling a value asynchronously
// actual is a function for getting the actual value
// expected[0] is a time.Duration value specifying the timeout

View file

@ -69,14 +69,15 @@ type Annotations map[string]string
// NFDConfig contains the configuration settings of NfdMaster.
type NFDConfig struct {
DenyLabelNs utils.StringSetVal
ExtraLabelNs utils.StringSetVal
LabelWhiteList utils.RegexpVal
NoPublish bool
ResourceLabels utils.StringSetVal
EnableTaints bool
ResyncPeriod utils.DurationVal
LeaderElection LeaderElectionConfig
DenyLabelNs utils.StringSetVal
ExtraLabelNs utils.StringSetVal
LabelWhiteList utils.RegexpVal
NoPublish bool
ResourceLabels utils.StringSetVal
EnableTaints bool
ResyncPeriod utils.DurationVal
LeaderElection LeaderElectionConfig
NfdApiParallelism int
}
// LeaderElectionConfig contains the configuration for leader election
@ -88,13 +89,14 @@ type LeaderElectionConfig struct {
// ConfigOverrideArgs are args that override config file options
type ConfigOverrideArgs struct {
DenyLabelNs *utils.StringSetVal
ExtraLabelNs *utils.StringSetVal
LabelWhiteList *utils.RegexpVal
ResourceLabels *utils.StringSetVal
EnableTaints *bool
NoPublish *bool
ResyncPeriod *utils.DurationVal
DenyLabelNs *utils.StringSetVal
ExtraLabelNs *utils.StringSetVal
LabelWhiteList *utils.RegexpVal
ResourceLabels *utils.StringSetVal
EnableTaints *bool
NoPublish *bool
ResyncPeriod *utils.DurationVal
NfdApiParallelism *int
}
// Args holds command line arguments
@ -130,15 +132,16 @@ type NfdMaster interface {
type nfdMaster struct {
*nfdController
args Args
namespace string
nodeName string
configFilePath string
server *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
kubeconfig *restclient.Config
args Args
namespace string
nodeName string
configFilePath string
server *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
kubeconfig *restclient.Config
nodeUpdaterPool *nodeUpdaterPool
deniedNs
config *NFDConfig
}
@ -176,18 +179,22 @@ func NewNfdMaster(args *Args) (NfdMaster, error) {
if args.ConfigFile != "" {
nfd.configFilePath = filepath.Clean(args.ConfigFile)
}
nfd.nodeUpdaterPool = newNodeUpdaterPool(nfd)
return nfd, nil
}
func newDefaultConfig() *NFDConfig {
return &NFDConfig{
LabelWhiteList: utils.RegexpVal{Regexp: *regexp.MustCompile("")},
DenyLabelNs: utils.StringSetVal{},
ExtraLabelNs: utils.StringSetVal{},
NoPublish: false,
ResourceLabels: utils.StringSetVal{},
EnableTaints: false,
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
LabelWhiteList: utils.RegexpVal{Regexp: *regexp.MustCompile("")},
DenyLabelNs: utils.StringSetVal{},
ExtraLabelNs: utils.StringSetVal{},
NoPublish: false,
NfdApiParallelism: 10,
ResourceLabels: utils.StringSetVal{},
EnableTaints: false,
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
LeaderElection: LeaderElectionConfig{
LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second},
RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second},
@ -220,6 +227,8 @@ func (m *nfdMaster) Run() error {
}
}
m.nodeUpdaterPool.start(m.config.NfdApiParallelism)
// Create watcher for config file
configWatch, err := utils.CreateFsWatcher(time.Second, m.configFilePath)
if err != nil {
@ -276,6 +285,10 @@ func (m *nfdMaster) Run() error {
if m.nfdController != nil && m.args.EnableNodeFeatureApi {
m.nfdController.updateAllNodesChan <- struct{}{}
}
// Restart the node updater pool
m.nodeUpdaterPool.stop()
m.nodeUpdaterPool.start(m.config.NfdApiParallelism)
case <-m.stop:
klog.InfoS("shutting down nfd-master")
return nil
@ -359,10 +372,7 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
case nodeName := <-m.nfdController.updateOneNodeChan:
updateNodes[nodeName] = struct{}{}
case <-rateLimit:
// Check what we need to do
// TODO: we might want to update multiple nodes in parallel
errUpdateAll := false
errNodes := make(map[string]struct{})
if updateAll {
if err := m.nfdAPIUpdateAllNodes(); err != nil {
klog.ErrorS(err, "failed to update nodes")
@ -370,16 +380,13 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
}
} else {
for nodeName := range updateNodes {
if err := m.nfdAPIUpdateOneNode(nodeName); err != nil {
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
errNodes[nodeName] = struct{}{}
}
m.nodeUpdaterPool.queue.Add(nodeName)
}
}
// Reset "work queue" and timer, will cause re-try if errors happened
// Reset "work queue" and timer
updateAll = errUpdateAll
updateNodes = errNodes
updateNodes = map[string]struct{}{}
rateLimit = time.After(time.Second)
}
}
@ -393,6 +400,8 @@ func (m *nfdMaster) Stop() {
m.nfdController.stop()
}
m.nodeUpdaterPool.stop()
close(m.stop)
}
@ -645,9 +654,7 @@ func (m *nfdMaster) nfdAPIUpdateAllNodes() error {
}
for _, node := range nodes.Items {
if err := m.nfdAPIUpdateOneNode(node.Name); err != nil {
return err
}
m.nodeUpdaterPool.queue.Add(node.Name)
}
return nil
@ -1161,6 +1168,13 @@ func (m *nfdMaster) configure(filepath string, overrides string) error {
if m.args.Overrides.ResyncPeriod != nil {
c.ResyncPeriod = *m.args.Overrides.ResyncPeriod
}
if m.args.Overrides.NfdApiParallelism != nil {
c.NfdApiParallelism = *m.args.Overrides.NfdApiParallelism
}
if c.NfdApiParallelism <= 0 {
return fmt.Errorf("the maximum number of concurrent labelers should be a non-zero positive number")
}
m.config = c
if !c.NoPublish {

View file

@ -0,0 +1,98 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nfdmaster
import (
"sync"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
)
type nodeUpdaterPool struct {
queue workqueue.RateLimitingInterface
sync.Mutex
wg sync.WaitGroup
nfdMaster *nfdMaster
}
func newNodeUpdaterPool(nfdMaster *nfdMaster) *nodeUpdaterPool {
return &nodeUpdaterPool{
nfdMaster: nfdMaster,
wg: sync.WaitGroup{},
}
}
func (u *nodeUpdaterPool) processNodeLabelRequest(queue workqueue.RateLimitingInterface) bool {
nodeName, quit := queue.Get()
if quit {
return false
}
defer queue.Done(nodeName)
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
if queue.NumRequeues(nodeName) < 5 {
klog.InfoS("retrying labeling request for node", "nodeName", nodeName)
queue.AddRateLimited(nodeName)
return true
} else {
klog.ErrorS(err, "error labeling node", "nodeName", nodeName)
}
}
queue.Forget(nodeName)
return true
}
func (u *nodeUpdaterPool) runNodeUpdater(queue workqueue.RateLimitingInterface) {
for u.processNodeLabelRequest(queue) {
}
u.wg.Done()
}
func (u *nodeUpdaterPool) start(parallelism int) {
u.Lock()
defer u.Unlock()
if u.queue != nil && !u.queue.ShuttingDown() {
klog.InfoS("the NFD master node updater pool is already running.")
return
}
klog.InfoS("starting the NFD master node updater pool", "parallelism", parallelism)
u.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
for i := 0; i < parallelism; i++ {
u.wg.Add(1)
go u.runNodeUpdater(u.queue)
}
}
func (u *nodeUpdaterPool) stop() {
u.Lock()
defer u.Unlock()
if u.queue == nil || u.queue.ShuttingDown() {
klog.InfoS("the NFD master node updater pool is not running.")
return
}
klog.InfoS("stopping the NFD master node updater pool")
u.queue.ShutDown()
u.wg.Wait()
}