1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

Add new flag enable-leader-election for nfd-master.

It allows NFD-master to be run in active-passive way when running
multiple instances of NFD-master to prevent multiple components
from updating same custom resources.

Signed-off-by: PiotrProkop <pprokop@nvidia.com>
This commit is contained in:
PiotrProkop 2023-05-05 12:01:32 +02:00
parent cd45baef8d
commit 272fd4784f
11 changed files with 201 additions and 3 deletions

View file

@ -128,6 +128,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
flagset.StringVar(&args.Options, "options", "",
"Specify config options from command line. Config options are specified "+
"in the same format as in the config file (i.e. json or yaml). These options")
flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false,
"Enables a leader election. Enable this when running more than one replica on nfd master.")
overrides := &master.ConfigOverrideArgs{
LabelWhiteList: &utils.RegexpVal{},

View file

@ -22,3 +22,18 @@ rules:
- get
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- coordination.k8s.io
resources:
- leases
resourceNames:
- "nfd-master.nfd.kubernetes.io"
verbs:
- get
- update

View file

@ -5,3 +5,9 @@
# enableTaints: false
# labelWhiteList: "foo"
# resyncPeriod: "2h"
# leaderElection:
# leaseDuration: 15s
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
# renewDeadline: 10s
# # this value has to be greater than 0
# retryPeriod: 2s

View file

@ -25,6 +25,21 @@ rules:
- get
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- coordination.k8s.io
resources:
- leases
resourceNames:
- "nfd-master.nfd.kubernetes.io"
verbs:
- get
- update
{{- end }}
---

View file

@ -82,6 +82,9 @@ spec:
- "-port={{ .Values.master.port | default "8080" }}"
{{- if .Values.enableNodeFeatureApi }}
- "-enable-nodefeature-api"
{{- if gt (int .Values.master.replicaCount) 1 }}
- "-enable-leader-election"
{{- end }}
{{- end }}
{{- if .Values.master.extraLabelNs | empty | not }}
- "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}"

View file

@ -21,6 +21,12 @@ master:
# enableTaints: false
# labelWhiteList: "foo"
# resyncPeriod: "2h"
# leaderElection:
# leaseDuration: 15s
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
# renewDeadline: 10s
# # this value has to be greater than 0
# retryPeriod: 2s
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
# The TCP port that nfd-master listens for incoming requests. Default: 8080
port: 8080

View file

@ -151,6 +151,20 @@ Example:
nfd-master -enable-nodefeature-api
```
### -enable-leader-election
The `-enable-leader-election` flag enables leader election for NFD-Master.
It is advised to turn on this flag when running more than one instance of
NFD-Master.
This flag takes effect only when combined with the `-enable-nodefeature-api` flag.
Default: false
```bash
nfd-master -enable-nodefeature-api -enable-leader-election
```
### -enable-taints
The `-enable-taints` flag enables/disables node tainting feature of NFD.

View file

@ -114,7 +114,7 @@ Example:
labelWhiteList: "foo"
```
### resyncPeriod
## resyncPeriod
The `resyncPeriod` option specifies the NFD API controller resync period.
The resync means nfd-master replaying all NodeFeature and NodeFeatureRule objects,
@ -128,5 +128,64 @@ Default: 1 hour.
Example:
```yaml
resyncPeriod=2h
resyncPeriod: 2h
```
## leaderElection
The `leaderElection` section exposes configuration to tweak leader election.
### leaderElection.leaseDuration
`leaderElection.leaseDuration` is the duration that non-leader candidates will
wait to force acquire leadership. This is measured against time of
last observed ack.
A client needs to wait a full LeaseDuration without observing a change to
the record before it can attempt to take over. When all clients are
shutdown and a new set of clients are started with different names against
the same leader record, they must wait the full LeaseDuration before
attempting to acquire the lease. Thus LeaseDuration should be as short as
possible (within your tolerance for clock skew rate) to avoid possible
long waits in this scenario.
Default: 15 seconds.
Example:
```yaml
leaderElection:
leaseDuration: 15s
```
### leaderElection.renewDeadline
`leaderElection.renewDeadline` is the duration that the acting master will retry
refreshing leadership before giving up.
This value has to be lower than leaseDuration and greater than retryPeriod*1.2.
Default: 10 seconds.
Example:
```yaml
leaderElection:
renewDeadline: 10s
```
### leaderElection.retryPeriod
`leaderElection.retryPeriod` is the duration the LeaderElector clients should wait
between tries of actions.
It has to be greater than 0.
Default: 2 seconds.
Example:
```yaml
leaderElection:
retryPeriod: 2s
```

View file

@ -29,6 +29,11 @@ are created on the node (note the allowed
> its [`-enable-nodefeature-api`](../reference/worker-commandline-reference.md#-enable-nodefeature-api)
> flag.
When the `-enable-nodefeature-api` option is enabled and NFD-Master is intended to run
with more than one replica, it is advised to use the `-enable-leader-election` flag.
This flag turns on leader election for NFD-Master and lets only one replica
act on changes in NodeFeature and NodeFeatureRule objects.
## NodeFeatureRule controller
NFD-Master acts as the controller for

View file

@ -558,6 +558,10 @@ denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"]
enableTaints: false
labelWhiteList: "foo"
leaderElection:
leaseDuration: 20s
renewDeadline: 4s
retryPeriod: 30s
`)
f.Close()
So(err, ShouldBeNil)
@ -573,6 +577,9 @@ labelWhiteList: "foo"
So(master.config.ResourceLabels, ShouldResemble, utils.StringSetVal{"vendor-1.com/feature-1": struct{}{}, "vendor-2.io/feature-2": struct{}{}}) // from cmdline
So(master.config.DenyLabelNs, ShouldResemble, utils.StringSetVal{"denied.ns.io": struct{}{}, "denied.kubernetes.io": struct{}{}})
So(master.config.LabelWhiteList.String(), ShouldEqual, "foo")
So(master.config.LeaderElection.LeaseDuration.Seconds(), ShouldEqual, float64(20))
So(master.config.LeaderElection.RenewDeadline.Seconds(), ShouldEqual, float64(4))
So(master.config.LeaderElection.RetryPeriod.Seconds(), ShouldEqual, float64(30))
})
})

View file

@ -30,6 +30,7 @@ import (
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/net/context"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
@ -38,9 +39,12 @@ import (
"google.golang.org/grpc/peer"
corev1 "k8s.io/api/core/v1"
k8sQuantity "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sLabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
"k8s.io/klog/v2"
controller "k8s.io/kubernetes/pkg/controller"
taintutils "k8s.io/kubernetes/pkg/util/taints"
@ -71,6 +75,14 @@ type NFDConfig struct {
ResourceLabels utils.StringSetVal
EnableTaints bool
ResyncPeriod utils.DurationVal
LeaderElection LeaderElectionConfig
}
// LeaderElectionConfig contains the configuration for leader election.
// Its values are passed to the leader elector that guards the NFD API
// update handler when leader election is enabled.
type LeaderElectionConfig struct {
// LeaseDuration is the duration that non-leader candidates wait before
// trying to force-acquire leadership.
LeaseDuration utils.DurationVal
// RenewDeadline is the duration that the acting leader keeps retrying to
// refresh its leadership before giving up. It has to be lower than
// LeaseDuration and greater than RetryPeriod*1.2.
RenewDeadline utils.DurationVal
// RetryPeriod is the duration leader election clients wait between tries
// of actions. It has to be greater than 0.
RetryPeriod utils.DurationVal
}
// ConfigOverrideArgs are args that override config file options
@ -98,6 +110,7 @@ type Args struct {
Prune bool
VerifyNodeName bool
Options string
EnableLeaderElection bool
Overrides ConfigOverrideArgs
}
@ -174,6 +187,11 @@ func newDefaultConfig() *NFDConfig {
ResourceLabels: utils.StringSetVal{},
EnableTaints: false,
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
LeaderElection: LeaderElectionConfig{
LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second},
RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second},
RenewDeadline: utils.DurationVal{Duration: time.Duration(10) * time.Second},
},
}
}
@ -221,8 +239,12 @@ func (m *nfdMaster) Run() error {
// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
if m.args.EnableLeaderElection {
go m.nfdAPIUpdateHandlerWithLeaderElection()
} else {
go m.nfdAPIUpdateHandler()
}
}
// Notify that we're ready to accept connections
m.ready <- true
@ -1230,3 +1252,47 @@ func (m *nfdMaster) startNfdApiController() error {
}
return nil
}
// nfdAPIUpdateHandlerWithLeaderElection runs nfdAPIUpdateHandler under leader
// election, so that only the instance holding the
// "nfd-master.nfd.kubernetes.io" Lease in m.namespace handles NFD API updates.
//
// The lease timings are taken from m.config.LeaderElection. If the Kubernetes
// client or the leader elector cannot be created, or if leadership is lost
// after having been acquired, the master is stopped via m.Stop().
func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
	ctx := context.Background()

	client, err := m.apihelper.GetClient()
	if err != nil {
		klog.ErrorS(err, "failed to get Kubernetes client")
		m.Stop()
		// Nothing below can work without a client; do not fall through and
		// dereference it.
		return
	}

	lock := &resourcelock.LeaseLock{
		LeaseMeta: metav1.ObjectMeta{
			Name:      "nfd-master.nfd.kubernetes.io",
			Namespace: m.namespace,
		},
		Client: client.CoordinationV1(),
		LockConfig: resourcelock.ResourceLockConfig{
			// Append a UUID so that two nfd-master instances running on the
			// same node still get distinct identities.
			Identity: m.nodeName + "_" + uuid.NewString(),
		},
	}

	config := leaderelection.LeaderElectionConfig{
		Lock:          lock,
		LeaseDuration: m.config.LeaderElection.LeaseDuration.Duration,
		RetryPeriod:   m.config.LeaderElection.RetryPeriod.Duration,
		RenewDeadline: m.config.LeaderElection.RenewDeadline.Duration,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(_ context.Context) {
				m.nfdAPIUpdateHandler()
			},
			OnStoppedLeading: func() {
				// We lost the lock. There is no error value associated with
				// this event, so none is attached to the log entry.
				klog.ErrorS(nil, "leaderelection lock was lost")
				m.Stop()
			},
		},
	}

	leaderElector, err := leaderelection.NewLeaderElector(config)
	if err != nil {
		klog.ErrorS(err, "couldn't create leader elector")
		m.Stop()
		// leaderElector is not usable when construction failed.
		return
	}

	leaderElector.Run(ctx)
}