mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
Add new flag enable-leader-election for nfd-master.
It allows NFD-master to be run in active-passive way when running multiple instances of NFD-master to prevent multiple components from updating same custom resources. Signed-off-by: PiotrProkop <pprokop@nvidia.com>
This commit is contained in:
parent
cd45baef8d
commit
272fd4784f
11 changed files with 201 additions and 3 deletions
|
@ -128,6 +128,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
|
|||
flagset.StringVar(&args.Options, "options", "",
|
||||
"Specify config options from command line. Config options are specified "+
|
||||
"in the same format as in the config file (i.e. json or yaml). These options")
|
||||
flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false,
|
||||
"Enables a leader election. Enable this when running more than one replica on nfd master.")
|
||||
|
||||
overrides := &master.ConfigOverrideArgs{
|
||||
LabelWhiteList: &utils.RegexpVal{},
|
||||
|
|
|
@ -22,3 +22,18 @@ rules:
|
|||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
- leases
|
||||
verbs:
|
||||
- create
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
- leases
|
||||
resourceNames:
|
||||
- "nfd-master.nfd.kubernetes.io"
|
||||
verbs:
|
||||
- get
|
||||
- update
|
||||
|
|
|
@ -5,3 +5,9 @@
|
|||
# enableTaints: false
|
||||
# labelWhiteList: "foo"
|
||||
# resyncPeriod: "2h"
|
||||
# leaderElection:
|
||||
# leaseDuration: 15s
|
||||
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
|
||||
# renewDeadline: 10s
|
||||
# # this value has to be greater than 0
|
||||
# retryPeriod: 2s
|
||||
|
|
|
@ -25,6 +25,21 @@ rules:
|
|||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
- leases
|
||||
verbs:
|
||||
- create
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
- leases
|
||||
resourceNames:
|
||||
- "nfd-master.nfd.kubernetes.io"
|
||||
verbs:
|
||||
- get
|
||||
- update
|
||||
{{- end }}
|
||||
|
||||
---
|
||||
|
|
|
@ -82,6 +82,9 @@ spec:
|
|||
- "-port={{ .Values.master.port | default "8080" }}"
|
||||
{{- if .Values.enableNodeFeatureApi }}
|
||||
- "-enable-nodefeature-api"
|
||||
{{- if gt (int .Values.master.replicaCount) 1 }}
|
||||
- "-enable-leader-election"
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.master.extraLabelNs | empty | not }}
|
||||
- "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}"
|
||||
|
|
|
@ -21,6 +21,12 @@ master:
|
|||
# enableTaints: false
|
||||
# labelWhiteList: "foo"
|
||||
# resyncPeriod: "2h"
|
||||
# leaderElection:
|
||||
# leaseDuration: 15s
|
||||
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
|
||||
# renewDeadline: 10s
|
||||
# # this value has to be greater than 0
|
||||
# retryPeriod: 2s
|
||||
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
|
||||
# The TCP port that nfd-master listens for incoming requests. Default: 8080
|
||||
port: 8080
|
||||
|
|
|
@ -151,6 +151,20 @@ Example:
|
|||
nfd-master -enable-nodefeature-api
|
||||
```
|
||||
|
||||
### -enable-leader-election
|
||||
|
||||
The `-enable-leader-election` flag enables leader election for NFD-Master.
|
||||
It is advised to turn on this flag when running more than one instance of
|
||||
NFD-Master.
|
||||
|
||||
This flag takes effect only when combined with `-enable-nodefeature-api` flag.
|
||||
|
||||
Default: false
|
||||
|
||||
```bash
|
||||
nfd-master -enable-nodefeature-api -enable-leader-election
|
||||
```
|
||||
|
||||
### -enable-taints
|
||||
|
||||
The `-enable-taints` flag enables/disables node tainting feature of NFD.
|
||||
|
|
|
@ -114,7 +114,7 @@ Example:
|
|||
labelWhiteList: "foo"
|
||||
```
|
||||
|
||||
### resyncPeriod
|
||||
## resyncPeriod
|
||||
|
||||
The `resyncPeriod` option specifies the NFD API controller resync period.
|
||||
The resync means nfd-master replaying all NodeFeature and NodeFeatureRule objects,
|
||||
|
@ -128,5 +128,64 @@ Default: 1 hour.
|
|||
Example:
|
||||
|
||||
```yaml
|
||||
resyncPeriod=2h
|
||||
resyncPeriod: 2h
|
||||
```
|
||||
|
||||
## leaderElection
|
||||
|
||||
The `leaderElection` section exposes configuration to tweak leader election.
|
||||
|
||||
### leaderElection.leaseDuration
|
||||
|
||||
`leaderElection.leaseDuration` is the duration that non-leader candidates will
|
||||
wait to force acquire leadership. This is measured against time of
|
||||
last observed ack.
|
||||
|
||||
A client needs to wait a full LeaseDuration without observing a change to
|
||||
the record before it can attempt to take over. When all clients are
|
||||
shutdown and a new set of clients are started with different names against
|
||||
the same leader record, they must wait the full LeaseDuration before
|
||||
attempting to acquire the lease. Thus LeaseDuration should be as short as
|
||||
possible (within your tolerance for clock skew rate) to avoid a possible
|
||||
long waits in the scenario.
|
||||
|
||||
Default: 15 seconds.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
leaderElection:
|
||||
leaseDurtation: 15s
|
||||
```
|
||||
|
||||
### leaderElection.renewDeadline
|
||||
|
||||
`leaderElection.renewDeadline` is the duration that the acting master will retry
|
||||
refreshing leadership before giving up.
|
||||
|
||||
This value has to be lower than leaseDuration and greater than retryPeriod*1.2.
|
||||
|
||||
Default: 10 seconds.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
leaderElection:
|
||||
renewDeadline: 10s
|
||||
```
|
||||
|
||||
### leaderElection.retryPeriod
|
||||
|
||||
`leaderElection.retryPeriod` is the duration the LeaderElector clients should wait
|
||||
between tries of actions.
|
||||
|
||||
It has to be greater than 0.
|
||||
|
||||
Default: 2 seconds.
|
||||
|
||||
Example:
|
||||
|
||||
```yaml
|
||||
leaderElection:
|
||||
retryPeriod: 2s
|
||||
```
|
||||
|
|
|
@ -29,6 +29,11 @@ are created on the node (note the allowed
|
|||
> its [`-enable-nodefeature-api`](../reference/worker-commandline-reference.md#-enable-nodefeature-api)
|
||||
> flag.
|
||||
|
||||
When `-enable-nodefeature-api` option is enabled and NFD-Master is intended to run
|
||||
with more than one replica, it is advised to use `-enable-leader-election` flag.
|
||||
This flag turns on leader election for NFD-Master and let only one replica
|
||||
to act on changes in NodeFeature and NodeFeatureRule objects.
|
||||
|
||||
## NodeFeatureRule controller
|
||||
|
||||
NFD-Master acts as the controller for
|
||||
|
|
|
@ -558,6 +558,10 @@ denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
|
|||
resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"]
|
||||
enableTaints: false
|
||||
labelWhiteList: "foo"
|
||||
leaderElection:
|
||||
leaseDuration: 20s
|
||||
renewDeadline: 4s
|
||||
retryPeriod: 30s
|
||||
`)
|
||||
f.Close()
|
||||
So(err, ShouldBeNil)
|
||||
|
@ -573,6 +577,9 @@ labelWhiteList: "foo"
|
|||
So(master.config.ResourceLabels, ShouldResemble, utils.StringSetVal{"vendor-1.com/feature-1": struct{}{}, "vendor-2.io/feature-2": struct{}{}}) // from cmdline
|
||||
So(master.config.DenyLabelNs, ShouldResemble, utils.StringSetVal{"denied.ns.io": struct{}{}, "denied.kubernetes.io": struct{}{}})
|
||||
So(master.config.LabelWhiteList.String(), ShouldEqual, "foo")
|
||||
So(master.config.LeaderElection.LeaseDuration.Seconds(), ShouldEqual, float64(20))
|
||||
So(master.config.LeaderElection.RenewDeadline.Seconds(), ShouldEqual, float64(4))
|
||||
So(master.config.LeaderElection.RetryPeriod.Seconds(), ShouldEqual, float64(30))
|
||||
})
|
||||
})
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/net/context"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/credentials"
|
||||
|
@ -38,9 +39,12 @@ import (
|
|||
"google.golang.org/grpc/peer"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
k8sQuantity "k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
k8sLabels "k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
restclient "k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/leaderelection"
|
||||
"k8s.io/client-go/tools/leaderelection/resourcelock"
|
||||
"k8s.io/klog/v2"
|
||||
controller "k8s.io/kubernetes/pkg/controller"
|
||||
taintutils "k8s.io/kubernetes/pkg/util/taints"
|
||||
|
@ -71,6 +75,14 @@ type NFDConfig struct {
|
|||
ResourceLabels utils.StringSetVal
|
||||
EnableTaints bool
|
||||
ResyncPeriod utils.DurationVal
|
||||
LeaderElection LeaderElectionConfig
|
||||
}
|
||||
|
||||
// LeaderElectionConfig contains the configuration for leader election
|
||||
type LeaderElectionConfig struct {
|
||||
LeaseDuration utils.DurationVal
|
||||
RenewDeadline utils.DurationVal
|
||||
RetryPeriod utils.DurationVal
|
||||
}
|
||||
|
||||
// ConfigOverrideArgs are args that override config file options
|
||||
|
@ -98,6 +110,7 @@ type Args struct {
|
|||
Prune bool
|
||||
VerifyNodeName bool
|
||||
Options string
|
||||
EnableLeaderElection bool
|
||||
|
||||
Overrides ConfigOverrideArgs
|
||||
}
|
||||
|
@ -174,6 +187,11 @@ func newDefaultConfig() *NFDConfig {
|
|||
ResourceLabels: utils.StringSetVal{},
|
||||
EnableTaints: false,
|
||||
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
|
||||
LeaderElection: LeaderElectionConfig{
|
||||
LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second},
|
||||
RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second},
|
||||
RenewDeadline: utils.DurationVal{Duration: time.Duration(10) * time.Second},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -221,7 +239,11 @@ func (m *nfdMaster) Run() error {
|
|||
|
||||
// Run updater that handles events from the nfd CRD API.
|
||||
if m.nfdController != nil {
|
||||
go m.nfdAPIUpdateHandler()
|
||||
if m.args.EnableLeaderElection {
|
||||
go m.nfdAPIUpdateHandlerWithLeaderElection()
|
||||
} else {
|
||||
go m.nfdAPIUpdateHandler()
|
||||
}
|
||||
}
|
||||
|
||||
// Notify that we're ready to accept connections
|
||||
|
@ -1230,3 +1252,47 @@ func (m *nfdMaster) startNfdApiController() error {
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
|
||||
ctx := context.Background()
|
||||
client, err := m.apihelper.GetClient()
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "failed to get Kubernetes client")
|
||||
m.Stop()
|
||||
}
|
||||
lock := &resourcelock.LeaseLock{
|
||||
LeaseMeta: metav1.ObjectMeta{
|
||||
Name: "nfd-master.nfd.kubernetes.io",
|
||||
Namespace: m.namespace,
|
||||
},
|
||||
Client: client.CoordinationV1(),
|
||||
LockConfig: resourcelock.ResourceLockConfig{
|
||||
// add uuid to prevent situation where 2 nfd-master nodes run on same node
|
||||
Identity: m.nodeName + "_" + uuid.NewString(),
|
||||
},
|
||||
}
|
||||
config := leaderelection.LeaderElectionConfig{
|
||||
Lock: lock,
|
||||
// make it configurable?
|
||||
LeaseDuration: m.config.LeaderElection.LeaseDuration.Duration,
|
||||
RetryPeriod: m.config.LeaderElection.RetryPeriod.Duration,
|
||||
RenewDeadline: m.config.LeaderElection.RenewDeadline.Duration,
|
||||
Callbacks: leaderelection.LeaderCallbacks{
|
||||
OnStartedLeading: func(_ context.Context) {
|
||||
m.nfdAPIUpdateHandler()
|
||||
},
|
||||
OnStoppedLeading: func() {
|
||||
// We lost the lock.
|
||||
klog.ErrorS(err, "leaderelection lock was lost")
|
||||
m.Stop()
|
||||
},
|
||||
},
|
||||
}
|
||||
leaderElector, err := leaderelection.NewLeaderElector(config)
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "couldn't create leader elector")
|
||||
m.Stop()
|
||||
}
|
||||
|
||||
leaderElector.Run(ctx)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue