mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
Add new flag enable-leader-election for nfd-master.
It allows NFD-master to be run in active-passive way when running multiple instances of NFD-master to prevent multiple components from updating same custom resources. Signed-off-by: PiotrProkop <pprokop@nvidia.com>
This commit is contained in:
parent
cd45baef8d
commit
272fd4784f
11 changed files with 201 additions and 3 deletions
|
@ -128,6 +128,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
|
||||||
flagset.StringVar(&args.Options, "options", "",
|
flagset.StringVar(&args.Options, "options", "",
|
||||||
"Specify config options from command line. Config options are specified "+
|
"Specify config options from command line. Config options are specified "+
|
||||||
"in the same format as in the config file (i.e. json or yaml). These options")
|
"in the same format as in the config file (i.e. json or yaml). These options")
|
||||||
|
flagset.BoolVar(&args.EnableLeaderElection, "enable-leader-election", false,
|
||||||
|
"Enables a leader election. Enable this when running more than one replica on nfd master.")
|
||||||
|
|
||||||
overrides := &master.ConfigOverrideArgs{
|
overrides := &master.ConfigOverrideArgs{
|
||||||
LabelWhiteList: &utils.RegexpVal{},
|
LabelWhiteList: &utils.RegexpVal{},
|
||||||
|
|
|
@ -22,3 +22,18 @@ rules:
|
||||||
- get
|
- get
|
||||||
- list
|
- list
|
||||||
- watch
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- coordination.k8s.io
|
||||||
|
resources:
|
||||||
|
- leases
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- coordination.k8s.io
|
||||||
|
resources:
|
||||||
|
- leases
|
||||||
|
resourceNames:
|
||||||
|
- "nfd-master.nfd.kubernetes.io"
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- update
|
||||||
|
|
|
@ -5,3 +5,9 @@
|
||||||
# enableTaints: false
|
# enableTaints: false
|
||||||
# labelWhiteList: "foo"
|
# labelWhiteList: "foo"
|
||||||
# resyncPeriod: "2h"
|
# resyncPeriod: "2h"
|
||||||
|
# leaderElection:
|
||||||
|
# leaseDuration: 15s
|
||||||
|
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
|
||||||
|
# renewDeadline: 10s
|
||||||
|
# # this value has to be greater than 0
|
||||||
|
# retryPeriod: 2s
|
||||||
|
|
|
@ -25,6 +25,21 @@ rules:
|
||||||
- get
|
- get
|
||||||
- list
|
- list
|
||||||
- watch
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- coordination.k8s.io
|
||||||
|
resources:
|
||||||
|
- leases
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- coordination.k8s.io
|
||||||
|
resources:
|
||||||
|
- leases
|
||||||
|
resourceNames:
|
||||||
|
- "nfd-master.nfd.kubernetes.io"
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- update
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
@ -82,6 +82,9 @@ spec:
|
||||||
- "-port={{ .Values.master.port | default "8080" }}"
|
- "-port={{ .Values.master.port | default "8080" }}"
|
||||||
{{- if .Values.enableNodeFeatureApi }}
|
{{- if .Values.enableNodeFeatureApi }}
|
||||||
- "-enable-nodefeature-api"
|
- "-enable-nodefeature-api"
|
||||||
|
{{- if gt (int .Values.master.replicaCount) 1 }}
|
||||||
|
- "-enable-leader-election"
|
||||||
|
{{- end }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.master.extraLabelNs | empty | not }}
|
{{- if .Values.master.extraLabelNs | empty | not }}
|
||||||
- "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}"
|
- "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}"
|
||||||
|
|
|
@ -21,6 +21,12 @@ master:
|
||||||
# enableTaints: false
|
# enableTaints: false
|
||||||
# labelWhiteList: "foo"
|
# labelWhiteList: "foo"
|
||||||
# resyncPeriod: "2h"
|
# resyncPeriod: "2h"
|
||||||
|
# leaderElection:
|
||||||
|
# leaseDuration: 15s
|
||||||
|
# # this value has to be lower than leaseDuration and greater than retryPeriod*1.2
|
||||||
|
# renewDeadline: 10s
|
||||||
|
# # this value has to be greater than 0
|
||||||
|
# retryPeriod: 2s
|
||||||
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
|
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
|
||||||
# The TCP port that nfd-master listens for incoming requests. Default: 8080
|
# The TCP port that nfd-master listens for incoming requests. Default: 8080
|
||||||
port: 8080
|
port: 8080
|
||||||
|
|
|
@ -151,6 +151,20 @@ Example:
|
||||||
nfd-master -enable-nodefeature-api
|
nfd-master -enable-nodefeature-api
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### -enable-leader-election
|
||||||
|
|
||||||
|
The `-enable-leader-election` flag enables leader election for NFD-Master.
|
||||||
|
It is advised to turn on this flag when running more than one instance of
|
||||||
|
NFD-Master.
|
||||||
|
|
||||||
|
This flag takes effect only when combined with `-enable-nodefeature-api` flag.
|
||||||
|
|
||||||
|
Default: false
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nfd-master -enable-nodefeature-api -enable-leader-election
|
||||||
|
```
|
||||||
|
|
||||||
### -enable-taints
|
### -enable-taints
|
||||||
|
|
||||||
The `-enable-taints` flag enables/disables node tainting feature of NFD.
|
The `-enable-taints` flag enables/disables node tainting feature of NFD.
|
||||||
|
|
|
@ -114,7 +114,7 @@ Example:
|
||||||
labelWhiteList: "foo"
|
labelWhiteList: "foo"
|
||||||
```
|
```
|
||||||
|
|
||||||
### resyncPeriod
|
## resyncPeriod
|
||||||
|
|
||||||
The `resyncPeriod` option specifies the NFD API controller resync period.
|
The `resyncPeriod` option specifies the NFD API controller resync period.
|
||||||
The resync means nfd-master replaying all NodeFeature and NodeFeatureRule objects,
|
The resync means nfd-master replaying all NodeFeature and NodeFeatureRule objects,
|
||||||
|
@ -128,5 +128,64 @@ Default: 1 hour.
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
resyncPeriod=2h
|
resyncPeriod: 2h
|
||||||
|
```
|
||||||
|
|
||||||
|
## leaderElection
|
||||||
|
|
||||||
|
The `leaderElection` section exposes configuration to tweak leader election.
|
||||||
|
|
||||||
|
### leaderElection.leaseDuration
|
||||||
|
|
||||||
|
`leaderElection.leaseDuration` is the duration that non-leader candidates will
|
||||||
|
wait to force acquire leadership. This is measured against time of
|
||||||
|
last observed ack.
|
||||||
|
|
||||||
|
A client needs to wait a full LeaseDuration without observing a change to
|
||||||
|
the record before it can attempt to take over. When all clients are
|
||||||
|
shutdown and a new set of clients are started with different names against
|
||||||
|
the same leader record, they must wait the full LeaseDuration before
|
||||||
|
attempting to acquire the lease. Thus LeaseDuration should be as short as
|
||||||
|
possible (within your tolerance for clock skew rate) to avoid a possible
|
||||||
|
long waits in the scenario.
|
||||||
|
|
||||||
|
Default: 15 seconds.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
leaderElection:
|
||||||
|
leaseDurtation: 15s
|
||||||
|
```
|
||||||
|
|
||||||
|
### leaderElection.renewDeadline
|
||||||
|
|
||||||
|
`leaderElection.renewDeadline` is the duration that the acting master will retry
|
||||||
|
refreshing leadership before giving up.
|
||||||
|
|
||||||
|
This value has to be lower than leaseDuration and greater than retryPeriod*1.2.
|
||||||
|
|
||||||
|
Default: 10 seconds.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
leaderElection:
|
||||||
|
renewDeadline: 10s
|
||||||
|
```
|
||||||
|
|
||||||
|
### leaderElection.retryPeriod
|
||||||
|
|
||||||
|
`leaderElection.retryPeriod` is the duration the LeaderElector clients should wait
|
||||||
|
between tries of actions.
|
||||||
|
|
||||||
|
It has to be greater than 0.
|
||||||
|
|
||||||
|
Default: 2 seconds.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
leaderElection:
|
||||||
|
retryPeriod: 2s
|
||||||
```
|
```
|
||||||
|
|
|
@ -29,6 +29,11 @@ are created on the node (note the allowed
|
||||||
> its [`-enable-nodefeature-api`](../reference/worker-commandline-reference.md#-enable-nodefeature-api)
|
> its [`-enable-nodefeature-api`](../reference/worker-commandline-reference.md#-enable-nodefeature-api)
|
||||||
> flag.
|
> flag.
|
||||||
|
|
||||||
|
When `-enable-nodefeature-api` option is enabled and NFD-Master is intended to run
|
||||||
|
with more than one replica, it is advised to use `-enable-leader-election` flag.
|
||||||
|
This flag turns on leader election for NFD-Master and let only one replica
|
||||||
|
to act on changes in NodeFeature and NodeFeatureRule objects.
|
||||||
|
|
||||||
## NodeFeatureRule controller
|
## NodeFeatureRule controller
|
||||||
|
|
||||||
NFD-Master acts as the controller for
|
NFD-Master acts as the controller for
|
||||||
|
|
|
@ -558,6 +558,10 @@ denyLabelNs: ["denied.ns.io","denied.kubernetes.io"]
|
||||||
resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"]
|
resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"]
|
||||||
enableTaints: false
|
enableTaints: false
|
||||||
labelWhiteList: "foo"
|
labelWhiteList: "foo"
|
||||||
|
leaderElection:
|
||||||
|
leaseDuration: 20s
|
||||||
|
renewDeadline: 4s
|
||||||
|
retryPeriod: 30s
|
||||||
`)
|
`)
|
||||||
f.Close()
|
f.Close()
|
||||||
So(err, ShouldBeNil)
|
So(err, ShouldBeNil)
|
||||||
|
@ -573,6 +577,9 @@ labelWhiteList: "foo"
|
||||||
So(master.config.ResourceLabels, ShouldResemble, utils.StringSetVal{"vendor-1.com/feature-1": struct{}{}, "vendor-2.io/feature-2": struct{}{}}) // from cmdline
|
So(master.config.ResourceLabels, ShouldResemble, utils.StringSetVal{"vendor-1.com/feature-1": struct{}{}, "vendor-2.io/feature-2": struct{}{}}) // from cmdline
|
||||||
So(master.config.DenyLabelNs, ShouldResemble, utils.StringSetVal{"denied.ns.io": struct{}{}, "denied.kubernetes.io": struct{}{}})
|
So(master.config.DenyLabelNs, ShouldResemble, utils.StringSetVal{"denied.ns.io": struct{}{}, "denied.kubernetes.io": struct{}{}})
|
||||||
So(master.config.LabelWhiteList.String(), ShouldEqual, "foo")
|
So(master.config.LabelWhiteList.String(), ShouldEqual, "foo")
|
||||||
|
So(master.config.LeaderElection.LeaseDuration.Seconds(), ShouldEqual, float64(20))
|
||||||
|
So(master.config.LeaderElection.RenewDeadline.Seconds(), ShouldEqual, float64(4))
|
||||||
|
So(master.config.LeaderElection.RetryPeriod.Seconds(), ShouldEqual, float64(30))
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
"google.golang.org/grpc/credentials"
|
"google.golang.org/grpc/credentials"
|
||||||
|
@ -38,9 +39,12 @@ import (
|
||||||
"google.golang.org/grpc/peer"
|
"google.golang.org/grpc/peer"
|
||||||
corev1 "k8s.io/api/core/v1"
|
corev1 "k8s.io/api/core/v1"
|
||||||
k8sQuantity "k8s.io/apimachinery/pkg/api/resource"
|
k8sQuantity "k8s.io/apimachinery/pkg/api/resource"
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
k8sLabels "k8s.io/apimachinery/pkg/labels"
|
k8sLabels "k8s.io/apimachinery/pkg/labels"
|
||||||
"k8s.io/client-go/kubernetes"
|
"k8s.io/client-go/kubernetes"
|
||||||
restclient "k8s.io/client-go/rest"
|
restclient "k8s.io/client-go/rest"
|
||||||
|
"k8s.io/client-go/tools/leaderelection"
|
||||||
|
"k8s.io/client-go/tools/leaderelection/resourcelock"
|
||||||
"k8s.io/klog/v2"
|
"k8s.io/klog/v2"
|
||||||
controller "k8s.io/kubernetes/pkg/controller"
|
controller "k8s.io/kubernetes/pkg/controller"
|
||||||
taintutils "k8s.io/kubernetes/pkg/util/taints"
|
taintutils "k8s.io/kubernetes/pkg/util/taints"
|
||||||
|
@ -71,6 +75,14 @@ type NFDConfig struct {
|
||||||
ResourceLabels utils.StringSetVal
|
ResourceLabels utils.StringSetVal
|
||||||
EnableTaints bool
|
EnableTaints bool
|
||||||
ResyncPeriod utils.DurationVal
|
ResyncPeriod utils.DurationVal
|
||||||
|
LeaderElection LeaderElectionConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// LeaderElectionConfig contains the configuration for leader election
|
||||||
|
type LeaderElectionConfig struct {
|
||||||
|
LeaseDuration utils.DurationVal
|
||||||
|
RenewDeadline utils.DurationVal
|
||||||
|
RetryPeriod utils.DurationVal
|
||||||
}
|
}
|
||||||
|
|
||||||
// ConfigOverrideArgs are args that override config file options
|
// ConfigOverrideArgs are args that override config file options
|
||||||
|
@ -98,6 +110,7 @@ type Args struct {
|
||||||
Prune bool
|
Prune bool
|
||||||
VerifyNodeName bool
|
VerifyNodeName bool
|
||||||
Options string
|
Options string
|
||||||
|
EnableLeaderElection bool
|
||||||
|
|
||||||
Overrides ConfigOverrideArgs
|
Overrides ConfigOverrideArgs
|
||||||
}
|
}
|
||||||
|
@ -174,6 +187,11 @@ func newDefaultConfig() *NFDConfig {
|
||||||
ResourceLabels: utils.StringSetVal{},
|
ResourceLabels: utils.StringSetVal{},
|
||||||
EnableTaints: false,
|
EnableTaints: false,
|
||||||
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
|
ResyncPeriod: utils.DurationVal{Duration: time.Duration(1) * time.Hour},
|
||||||
|
LeaderElection: LeaderElectionConfig{
|
||||||
|
LeaseDuration: utils.DurationVal{Duration: time.Duration(15) * time.Second},
|
||||||
|
RetryPeriod: utils.DurationVal{Duration: time.Duration(2) * time.Second},
|
||||||
|
RenewDeadline: utils.DurationVal{Duration: time.Duration(10) * time.Second},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -221,7 +239,11 @@ func (m *nfdMaster) Run() error {
|
||||||
|
|
||||||
// Run updater that handles events from the nfd CRD API.
|
// Run updater that handles events from the nfd CRD API.
|
||||||
if m.nfdController != nil {
|
if m.nfdController != nil {
|
||||||
go m.nfdAPIUpdateHandler()
|
if m.args.EnableLeaderElection {
|
||||||
|
go m.nfdAPIUpdateHandlerWithLeaderElection()
|
||||||
|
} else {
|
||||||
|
go m.nfdAPIUpdateHandler()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Notify that we're ready to accept connections
|
// Notify that we're ready to accept connections
|
||||||
|
@ -1230,3 +1252,47 @@ func (m *nfdMaster) startNfdApiController() error {
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *nfdMaster) nfdAPIUpdateHandlerWithLeaderElection() {
|
||||||
|
ctx := context.Background()
|
||||||
|
client, err := m.apihelper.GetClient()
|
||||||
|
if err != nil {
|
||||||
|
klog.ErrorS(err, "failed to get Kubernetes client")
|
||||||
|
m.Stop()
|
||||||
|
}
|
||||||
|
lock := &resourcelock.LeaseLock{
|
||||||
|
LeaseMeta: metav1.ObjectMeta{
|
||||||
|
Name: "nfd-master.nfd.kubernetes.io",
|
||||||
|
Namespace: m.namespace,
|
||||||
|
},
|
||||||
|
Client: client.CoordinationV1(),
|
||||||
|
LockConfig: resourcelock.ResourceLockConfig{
|
||||||
|
// add uuid to prevent situation where 2 nfd-master nodes run on same node
|
||||||
|
Identity: m.nodeName + "_" + uuid.NewString(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
config := leaderelection.LeaderElectionConfig{
|
||||||
|
Lock: lock,
|
||||||
|
// make it configurable?
|
||||||
|
LeaseDuration: m.config.LeaderElection.LeaseDuration.Duration,
|
||||||
|
RetryPeriod: m.config.LeaderElection.RetryPeriod.Duration,
|
||||||
|
RenewDeadline: m.config.LeaderElection.RenewDeadline.Duration,
|
||||||
|
Callbacks: leaderelection.LeaderCallbacks{
|
||||||
|
OnStartedLeading: func(_ context.Context) {
|
||||||
|
m.nfdAPIUpdateHandler()
|
||||||
|
},
|
||||||
|
OnStoppedLeading: func() {
|
||||||
|
// We lost the lock.
|
||||||
|
klog.ErrorS(err, "leaderelection lock was lost")
|
||||||
|
m.Stop()
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
leaderElector, err := leaderelection.NewLeaderElector(config)
|
||||||
|
if err != nil {
|
||||||
|
klog.ErrorS(err, "couldn't create leader elector")
|
||||||
|
m.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
|
leaderElector.Run(ctx)
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue