mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
nfd-master: run a separate gRPC health server
This patch separates the gRPC health server from the deprecated gRPC server (disabled by default, replaced by the NodeFeature CRD API) used for node labeling requests. The new health server runs on the hardcoded TCP port number 8082. The main motivation for this change is to make Kubernetes' built-in gRPC liveness probes work when TLS is enabled (the probes don't support TLS). The health server itself is a naive implementation (as it was before), basically only checking that nfd-master has started and hasn't crashed. The patch adds a TODO note to improve the functionality.
This commit is contained in:
parent
b3919f3e0d
commit
a053efda64
4 changed files with 56 additions and 15 deletions
|
@ -33,6 +33,7 @@ import (
|
|||
const (
|
||||
// ProgramName is the canonical name of this program
|
||||
ProgramName = "nfd-master"
|
||||
// GrpcHealthPort is the TCP port number that main() passes to NfdMaster
// (via args.GrpcHealthPort) for the separate gRPC health server.
GrpcHealthPort = 8082
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
@ -100,6 +101,7 @@ func main() {
|
|||
utils.ConfigureGrpcKlog()
|
||||
|
||||
// Get new NfdMaster instance
|
||||
args.GrpcHealthPort = GrpcHealthPort
|
||||
instance, err := master.NewNfdMaster(args)
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "failed to initialize NfdMaster instance")
|
||||
|
|
|
@ -23,12 +23,12 @@ spec:
|
|||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
grpc:
|
||||
port: 8080
|
||||
port: 8082
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
grpc:
|
||||
port: 8080
|
||||
port: 8082
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
failureThreshold: 10
|
||||
|
@ -37,5 +37,3 @@ spec:
|
|||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8081
|
||||
- name: grpc
|
||||
containerPort: 8080
|
||||
|
|
|
@ -43,12 +43,12 @@ spec:
|
|||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
livenessProbe:
|
||||
grpc:
|
||||
port: 8080
|
||||
port: 8082
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
grpc:
|
||||
port: 8080
|
||||
port: 8082
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
failureThreshold: 10
|
||||
|
|
|
@ -116,6 +116,9 @@ type Args struct {
|
|||
CrdController bool
|
||||
EnableNodeFeatureApi bool
|
||||
Port int
|
||||
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
|
||||
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
|
||||
GrpcHealthPort int
|
||||
Prune bool
|
||||
VerifyNodeName bool
|
||||
Options string
|
||||
|
@ -144,6 +147,7 @@ type nfdMaster struct {
|
|||
nodeName string
|
||||
configFilePath string
|
||||
server *grpc.Server
|
||||
healthServer *grpc.Server
|
||||
stop chan struct{}
|
||||
ready chan bool
|
||||
apihelper apihelper.APIHelpers
|
||||
|
@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {
|
|||
|
||||
// Run gRPC server
|
||||
grpcErr := make(chan error, 1)
|
||||
// If the NodeFeature API is enabled, don't register the labeler API
|
||||
// server. Otherwise, register the labeler server.
|
||||
if !m.args.EnableNodeFeatureApi {
|
||||
go m.runGrpcServer(grpcErr)
|
||||
}
|
||||
|
||||
// Run updater that handles events from the nfd CRD API.
|
||||
if m.nfdController != nil {
|
||||
|
@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
|
|||
}
|
||||
}
|
||||
|
||||
// Start gRPC server for liveness probe (at this point we're "live")
|
||||
if m.args.GrpcHealthPort != 0 {
|
||||
if err := m.startGrpcHealthServer(grpcErr); err != nil {
|
||||
return fmt.Errorf("failed to start gRPC health server: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Notify that we're ready to accept connections
|
||||
m.ready <- true
|
||||
close(m.ready)
|
||||
|
@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
|
|||
}
|
||||
}
|
||||
|
||||
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
//
// It listens on the TCP port given by m.args.GrpcHealthPort, registers the
// gRPC health service and serves it from a background goroutine. The created
// server is stored in m.healthServer. A failure to listen is returned
// directly; a non-nil error from Serve is sent to errChan.
|
||||
// TODO: improve status checking e.g. with watchdog in the main event loop and
|
||||
// checking that node updater pool is alive.
|
||||
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
|
||||
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to listen: %w", err)
|
||||
}
|
||||
|
||||
// No server options are passed here, unlike the labeler server which is
// created with serverOpts.
s := grpc.NewServer()
|
||||
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
|
||||
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
|
||||
|
||||
// Serve in the background so that the caller is not blocked.
go func() {
|
||||
// Close the listener when this goroutine exits.
defer func() {
|
||||
lis.Close()
|
||||
}()
|
||||
if err := s.Serve(lis); err != nil {
|
||||
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
|
||||
}
|
||||
klog.InfoS("gRPC health server stopped")
|
||||
}()
|
||||
// Keep a reference so that Stop() can gracefully stop the server.
m.healthServer = s
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
|
||||
// Create server listening for TCP connections
|
||||
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
|
||||
|
@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
|
|||
}
|
||||
m.server = grpc.NewServer(serverOpts...)
|
||||
|
||||
// If the NodeFeature API is enabled, don't register the labeler API
|
||||
// server. Otherwise, register the labeler server.
|
||||
if !m.args.EnableNodeFeatureApi {
|
||||
pb.RegisterLabelerServer(m.server, m)
|
||||
}
|
||||
|
||||
grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
|
||||
klog.InfoS("gRPC server serving", "port", m.args.Port)
|
||||
|
||||
// Run gRPC server
|
||||
|
@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
|
|||
|
||||
// Stop NfdMaster
|
||||
func (m *nfdMaster) Stop() {
|
||||
if m.server != nil {
|
||||
m.server.GracefulStop()
|
||||
}
|
||||
if m.healthServer != nil {
|
||||
m.healthServer.GracefulStop()
|
||||
}
|
||||
|
||||
if m.nfdController != nil {
|
||||
m.nfdController.stop()
|
||||
|
|
Loading…
Reference in a new issue