1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

nfd-master: run a separate gRPC health server

This patch separates the gRPC health server from the deprecated gRPC
server (disabled by default, replaced by the NodeFeature CRD API) used
for node labeling requests. The new health server runs on hardcoded TCP
port number 8082.

The main motivation for this change is to make Kubernetes' built-in
gRPC liveness probes work when TLS is enabled (as they don't
support TLS).

The health server itself is a naive implementation (as it was before),
basically only checking that nfd-master has started and hasn't crashed.
The patch adds a TODO note to improve the functionality.
This commit is contained in:
Markus Lehtonen 2024-01-03 22:02:18 +02:00
parent b3919f3e0d
commit a053efda64
4 changed files with 56 additions and 15 deletions

View file

@ -32,7 +32,8 @@ import (
const ( const (
// ProgramName is the canonical name of this program // ProgramName is the canonical name of this program
ProgramName = "nfd-master" ProgramName = "nfd-master"
GrpcHealthPort = 8082
) )
func main() { func main() {
@ -100,6 +101,7 @@ func main() {
utils.ConfigureGrpcKlog() utils.ConfigureGrpcKlog()
// Get new NfdMaster instance // Get new NfdMaster instance
args.GrpcHealthPort = GrpcHealthPort
instance, err := master.NewNfdMaster(args) instance, err := master.NewNfdMaster(args)
if err != nil { if err != nil {
klog.ErrorS(err, "failed to initialize NfdMaster instance") klog.ErrorS(err, "failed to initialize NfdMaster instance")

View file

@ -23,12 +23,12 @@ spec:
imagePullPolicy: Always imagePullPolicy: Always
livenessProbe: livenessProbe:
grpc: grpc:
port: 8080 port: 8082
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
readinessProbe: readinessProbe:
grpc: grpc:
port: 8080 port: 8082
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 10 periodSeconds: 10
failureThreshold: 10 failureThreshold: 10
@ -37,5 +37,3 @@ spec:
ports: ports:
- name: metrics - name: metrics
containerPort: 8081 containerPort: 8081
- name: grpc
containerPort: 8080

View file

@ -43,12 +43,12 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }} imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe: livenessProbe:
grpc: grpc:
port: 8080 port: 8082
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
readinessProbe: readinessProbe:
grpc: grpc:
port: 8080 port: 8082
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 10 periodSeconds: 10
failureThreshold: 10 failureThreshold: 10

View file

@ -116,6 +116,9 @@ type Args struct {
CrdController bool CrdController bool
EnableNodeFeatureApi bool EnableNodeFeatureApi bool
Port int Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
Prune bool Prune bool
VerifyNodeName bool VerifyNodeName bool
Options string Options string
@ -144,6 +147,7 @@ type nfdMaster struct {
nodeName string nodeName string
configFilePath string configFilePath string
server *grpc.Server server *grpc.Server
healthServer *grpc.Server
stop chan struct{} stop chan struct{}
ready chan bool ready chan bool
apihelper apihelper.APIHelpers apihelper apihelper.APIHelpers
@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {
// Run gRPC server // Run gRPC server
grpcErr := make(chan error, 1) grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr) // If the NodeFeature API is enabled, don'tregister the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
go m.runGrpcServer(grpcErr)
}
// Run updater that handles events from the nfd CRD API. // Run updater that handles events from the nfd CRD API.
if m.nfdController != nil { if m.nfdController != nil {
@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
} }
} }
// Start gRPC server for liveness probe (at this point we're "live")
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Notify that we're ready to accept connections // Notify that we're ready to accept connections
m.ready <- true m.ready <- true
close(m.ready) close(m.ready)
@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
} }
} }
// startGrpcHealthServer brings up a dedicated gRPC health service for
// Kubernetes readiness/liveness probes on the TCP port given by
// m.args.GrpcHealthPort. It returns an error if the port cannot be listened
// on. Otherwise the server is served from a background goroutine, any error
// returned by Serve is sent to errChan, and the created server is stored in
// m.healthServer.
// TODO: improve status checking e.g. with watchdog in the main event loop and
// checking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
	port := m.args.GrpcHealthPort

	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
	if err != nil {
		return fmt.Errorf("failed to listen: %w", err)
	}

	healthSrv := grpc.NewServer()
	grpc_health_v1.RegisterHealthServer(healthSrv, health.NewServer())
	m.healthServer = healthSrv

	klog.InfoS("gRPC health server serving", "port", port)

	// Serve blocks until the server is stopped, so it runs in its own
	// goroutine; the listener is released once serving ends.
	serve := func() {
		defer listener.Close()

		if serveErr := healthSrv.Serve(listener); serveErr != nil {
			errChan <- fmt.Errorf("gRPC health server exited with an error: %w", serveErr)
		}
		klog.InfoS("gRPC health server stopped")
	}
	go serve()

	return nil
}
func (m *nfdMaster) runGrpcServer(errChan chan<- error) { func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
// Create server listening for TCP connections // Create server listening for TCP connections
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port)) lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
} }
m.server = grpc.NewServer(serverOpts...) m.server = grpc.NewServer(serverOpts...)
// If the NodeFeature API is enabled, don'tregister the labeler API pb.RegisterLabelerServer(m.server, m)
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
pb.RegisterLabelerServer(m.server, m)
}
grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
klog.InfoS("gRPC server serving", "port", m.args.Port) klog.InfoS("gRPC server serving", "port", m.args.Port)
// Run gRPC server // Run gRPC server
@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
// Stop NfdMaster // Stop NfdMaster
func (m *nfdMaster) Stop() { func (m *nfdMaster) Stop() {
m.server.GracefulStop() if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}
if m.nfdController != nil { if m.nfdController != nil {
m.nfdController.stop() m.nfdController.stop()