1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

nfd-master: run a separate gRPC health server

This patch separates the gRPC health server from the deprecated gRPC
server (disabled by default, replaced by the NodeFeature CRD API) used
for node labeling requests. The new health server runs on hardcoded TCP
port number 8082.

The main motivation for this change is to make Kubernetes' built-in
gRPC liveness probes work when TLS is enabled (the probes themselves
don't support TLS).

The health server itself is a naive implementation (as it was before),
basically only checking that nfd-master has started and hasn't crashed.
The patch adds a TODO note to improve the functionality.
This commit is contained in:
Markus Lehtonen 2024-01-03 22:02:18 +02:00
parent b3919f3e0d
commit a053efda64
4 changed files with 56 additions and 15 deletions

View file

@ -32,7 +32,8 @@ import (
const (
// ProgramName is the canonical name of this program
ProgramName = "nfd-master"
ProgramName = "nfd-master"
GrpcHealthPort = 8082
)
func main() {
@ -100,6 +101,7 @@ func main() {
utils.ConfigureGrpcKlog()
// Get new NfdMaster instance
args.GrpcHealthPort = GrpcHealthPort
instance, err := master.NewNfdMaster(args)
if err != nil {
klog.ErrorS(err, "failed to initialize NfdMaster instance")

View file

@ -23,12 +23,12 @@ spec:
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
@ -37,5 +37,3 @@ spec:
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080

View file

@ -43,12 +43,12 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10

View file

@ -116,6 +116,9 @@ type Args struct {
CrdController bool
EnableNodeFeatureApi bool
Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
Prune bool
VerifyNodeName bool
Options string
@ -144,6 +147,7 @@ type nfdMaster struct {
nodeName string
configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
@ -270,7 +274,11 @@ func (m *nfdMaster) Run() error {
// Run gRPC server
grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr)
// If the NodeFeature API is enabled, don't register the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
go m.runGrpcServer(grpcErr)
}
// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
@ -281,6 +289,13 @@ func (m *nfdMaster) Run() error {
}
}
// Start gRPC server for liveness probe (at this point we're "live")
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Notify that we're ready to accept connections
m.ready <- true
close(m.ready)
@ -323,6 +338,32 @@ func (m *nfdMaster) Run() error {
}
}
// startGrpcHealthServer brings up a dedicated gRPC health service, intended
// for Kubernetes readiness/liveness probes, on the TCP port given by
// m.args.GrpcHealthPort. The server is served from a background goroutine and
// a non-nil error returned by Serve is forwarded to errChan. The function
// itself only fails if the listening socket cannot be opened. On success the
// created server is stored in m.healthServer.
// TODO: improve status checking e.g. with watchdog in the main event loop and
// checking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
	port := m.args.GrpcHealthPort
	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
	if err != nil {
		return fmt.Errorf("failed to listen: %w", err)
	}

	healthSrv := grpc.NewServer()
	grpc_health_v1.RegisterHealthServer(healthSrv, health.NewServer())
	klog.InfoS("gRPC health server serving", "port", port)

	// Serve in the background so that the caller is not blocked.
	serve := func() {
		defer listener.Close()

		serveErr := healthSrv.Serve(listener)
		if serveErr != nil {
			errChan <- fmt.Errorf("gRPC health server exited with an error: %w", serveErr)
		}
		klog.InfoS("gRPC health server stopped")
	}
	go serve()

	m.healthServer = healthSrv
	return nil
}
func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
// Create server listening for TCP connections
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
@ -352,13 +393,8 @@ func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
}
m.server = grpc.NewServer(serverOpts...)
// If the NodeFeature API is enabled, don't register the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
pb.RegisterLabelerServer(m.server, m)
}
pb.RegisterLabelerServer(m.server, m)
grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
klog.InfoS("gRPC server serving", "port", m.args.Port)
// Run gRPC server
@ -421,7 +457,12 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
// Stop NfdMaster
func (m *nfdMaster) Stop() {
m.server.GracefulStop()
if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}
if m.nfdController != nil {
m.nfdController.stop()