1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2025-03-06 16:57:10 +00:00

Merge pull request #1946 from marquiz/devel/health-master

nfd-master: serve metrics and healthz endpoint on the same port
This commit is contained in:
Kubernetes Prow Robot 2025-02-17 01:22:22 -08:00 committed by GitHub
commit 26534880dd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 63 additions and 129 deletions

View file

@ -108,10 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Config file to use.") "Config file to use.")
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "", flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
"Kubeconfig to use") "Kubeconfig to use")
flagset.IntVar(&args.MetricsPort, "metrics", 8081, flagset.IntVar(&args.Port, "port", 8080,
"Port on which to expose metrics.") "Port which metrics and healthz endpoints are served on")
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
"Port on which to expose the grpc health endpoint.")
flagset.BoolVar(&args.Prune, "prune", false, flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluster and exit.") "Prune all NFD related attributes from all nodes of the cluster and exit.")
flagset.StringVar(&args.Options, "options", "", flagset.StringVar(&args.Options, "options", "",

View file

@ -33,14 +33,16 @@ spec:
port: 8082 port: 8082
failureThreshold: 30 failureThreshold: 30
livenessProbe: livenessProbe:
grpc: httpGet:
port: 8082 path: /healthz
port: http
readinessProbe: readinessProbe:
grpc: httpGet:
port: 8082 path: /healthz
port: http
failureThreshold: 10 failureThreshold: 10
command: command:
- "nfd-master" - "nfd-master"
ports: ports:
- name: metrics - name: http
containerPort: 8081 containerPort: 8080

View file

@ -49,8 +49,9 @@ spec:
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }} imagePullPolicy: {{ .Values.image.pullPolicy }}
startupProbe: startupProbe:
grpc: httpGet:
port: {{ .Values.master.healthPort | default "8082" }} path: /healthz
port: http
{{- with .Values.master.startupProbe.initialDelaySeconds }} {{- with .Values.master.startupProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }} initialDelaySeconds: {{ . }}
{{- end }} {{- end }}
@ -64,8 +65,9 @@ spec:
timeoutSeconds: {{ . }} timeoutSeconds: {{ . }}
{{- end }} {{- end }}
livenessProbe: livenessProbe:
grpc: httpGet:
port: {{ .Values.master.healthPort | default "8082" }} path: /healthz
port: http
{{- with .Values.master.livenessProbe.initialDelaySeconds }} {{- with .Values.master.livenessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }} initialDelaySeconds: {{ . }}
{{- end }} {{- end }}
@ -79,8 +81,9 @@ spec:
timeoutSeconds: {{ . }} timeoutSeconds: {{ . }}
{{- end }} {{- end }}
readinessProbe: readinessProbe:
grpc: httpGet:
port: {{ .Values.master.healthPort | default "8082" }} path: /healthz
port: http
{{- with .Values.master.readinessProbe.initialDelaySeconds }} {{- with .Values.master.readinessProbe.initialDelaySeconds }}
initialDelaySeconds: {{ . }} initialDelaySeconds: {{ . }}
{{- end }} {{- end }}
@ -97,10 +100,8 @@ spec:
successThreshold: {{ . }} successThreshold: {{ . }}
{{- end }} {{- end }}
ports: ports:
- containerPort: {{ .Values.master.metricsPort | default "8081" }} - containerPort: {{ .Values.master.port | default "8080" }}
name: metrics name: http
- containerPort: {{ .Values.master.healthPort | default "8082" }}
name: health
env: env:
- name: NODE_NAME - name: NODE_NAME
valueFrom: valueFrom:
@ -140,8 +141,7 @@ spec:
{{- range $key, $value := .Values.featureGates }} {{- range $key, $value := .Values.featureGates }}
- "-feature-gates={{ $key }}={{ $value }}" - "-feature-gates={{ $key }}={{ $value }}"
{{- end }} {{- end }}
- "-metrics={{ .Values.master.metricsPort | default "8081" }}" - "-port={{ .Values.master.port | default "8080" }}"
- "-grpc-health={{ .Values.master.healthPort | default "8082" }}"
{{- with .Values.master.extraArgs }} {{- with .Values.master.extraArgs }}
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}

View file

@ -67,8 +67,7 @@ master:
# retryPeriod: 2s # retryPeriod: 2s
# nfdApiParallelism: 10 # nfdApiParallelism: 10
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE> ### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
metricsPort: 8081 port: 8080
healthPort: 8082
instance: instance:
featureApi: featureApi:
resyncPeriod: resyncPeriod:
@ -149,20 +148,14 @@ master:
values: [""] values: [""]
startupProbe: startupProbe:
grpc:
port: 8082
failureThreshold: 30 failureThreshold: 30
# periodSeconds: 10 # periodSeconds: 10
livenessProbe: livenessProbe: {}
grpc:
port: 8082
# failureThreshold: 3 # failureThreshold: 3
# initialDelaySeconds: 0 # initialDelaySeconds: 0
# periodSeconds: 10 # periodSeconds: 10
# timeoutSeconds: 1 # timeoutSeconds: 1
readinessProbe: readinessProbe:
grpc:
port: 8082
failureThreshold: 10 failureThreshold: 10
# initialDelaySeconds: 0 # initialDelaySeconds: 0
# periodSeconds: 10 # periodSeconds: 10

View file

@ -176,8 +176,7 @@ API's you need to install the prometheus operator in your cluster.
| `master.*` | dict | | NFD master deployment configuration | | `master.*` | dict | | NFD master deployment configuration |
| `master.enable` | bool | true | Specifies whether nfd-master should be deployed | | `master.enable` | bool | true | Specifies whether nfd-master should be deployed |
| `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace | | `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. | | `master.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
| `master.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments | | `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
| `master.resyncPeriod` | string | | NFD API controller resync period. | | `master.resyncPeriod` | string | | NFD API controller resync period. |
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces | | `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |

View file

@ -20,7 +20,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"maps" "maps"
"net" "net/http"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
@ -31,10 +31,9 @@ import (
"time" "time"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"golang.org/x/net/context" "golang.org/x/net/context"
"google.golang.org/grpc"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality" apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -115,18 +114,14 @@ type ConfigOverrideArgs struct {
// Args holds command line arguments // Args holds command line arguments
type Args struct { type Args struct {
ConfigFile string ConfigFile string
Instance string Instance string
Klog map[string]*utils.KlogFlagVal Klog map[string]*utils.KlogFlagVal
Kubeconfig string Kubeconfig string
Port int Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
Prune bool Prune bool
Options string Options string
EnableLeaderElection bool EnableLeaderElection bool
MetricsPort int
Overrides ConfigOverrideArgs Overrides ConfigOverrideArgs
} }
@ -139,7 +134,6 @@ type deniedNs struct {
type NfdMaster interface { type NfdMaster interface {
Run() error Run() error
Stop() Stop()
WaitForReady(time.Duration) bool
} }
type nfdMaster struct { type nfdMaster struct {
@ -149,10 +143,7 @@ type nfdMaster struct {
namespace string namespace string
nodeName string nodeName string
configFilePath string configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{} stop chan struct{}
ready chan struct{}
kubeconfig *restclient.Config kubeconfig *restclient.Config
k8sClient k8sclient.Interface k8sClient k8sclient.Interface
nfdClient nfdclientset.Interface nfdClient nfdclientset.Interface
@ -166,7 +157,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) {
nfd := &nfdMaster{ nfd := &nfdMaster{
nodeName: utils.NodeName(), nodeName: utils.NodeName(),
namespace: utils.GetKubernetesNamespace(), namespace: utils.GetKubernetesNamespace(),
ready: make(chan struct{}),
stop: make(chan struct{}), stop: make(chan struct{}),
} }
@ -298,22 +288,22 @@ func (m *nfdMaster) Run() error {
} }
} }
httpMux := http.NewServeMux()
// Register to metrics server // Register to metrics server
if m.args.MetricsPort > 0 { promRegistry := prometheus.NewRegistry()
m := utils.CreateMetricsServer(m.args.MetricsPort, promRegistry.MustRegister(
buildInfo, buildInfo,
nodeUpdateRequests, nodeUpdateRequests,
nodeUpdates, nodeUpdates,
nodeUpdateFailures, nodeUpdateFailures,
nodeLabelsRejected, nodeLabelsRejected,
nodeERsRejected, nodeERsRejected,
nodeTaintsRejected, nodeTaintsRejected,
nfrProcessingTime, nfrProcessingTime,
nfrProcessingErrors) nfrProcessingErrors)
go m.Run() httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
registerVersion(version.Get()) registerVersion(version.Get())
defer m.Stop()
}
// Run updater that handles events from the nfd CRD API. // Run updater that handles events from the nfd CRD API.
if m.nfdController != nil { if m.nfdController != nil {
@ -324,60 +314,29 @@ func (m *nfdMaster) Run() error {
} }
} }
// Start gRPC server for liveness probe (at this point we're "live") // Register health probe (at this point we're "ready and live")
grpcErr := make(chan error) httpMux.HandleFunc("/healthz", m.Healthz)
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}
}
// Notify that we're ready to accept connections // Start HTTP server
close(m.ready) httpServer := http.Server{Addr: fmt.Sprintf(":%d", m.args.Port), Handler: httpMux}
go func() {
klog.InfoS("http server starting", "port", httpServer.Addr)
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
}()
defer httpServer.Close()
// NFD-Master main event loop <-m.stop
for { klog.InfoS("shutting down nfd-master")
select { return nil
case err := <-grpcErr:
return fmt.Errorf("error in serving gRPC: %w", err)
case <-m.stop:
klog.InfoS("shutting down nfd-master")
return nil
}
}
} }
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes. func (m *nfdMaster) Healthz(writer http.ResponseWriter, _ *http.Request) {
// TODO: improve status checking e.g. with watchdog in the main event loop and writer.WriteHeader(http.StatusOK)
// checking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}
s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")
}()
m.healthServer = s
return nil
} }
// nfdAPIUpdateHandler handles events from the nfd API controller. // nfdAPIUpdateHandler handles events from the nfd API controller.
func (m *nfdMaster) nfdAPIUpdateHandler() { func (m *nfdMaster) nfdAPIUpdateHandler() {
// We want to unconditionally update all nodes at startup if gRPC is // We want to unconditionally update all nodes at startup
// disabled (i.e. NodeFeature API is enabled)
updateAll := true updateAll := true
updateNodes := make(map[string]struct{}) updateNodes := make(map[string]struct{})
nodeFeatureGroup := make(map[string]struct{}) nodeFeatureGroup := make(map[string]struct{})
@ -431,13 +390,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
// Stop NfdMaster // Stop NfdMaster
func (m *nfdMaster) Stop() { func (m *nfdMaster) Stop() {
if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}
if m.nfdController != nil { if m.nfdController != nil {
m.nfdController.stop() m.nfdController.stop()
} }
@ -447,16 +399,6 @@ func (m *nfdMaster) Stop() {
close(m.stop) close(m.stop)
} }
// Wait until NfdMaster is able to accept connections.
func (m *nfdMaster) WaitForReady(timeout time.Duration) bool {
select {
case <-m.ready:
return true
case <-time.After(timeout):
}
return false
}
// Prune erases all NFD related properties from the node objects of the cluster. // Prune erases all NFD related properties from the node objects of the cluster.
func (m *nfdMaster) prune() error { func (m *nfdMaster) prune() error {
if m.config.NoPublish { if m.config.NoPublish {