mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-06 16:57:10 +00:00
Merge pull request #1946 from marquiz/devel/health-master
nfd-master: serve metrics and healthz endpoint on the same port
This commit is contained in:
commit
26534880dd
6 changed files with 63 additions and 129 deletions
|
@ -108,10 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
|
||||||
"Config file to use.")
|
"Config file to use.")
|
||||||
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
|
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
|
||||||
"Kubeconfig to use")
|
"Kubeconfig to use")
|
||||||
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
flagset.IntVar(&args.Port, "port", 8080,
|
||||||
"Port on which to expose metrics.")
|
"Port which metrics and healthz endpoints are served on")
|
||||||
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
|
|
||||||
"Port on which to expose the grpc health endpoint.")
|
|
||||||
flagset.BoolVar(&args.Prune, "prune", false,
|
flagset.BoolVar(&args.Prune, "prune", false,
|
||||||
"Prune all NFD related attributes from all nodes of the cluster and exit.")
|
"Prune all NFD related attributes from all nodes of the cluster and exit.")
|
||||||
flagset.StringVar(&args.Options, "options", "",
|
flagset.StringVar(&args.Options, "options", "",
|
||||||
|
|
|
@ -33,14 +33,16 @@ spec:
|
||||||
port: 8082
|
port: 8082
|
||||||
failureThreshold: 30
|
failureThreshold: 30
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
grpc:
|
httpGet:
|
||||||
port: 8082
|
path: /healthz
|
||||||
|
port: http
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
grpc:
|
httpGet:
|
||||||
port: 8082
|
path: /healthz
|
||||||
|
port: http
|
||||||
failureThreshold: 10
|
failureThreshold: 10
|
||||||
command:
|
command:
|
||||||
- "nfd-master"
|
- "nfd-master"
|
||||||
ports:
|
ports:
|
||||||
- name: metrics
|
- name: http
|
||||||
containerPort: 8081
|
containerPort: 8080
|
||||||
|
|
|
@ -49,8 +49,9 @@ spec:
|
||||||
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
|
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
|
||||||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||||
startupProbe:
|
startupProbe:
|
||||||
grpc:
|
httpGet:
|
||||||
port: {{ .Values.master.healthPort | default "8082" }}
|
path: /healthz
|
||||||
|
port: http
|
||||||
{{- with .Values.master.startupProbe.initialDelaySeconds }}
|
{{- with .Values.master.startupProbe.initialDelaySeconds }}
|
||||||
initialDelaySeconds: {{ . }}
|
initialDelaySeconds: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
@ -64,8 +65,9 @@ spec:
|
||||||
timeoutSeconds: {{ . }}
|
timeoutSeconds: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
grpc:
|
httpGet:
|
||||||
port: {{ .Values.master.healthPort | default "8082" }}
|
path: /healthz
|
||||||
|
port: http
|
||||||
{{- with .Values.master.livenessProbe.initialDelaySeconds }}
|
{{- with .Values.master.livenessProbe.initialDelaySeconds }}
|
||||||
initialDelaySeconds: {{ . }}
|
initialDelaySeconds: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
@ -79,8 +81,9 @@ spec:
|
||||||
timeoutSeconds: {{ . }}
|
timeoutSeconds: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
grpc:
|
httpGet:
|
||||||
port: {{ .Values.master.healthPort | default "8082" }}
|
path: /healthz
|
||||||
|
port: http
|
||||||
{{- with .Values.master.readinessProbe.initialDelaySeconds }}
|
{{- with .Values.master.readinessProbe.initialDelaySeconds }}
|
||||||
initialDelaySeconds: {{ . }}
|
initialDelaySeconds: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
@ -97,10 +100,8 @@ spec:
|
||||||
successThreshold: {{ . }}
|
successThreshold: {{ . }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
ports:
|
ports:
|
||||||
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
|
- containerPort: {{ .Values.master.port | default "8080" }}
|
||||||
name: metrics
|
name: http
|
||||||
- containerPort: {{ .Values.master.healthPort | default "8082" }}
|
|
||||||
name: health
|
|
||||||
env:
|
env:
|
||||||
- name: NODE_NAME
|
- name: NODE_NAME
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
@ -140,8 +141,7 @@ spec:
|
||||||
{{- range $key, $value := .Values.featureGates }}
|
{{- range $key, $value := .Values.featureGates }}
|
||||||
- "-feature-gates={{ $key }}={{ $value }}"
|
- "-feature-gates={{ $key }}={{ $value }}"
|
||||||
{{- end }}
|
{{- end }}
|
||||||
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
|
- "-port={{ .Values.master.port | default "8080" }}"
|
||||||
- "-grpc-health={{ .Values.master.healthPort | default "8082" }}"
|
|
||||||
{{- with .Values.master.extraArgs }}
|
{{- with .Values.master.extraArgs }}
|
||||||
{{- toYaml . | nindent 12 }}
|
{{- toYaml . | nindent 12 }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
|
@ -67,8 +67,7 @@ master:
|
||||||
# retryPeriod: 2s
|
# retryPeriod: 2s
|
||||||
# nfdApiParallelism: 10
|
# nfdApiParallelism: 10
|
||||||
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
|
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
|
||||||
metricsPort: 8081
|
port: 8080
|
||||||
healthPort: 8082
|
|
||||||
instance:
|
instance:
|
||||||
featureApi:
|
featureApi:
|
||||||
resyncPeriod:
|
resyncPeriod:
|
||||||
|
@ -149,20 +148,14 @@ master:
|
||||||
values: [""]
|
values: [""]
|
||||||
|
|
||||||
startupProbe:
|
startupProbe:
|
||||||
grpc:
|
|
||||||
port: 8082
|
|
||||||
failureThreshold: 30
|
failureThreshold: 30
|
||||||
# periodSeconds: 10
|
# periodSeconds: 10
|
||||||
livenessProbe:
|
livenessProbe: {}
|
||||||
grpc:
|
|
||||||
port: 8082
|
|
||||||
# failureThreshold: 3
|
# failureThreshold: 3
|
||||||
# initialDelaySeconds: 0
|
# initialDelaySeconds: 0
|
||||||
# periodSeconds: 10
|
# periodSeconds: 10
|
||||||
# timeoutSeconds: 1
|
# timeoutSeconds: 1
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
grpc:
|
|
||||||
port: 8082
|
|
||||||
failureThreshold: 10
|
failureThreshold: 10
|
||||||
# initialDelaySeconds: 0
|
# initialDelaySeconds: 0
|
||||||
# periodSeconds: 10
|
# periodSeconds: 10
|
||||||
|
|
|
@ -176,8 +176,7 @@ API's you need to install the prometheus operator in your cluster.
|
||||||
| `master.*` | dict | | NFD master deployment configuration |
|
| `master.*` | dict | | NFD master deployment configuration |
|
||||||
| `master.enable` | bool | true | Specifies whether nfd-master should be deployed |
|
| `master.enable` | bool | true | Specifies whether nfd-master should be deployed |
|
||||||
| `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
|
| `master.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace |
|
||||||
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
|
| `master.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
|
||||||
| `master.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `master.port` in NFD v0.18. |
|
|
||||||
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
|
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
|
||||||
| `master.resyncPeriod` | string | | NFD API controller resync period. |
|
| `master.resyncPeriod` | string | | NFD API controller resync period. |
|
||||||
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |
|
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |
|
||||||
|
|
|
@ -20,7 +20,7 @@ import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"maps"
|
"maps"
|
||||||
"net"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
@ -31,10 +31,9 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
"google.golang.org/grpc"
|
|
||||||
"google.golang.org/grpc/health"
|
|
||||||
"google.golang.org/grpc/health/grpc_health_v1"
|
|
||||||
corev1 "k8s.io/api/core/v1"
|
corev1 "k8s.io/api/core/v1"
|
||||||
apiequality "k8s.io/apimachinery/pkg/api/equality"
|
apiequality "k8s.io/apimachinery/pkg/api/equality"
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
@ -115,18 +114,14 @@ type ConfigOverrideArgs struct {
|
||||||
|
|
||||||
// Args holds command line arguments
|
// Args holds command line arguments
|
||||||
type Args struct {
|
type Args struct {
|
||||||
ConfigFile string
|
ConfigFile string
|
||||||
Instance string
|
Instance string
|
||||||
Klog map[string]*utils.KlogFlagVal
|
Klog map[string]*utils.KlogFlagVal
|
||||||
Kubeconfig string
|
Kubeconfig string
|
||||||
Port int
|
Port int
|
||||||
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
|
|
||||||
// Could be removed when gRPC labeler service is dropped (when nfd-worker tests stop running nfd-master).
|
|
||||||
GrpcHealthPort int
|
|
||||||
Prune bool
|
Prune bool
|
||||||
Options string
|
Options string
|
||||||
EnableLeaderElection bool
|
EnableLeaderElection bool
|
||||||
MetricsPort int
|
|
||||||
|
|
||||||
Overrides ConfigOverrideArgs
|
Overrides ConfigOverrideArgs
|
||||||
}
|
}
|
||||||
|
@ -139,7 +134,6 @@ type deniedNs struct {
|
||||||
type NfdMaster interface {
|
type NfdMaster interface {
|
||||||
Run() error
|
Run() error
|
||||||
Stop()
|
Stop()
|
||||||
WaitForReady(time.Duration) bool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type nfdMaster struct {
|
type nfdMaster struct {
|
||||||
|
@ -149,10 +143,7 @@ type nfdMaster struct {
|
||||||
namespace string
|
namespace string
|
||||||
nodeName string
|
nodeName string
|
||||||
configFilePath string
|
configFilePath string
|
||||||
server *grpc.Server
|
|
||||||
healthServer *grpc.Server
|
|
||||||
stop chan struct{}
|
stop chan struct{}
|
||||||
ready chan struct{}
|
|
||||||
kubeconfig *restclient.Config
|
kubeconfig *restclient.Config
|
||||||
k8sClient k8sclient.Interface
|
k8sClient k8sclient.Interface
|
||||||
nfdClient nfdclientset.Interface
|
nfdClient nfdclientset.Interface
|
||||||
|
@ -166,7 +157,6 @@ func NewNfdMaster(opts ...NfdMasterOption) (NfdMaster, error) {
|
||||||
nfd := &nfdMaster{
|
nfd := &nfdMaster{
|
||||||
nodeName: utils.NodeName(),
|
nodeName: utils.NodeName(),
|
||||||
namespace: utils.GetKubernetesNamespace(),
|
namespace: utils.GetKubernetesNamespace(),
|
||||||
ready: make(chan struct{}),
|
|
||||||
stop: make(chan struct{}),
|
stop: make(chan struct{}),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -298,22 +288,22 @@ func (m *nfdMaster) Run() error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
httpMux := http.NewServeMux()
|
||||||
|
|
||||||
// Register to metrics server
|
// Register to metrics server
|
||||||
if m.args.MetricsPort > 0 {
|
promRegistry := prometheus.NewRegistry()
|
||||||
m := utils.CreateMetricsServer(m.args.MetricsPort,
|
promRegistry.MustRegister(
|
||||||
buildInfo,
|
buildInfo,
|
||||||
nodeUpdateRequests,
|
nodeUpdateRequests,
|
||||||
nodeUpdates,
|
nodeUpdates,
|
||||||
nodeUpdateFailures,
|
nodeUpdateFailures,
|
||||||
nodeLabelsRejected,
|
nodeLabelsRejected,
|
||||||
nodeERsRejected,
|
nodeERsRejected,
|
||||||
nodeTaintsRejected,
|
nodeTaintsRejected,
|
||||||
nfrProcessingTime,
|
nfrProcessingTime,
|
||||||
nfrProcessingErrors)
|
nfrProcessingErrors)
|
||||||
go m.Run()
|
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
|
||||||
registerVersion(version.Get())
|
registerVersion(version.Get())
|
||||||
defer m.Stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run updater that handles events from the nfd CRD API.
|
// Run updater that handles events from the nfd CRD API.
|
||||||
if m.nfdController != nil {
|
if m.nfdController != nil {
|
||||||
|
@ -324,60 +314,29 @@ func (m *nfdMaster) Run() error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start gRPC server for liveness probe (at this point we're "live")
|
// Register health probe (at this point we're "ready and live")
|
||||||
grpcErr := make(chan error)
|
httpMux.HandleFunc("/healthz", m.Healthz)
|
||||||
if m.args.GrpcHealthPort != 0 {
|
|
||||||
if err := m.startGrpcHealthServer(grpcErr); err != nil {
|
|
||||||
return fmt.Errorf("failed to start gRPC health server: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Notify that we're ready to accept connections
|
// Start HTTP server
|
||||||
close(m.ready)
|
httpServer := http.Server{Addr: fmt.Sprintf(":%d", m.args.Port), Handler: httpMux}
|
||||||
|
go func() {
|
||||||
|
klog.InfoS("http server starting", "port", httpServer.Addr)
|
||||||
|
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
|
||||||
|
}()
|
||||||
|
defer httpServer.Close()
|
||||||
|
|
||||||
// NFD-Master main event loop
|
<-m.stop
|
||||||
for {
|
klog.InfoS("shutting down nfd-master")
|
||||||
select {
|
return nil
|
||||||
case err := <-grpcErr:
|
|
||||||
return fmt.Errorf("error in serving gRPC: %w", err)
|
|
||||||
|
|
||||||
case <-m.stop:
|
|
||||||
klog.InfoS("shutting down nfd-master")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
|
func (m *nfdMaster) Healthz(writer http.ResponseWriter, _ *http.Request) {
|
||||||
// TODO: improve status checking e.g. with watchdog in the main event loop and
|
writer.WriteHeader(http.StatusOK)
|
||||||
// checking that node updater pool is alive.
|
|
||||||
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
|
|
||||||
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to listen: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
s := grpc.NewServer()
|
|
||||||
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
|
|
||||||
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
defer func() {
|
|
||||||
lis.Close()
|
|
||||||
}()
|
|
||||||
if err := s.Serve(lis); err != nil {
|
|
||||||
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
|
|
||||||
}
|
|
||||||
klog.InfoS("gRPC health server stopped")
|
|
||||||
}()
|
|
||||||
m.healthServer = s
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// nfdAPIUpdateHandler handles events from the nfd API controller.
|
// nfdAPIUpdateHandler handles events from the nfd API controller.
|
||||||
func (m *nfdMaster) nfdAPIUpdateHandler() {
|
func (m *nfdMaster) nfdAPIUpdateHandler() {
|
||||||
// We want to unconditionally update all nodes at startup if gRPC is
|
// We want to unconditionally update all nodes at startup
|
||||||
// disabled (i.e. NodeFeature API is enabled)
|
|
||||||
updateAll := true
|
updateAll := true
|
||||||
updateNodes := make(map[string]struct{})
|
updateNodes := make(map[string]struct{})
|
||||||
nodeFeatureGroup := make(map[string]struct{})
|
nodeFeatureGroup := make(map[string]struct{})
|
||||||
|
@ -431,13 +390,6 @@ func (m *nfdMaster) nfdAPIUpdateHandler() {
|
||||||
|
|
||||||
// Stop NfdMaster
|
// Stop NfdMaster
|
||||||
func (m *nfdMaster) Stop() {
|
func (m *nfdMaster) Stop() {
|
||||||
if m.server != nil {
|
|
||||||
m.server.GracefulStop()
|
|
||||||
}
|
|
||||||
if m.healthServer != nil {
|
|
||||||
m.healthServer.GracefulStop()
|
|
||||||
}
|
|
||||||
|
|
||||||
if m.nfdController != nil {
|
if m.nfdController != nil {
|
||||||
m.nfdController.stop()
|
m.nfdController.stop()
|
||||||
}
|
}
|
||||||
|
@ -447,16 +399,6 @@ func (m *nfdMaster) Stop() {
|
||||||
close(m.stop)
|
close(m.stop)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait until NfdMaster is able to accept connections.
|
|
||||||
func (m *nfdMaster) WaitForReady(timeout time.Duration) bool {
|
|
||||||
select {
|
|
||||||
case <-m.ready:
|
|
||||||
return true
|
|
||||||
case <-time.After(timeout):
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prune erases all NFD related properties from the node objects of the cluster.
|
// Prune erases all NFD related properties from the node objects of the cluster.
|
||||||
func (m *nfdMaster) prune() error {
|
func (m *nfdMaster) prune() error {
|
||||||
if m.config.NoPublish {
|
if m.config.NoPublish {
|
||||||
|
|
Loading…
Add table
Reference in a new issue