mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-05 16:27:05 +00:00
nfd-topology-updater: serve metrics and healthz on the same port
Changes the gRPC health endpoint to plain http. At the same time starts serving both the metrics and healthz endpoints on a single port. Replaces the -metrics and -grpc-health command line flags with a single -port flag. Changes the Helm and kustomize deployments correspondingly.
This commit is contained in:
parent
4f24a38ad4
commit
0b9a8cf120
6 changed files with 43 additions and 78 deletions
|
@ -107,10 +107,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
|
|||
"Do not create or update NodeResourceTopology objects.")
|
||||
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
|
||||
"Kube config file.")
|
||||
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
||||
"Port on which to expose metrics.")
|
||||
flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082,
|
||||
"Port on which to expose the grpc health endpoint.")
|
||||
flagset.IntVar(&args.Port, "port", 8080,
|
||||
"Port which metrics and healthz endpoints are served on")
|
||||
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
|
||||
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
|
||||
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
|
||||
|
|
|
@ -20,13 +20,15 @@ spec:
|
|||
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
|
||||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
grpc:
|
||||
port: 8082
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
grpc:
|
||||
port: 8082
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
failureThreshold: 10
|
||||
|
@ -41,5 +43,5 @@ spec:
|
|||
cpu: 50m
|
||||
memory: 40Mi
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8081
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
|
|
|
@ -45,8 +45,9 @@ spec:
|
|||
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
|
||||
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
|
||||
livenessProbe:
|
||||
grpc:
|
||||
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
{{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }}
|
||||
initialDelaySeconds: {{ . }}
|
||||
{{- end }}
|
||||
|
@ -60,8 +61,9 @@ spec:
|
|||
timeoutSeconds: {{ . }}
|
||||
{{- end }}
|
||||
readinessProbe:
|
||||
grpc:
|
||||
port: {{ .Values.topologyUpdater.healthPort | default "8082" }}
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
{{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }}
|
||||
initialDelaySeconds: {{ . }}
|
||||
{{- end }}
|
||||
|
@ -113,16 +115,13 @@ spec:
|
|||
# Disable kubelet state tracking by giving an empty path
|
||||
- "-kubelet-state-dir="
|
||||
{{- end }}
|
||||
- "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}"
|
||||
- "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}"
|
||||
- "-port={{ .Values.topologyUpdater.port | default "8080"}}"
|
||||
{{- with .Values.topologyUpdater.extraArgs }}
|
||||
{{- toYaml . | nindent 10 }}
|
||||
{{- end }}
|
||||
ports:
|
||||
- containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
|
||||
name: metrics
|
||||
- containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }}
|
||||
name: health
|
||||
- containerPort: {{ .Values.topologyUpdater.port | default "8080"}}
|
||||
name: http
|
||||
volumeMounts:
|
||||
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
|
||||
- name: kubelet-config
|
||||
|
|
|
@ -509,8 +509,7 @@ topologyUpdater:
|
|||
rbac:
|
||||
create: true
|
||||
|
||||
metricsPort: 8081
|
||||
healthPort: 8082
|
||||
port: 8080
|
||||
kubeletConfigPath:
|
||||
kubeletPodResourcesSockPath:
|
||||
updateInterval: 60s
|
||||
|
@ -524,17 +523,13 @@ topologyUpdater:
|
|||
drop: [ "ALL" ]
|
||||
readOnlyRootFilesystem: true
|
||||
runAsUser: 0
|
||||
|
||||
|
||||
livenessProbe:
|
||||
grpc:
|
||||
port: 8082
|
||||
initialDelaySeconds: 10
|
||||
# failureThreshold: 3
|
||||
# periodSeconds: 10
|
||||
# timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
grpc:
|
||||
port: 8082
|
||||
initialDelaySeconds: 5
|
||||
failureThreshold: 10
|
||||
# periodSeconds: 10
|
||||
|
|
|
@ -275,8 +275,7 @@ API's you need to install the prometheus operator in your cluster.
|
|||
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
|
||||
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
|
||||
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
|
||||
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
|
||||
| `topologyUpdater.healthPort` | integer | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `topologyUpdater.port` in NFD v0.18. |
|
||||
| `topologyUpdater.port` | integer | 8080 | Port on which to serve http for metrics and healthz endpoints. |
|
||||
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
|
||||
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
|
||||
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
|
||||
|
|
|
@ -18,16 +18,13 @@ package nfdtopologyupdater
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/health"
|
||||
"google.golang.org/grpc/health/grpc_health_v1"
|
||||
"k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
|
@ -37,6 +34,7 @@ import (
|
|||
|
||||
"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
|
||||
topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/nfd-topology-updater/kubeletnotifier"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/podres"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/resourcemonitor"
|
||||
|
@ -56,13 +54,12 @@ const (
|
|||
|
||||
// Args are the command line arguments
|
||||
type Args struct {
|
||||
MetricsPort int
|
||||
Port int
|
||||
NoPublish bool
|
||||
Oneshot bool
|
||||
KubeConfigFile string
|
||||
ConfigFile string
|
||||
KubeletStateDir string
|
||||
GrpcHealthPort int
|
||||
|
||||
Klog map[string]*utils.KlogFlagVal
|
||||
}
|
||||
|
@ -90,7 +87,6 @@ type nfdTopologyUpdater struct {
|
|||
ownerRefs []metav1.OwnerReference
|
||||
k8sClient k8sclient.Interface
|
||||
kubeletConfigFunc func() (*kubeletconfigv1beta1.KubeletConfiguration, error)
|
||||
healthServer *grpc.Server
|
||||
}
|
||||
|
||||
// NewTopologyUpdater creates a new NfdTopologyUpdater instance.
|
||||
|
@ -134,27 +130,8 @@ func (w *nfdTopologyUpdater) detectTopologyPolicyAndScope() (string, string, err
|
|||
return klConfig.TopologyManagerPolicy, klConfig.TopologyManagerScope, nil
|
||||
}
|
||||
|
||||
func (w *nfdTopologyUpdater) startGrpcHealthServer(errChan chan<- error) error {
|
||||
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to listen: %w", err)
|
||||
}
|
||||
|
||||
s := grpc.NewServer()
|
||||
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
|
||||
klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
lis.Close()
|
||||
}()
|
||||
if err := s.Serve(lis); err != nil {
|
||||
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
|
||||
}
|
||||
klog.InfoS("gRPC health server stopped")
|
||||
}()
|
||||
w.healthServer = s
|
||||
return nil
|
||||
func (w *nfdTopologyUpdater) Healthz(writer http.ResponseWriter, _ *http.Request) {
|
||||
writer.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
// Run nfdTopologyUpdater. Returns if a fatal error is encountered, or, after
|
||||
|
@ -187,15 +164,14 @@ func (w *nfdTopologyUpdater) Run() error {
|
|||
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
|
||||
}
|
||||
|
||||
httpMux := http.NewServeMux()
|
||||
|
||||
// Register to metrics server
|
||||
if w.args.MetricsPort > 0 {
|
||||
m := utils.CreateMetricsServer(w.args.MetricsPort,
|
||||
buildInfo,
|
||||
scanErrors)
|
||||
go m.Run()
|
||||
registerVersion(version.Get())
|
||||
defer m.Stop()
|
||||
}
|
||||
promRegistry := prometheus.NewRegistry()
|
||||
promRegistry.MustRegister(
|
||||
buildInfo,
|
||||
scanErrors)
|
||||
registerVersion(version.Get())
|
||||
|
||||
var resScan resourcemonitor.ResourcesScanner
|
||||
|
||||
|
@ -215,20 +191,19 @@ func (w *nfdTopologyUpdater) Run() error {
|
|||
return fmt.Errorf("failed to obtain node resource information: %w", err)
|
||||
}
|
||||
|
||||
grpcErr := make(chan error)
|
||||
// Register health probe (at this point we're "ready and live")
|
||||
httpMux.HandleFunc("/healthz", w.Healthz)
|
||||
|
||||
// Start gRPC server for liveness probe (at this point we're "live")
|
||||
if w.args.GrpcHealthPort != 0 {
|
||||
if err := w.startGrpcHealthServer(grpcErr); err != nil {
|
||||
return fmt.Errorf("failed to start gRPC health server: %w", err)
|
||||
}
|
||||
}
|
||||
// Start HTTP server
|
||||
httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}
|
||||
go func() {
|
||||
klog.InfoS("http server starting", "port", httpServer.Addr)
|
||||
klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe())
|
||||
}()
|
||||
defer httpServer.Close()
|
||||
|
||||
for {
|
||||
select {
|
||||
case err := <-grpcErr:
|
||||
return fmt.Errorf("error in serving gRPC: %w", err)
|
||||
|
||||
case info := <-w.eventSource:
|
||||
klog.V(4).InfoS("event received, scanning...", "event", info.Event)
|
||||
scanResponse, err := resScan.Scan()
|
||||
|
@ -257,9 +232,6 @@ func (w *nfdTopologyUpdater) Run() error {
|
|||
|
||||
case <-w.stop:
|
||||
klog.InfoS("shutting down nfd-topology-updater")
|
||||
if w.healthServer != nil {
|
||||
w.healthServer.GracefulStop()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue