mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
nfd-topology-updater: add metrics support
For now, add only one metric, a counter for the errors occurring while scanning pod resources on the node.
This commit is contained in:
parent
6d30ca9660
commit
06b333db1e
9 changed files with 110 additions and 0 deletions
|
@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
|
||||||
"Do not publish discovered features to the cluster-local Kubernetes API server.")
|
"Do not publish discovered features to the cluster-local Kubernetes API server.")
|
||||||
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
|
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
|
||||||
"Kube config file.")
|
"Kube config file.")
|
||||||
|
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
||||||
|
"Port on which to expose metrics.")
|
||||||
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
|
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
|
||||||
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
|
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
|
||||||
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
|
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
|
||||||
|
|
|
@ -22,3 +22,6 @@ spec:
|
||||||
command:
|
command:
|
||||||
- "nfd-topology-updater"
|
- "nfd-topology-updater"
|
||||||
args: []
|
args: []
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: 8081
|
||||||
|
|
|
@ -70,6 +70,10 @@ spec:
|
||||||
# Disable kubelet state tracking by giving an empty path
|
# Disable kubelet state tracking by giving an empty path
|
||||||
- "-kubelet-state-dir="
|
- "-kubelet-state-dir="
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
|
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
|
||||||
- name: kubelet-config
|
- name: kubelet-config
|
||||||
|
|
|
@ -417,6 +417,7 @@ topologyUpdater:
|
||||||
rbac:
|
rbac:
|
||||||
create: true
|
create: true
|
||||||
|
|
||||||
|
metricsPort: 8081
|
||||||
kubeletConfigPath:
|
kubeletConfigPath:
|
||||||
kubeletPodResourcesSockPath:
|
kubeletPodResourcesSockPath:
|
||||||
updateInterval: 60s
|
updateInterval: 60s
|
||||||
|
|
|
@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster.
|
||||||
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
|
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
|
||||||
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
|
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
|
||||||
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
|
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
|
||||||
|
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics |
|
||||||
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
|
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
|
||||||
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
|
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
|
||||||
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
|
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
|
||||||
|
|
|
@ -20,6 +20,7 @@ The exposed metrics are
|
||||||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
||||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||||
|
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
||||||
|
|
||||||
## Via Kustomize
|
## Via Kustomize
|
||||||
|
|
||||||
|
|
|
@ -72,6 +72,20 @@ Example:
|
||||||
nfd-topology-updater -oneshot -no-publish
|
nfd-topology-updater -oneshot -no-publish
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### -metrics
|
||||||
|
|
||||||
|
The `-metrics` flag specifies the port on which to expose
|
||||||
|
[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the
|
||||||
|
metrics server on nfd-topology-updater.
|
||||||
|
|
||||||
|
Default: 8081
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nfd-topology-updater -metrics=12345
|
||||||
|
```
|
||||||
|
|
||||||
### -sleep-interval
|
### -sleep-interval
|
||||||
|
|
||||||
The `-sleep-interval` specifies the interval between resource hardware
|
The `-sleep-interval` specifies the interval between resource hardware
|
||||||
|
|
75
pkg/nfd-topology-updater/metrics.go
Normal file
75
pkg/nfd-topology-updater/metrics.go
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
/*
|
||||||
|
Copyright 2023 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package nfdtopologyupdater
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
|
"k8s.io/klog/v2"
|
||||||
|
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Names of the Prometheus metrics exported by nfd-topology-updater.
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
|
const (
|
||||||
|
// buildInfoQuery is the metric name given to the buildInfo gauge.
buildInfoQuery = "nfd_topology_updater_build_info"
|
||||||
|
// scanErrorsQuery is the metric name given to the scanErrors counter.
scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Package-level state of the metrics server and the collectors it exposes.
var (
|
||||||
|
// srv is the HTTP server created by runMetricsServer and closed by
// stopMetricsServer; it stays nil until runMetricsServer has assigned it.
srv *http.Server
|
||||||
|
|
||||||
|
// buildInfo is a gauge carrying the build version as the constant label
// "version"; registerVersion sets its value to the current time.
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
|
ConstLabels: map[string]string{
|
||||||
|
"version": version.Get(),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
// scanErrors counts failed pod resource scans; it is incremented by the
// topology updater each time a scan returns an error.
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: scanErrorsQuery,
|
||||||
|
Help: "Number of errors in scanning resource allocation of pods.",
|
||||||
|
})
|
||||||
|
)
|
||||||
|
|
||||||
|
// registerVersion publishes the build info metric by setting the buildInfo
// gauge to the current time. The version argument is not used by this
// function: the "version" label is attached to buildInfo as a constant label
// when the gauge is created.
|
||||||
|
func registerVersion(version string) {
|
||||||
|
buildInfo.SetToCurrentTime()
|
||||||
|
}
|
||||||
|
|
||||||
|
// runMetricsServer starts a http server to expose metrics
|
||||||
|
func runMetricsServer(port int) {
|
||||||
|
r := prometheus.NewRegistry()
|
||||||
|
r.MustRegister(buildInfo,
|
||||||
|
scanErrors)
|
||||||
|
mux := http.NewServeMux()
|
||||||
|
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
|
||||||
|
|
||||||
|
klog.InfoS("metrics server starting", "port", port)
|
||||||
|
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
|
||||||
|
klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe())
|
||||||
|
}
|
||||||
|
|
||||||
|
// stopMetricsServer stops the metrics server
|
||||||
|
func stopMetricsServer() {
|
||||||
|
if srv != nil {
|
||||||
|
klog.InfoS("stopping metrics server", "port", srv.Addr)
|
||||||
|
srv.Close()
|
||||||
|
}
|
||||||
|
}
|
|
@ -50,6 +50,7 @@ const (
|
||||||
|
|
||||||
// Args are the command line arguments
|
// Args are the command line arguments
|
||||||
type Args struct {
|
type Args struct {
|
||||||
|
MetricsPort int
|
||||||
NoPublish bool
|
NoPublish bool
|
||||||
Oneshot bool
|
Oneshot bool
|
||||||
KubeConfigFile string
|
KubeConfigFile string
|
||||||
|
@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error {
|
||||||
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
|
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Register to metrics server
|
||||||
|
if w.args.MetricsPort > 0 {
|
||||||
|
go runMetricsServer(w.args.MetricsPort)
|
||||||
|
registerVersion(version.Get())
|
||||||
|
defer stopMetricsServer()
|
||||||
|
}
|
||||||
|
|
||||||
var resScan resourcemonitor.ResourcesScanner
|
var resScan resourcemonitor.ResourcesScanner
|
||||||
|
|
||||||
resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
|
resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
|
||||||
|
@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error {
|
||||||
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
|
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.ErrorS(err, "scan failed")
|
klog.ErrorS(err, "scan failed")
|
||||||
|
scanErrors.Inc()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
zones = resAggr.Aggregate(scanResponse.PodResources)
|
zones = resAggr.Aggregate(scanResponse.PodResources)
|
||||||
|
|
Loading…
Reference in a new issue