1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

nfd-topology-updater: add metrics support

For now, add only one metric, a counter for the errors occurring while
scanning pod resources on the node.
This commit is contained in:
Markus Lehtonen 2023-08-04 16:17:11 +03:00
parent 6d30ca9660
commit 06b333db1e
9 changed files with 110 additions and 0 deletions

View file

@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not publish discovered features to the cluster-local Kubernetes API server.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",

View file

@ -22,3 +22,6 @@ spec:
command:
- "nfd-topology-updater"
args: []
ports:
- name: metrics
containerPort: 8081

View file

@ -70,6 +70,10 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
ports:
- name: metrics
containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config

View file

@ -417,6 +417,7 @@ topologyUpdater:
rbac:
create: true
metricsPort: 8081
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s

View file

@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |

View file

@ -20,6 +20,7 @@ The exposed metrics are
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
## Via Kustomize

View file

@ -72,6 +72,20 @@ Example:
nfd-topology-updater -oneshot -no-publish
```
### -metrics
The `-metrics` flag specifies the port on which to expose
[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the
metrics server on nfd-topology-updater.
Default: 8081
Example:
```bash
nfd-topology-updater -metrics=12345
```
### -sleep-interval
The `-sleep-interval` specifies the interval between resource hardware

View file

@ -0,0 +1,75 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nfdtopologyupdater
import (
"fmt"
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_topology_updater_build_info"
scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
)
var (
srv *http.Server
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{
"version": version.Get(),
},
})
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: scanErrorsQuery,
Help: "Number of errors in scanning resource allocation of pods.",
})
)
// registerVersion exposes the Operator build version.
func registerVersion(version string) {
buildInfo.SetToCurrentTime()
}
// runMetricsServer starts a http server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
scanErrors)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
klog.InfoS("metrics server starting", "port", port)
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe())
}
// stopMetricsServer stops the metrics server
func stopMetricsServer() {
if srv != nil {
klog.InfoS("stopping metrics server", "port", srv.Addr)
srv.Close()
}
}

View file

@ -50,6 +50,7 @@ const (
// Args are the command line arguments
type Args struct {
MetricsPort int
NoPublish bool
Oneshot bool
KubeConfigFile string
@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}
// Register to metrics server
if w.args.MetricsPort > 0 {
go runMetricsServer(w.args.MetricsPort)
registerVersion(version.Get())
defer stopMetricsServer()
}
var resScan resourcemonitor.ResourcesScanner
resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error {
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
if err != nil {
klog.ErrorS(err, "scan failed")
scanErrors.Inc()
continue
}
zones = resAggr.Aggregate(scanResponse.PodResources)