mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
nfd-topology-updater: add metrics support
For now, add only one metric, a counter for the errors occurring while scanning pod resources on the node.
This commit is contained in:
parent
6d30ca9660
commit
06b333db1e
9 changed files with 110 additions and 0 deletions
|
@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
|
|||
"Do not publish discovered features to the cluster-local Kubernetes API server.")
|
||||
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
|
||||
"Kube config file.")
|
||||
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
||||
"Port on which to expose metrics.")
|
||||
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
|
||||
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
|
||||
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
|
||||
|
|
|
@ -22,3 +22,6 @@ spec:
|
|||
command:
|
||||
- "nfd-topology-updater"
|
||||
args: []
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8081
|
||||
|
|
|
@ -70,6 +70,10 @@ spec:
|
|||
# Disable kubelet state tracking by giving an empty path
|
||||
- "-kubelet-state-dir="
|
||||
{{- end }}
|
||||
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
|
||||
volumeMounts:
|
||||
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
|
||||
- name: kubelet-config
|
||||
|
|
|
@ -417,6 +417,7 @@ topologyUpdater:
|
|||
rbac:
|
||||
create: true
|
||||
|
||||
metricsPort: 8081
|
||||
kubeletConfigPath:
|
||||
kubeletPodResourcesSockPath:
|
||||
updateInterval: 60s
|
||||
|
|
|
@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster.
|
|||
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
|
||||
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
|
||||
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
|
||||
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose Prometheus metrics |
|
||||
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
|
||||
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
|
||||
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
|
||||
|
|
|
@ -20,6 +20,7 @@ The exposed metrics are
|
|||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
||||
|
||||
## Via Kustomize
|
||||
|
||||
|
|
|
@ -72,6 +72,20 @@ Example:
|
|||
nfd-topology-updater -oneshot -no-publish
|
||||
```
|
||||
|
||||
### -metrics
|
||||
|
||||
The `-metrics` flag specifies the port on which to expose
|
||||
[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the
|
||||
metrics server on nfd-topology-updater.
|
||||
|
||||
Default: 8081
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
nfd-topology-updater -metrics=12345
|
||||
```
|
||||
|
||||
### -sleep-interval
|
||||
|
||||
The `-sleep-interval` specifies the interval between resource hardware
|
||||
|
|
75
pkg/nfd-topology-updater/metrics.go
Normal file
75
pkg/nfd-topology-updater/metrics.go
Normal file
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
Copyright 2023 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nfdtopologyupdater
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"k8s.io/klog/v2"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||
)
|
||||
|
||||
// Names of the metrics exposed by nfd-topology-updater.
//
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
	// buildInfoQuery is the name of the build information metric.
	buildInfoQuery = "nfd_topology_updater_build_info"
	// scanErrorsQuery is the name of the pod resource scan error counter.
	scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
)
|
||||
|
||||
var (
|
||||
srv *http.Server
|
||||
|
||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: buildInfoQuery,
|
||||
Help: "Version from which Node Feature Discovery was built.",
|
||||
ConstLabels: map[string]string{
|
||||
"version": version.Get(),
|
||||
},
|
||||
})
|
||||
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: scanErrorsQuery,
|
||||
Help: "Number of errors in scanning resource allocation of pods.",
|
||||
})
|
||||
)
|
||||
|
||||
// registerVersion exposes the Operator build version.
|
||||
func registerVersion(version string) {
|
||||
buildInfo.SetToCurrentTime()
|
||||
}
|
||||
|
||||
// runMetricsServer starts a http server to expose metrics
|
||||
func runMetricsServer(port int) {
|
||||
r := prometheus.NewRegistry()
|
||||
r.MustRegister(buildInfo,
|
||||
scanErrors)
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
|
||||
|
||||
klog.InfoS("metrics server starting", "port", port)
|
||||
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
|
||||
klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe())
|
||||
}
|
||||
|
||||
// stopMetricsServer stops the metrics server
|
||||
func stopMetricsServer() {
|
||||
if srv != nil {
|
||||
klog.InfoS("stopping metrics server", "port", srv.Addr)
|
||||
srv.Close()
|
||||
}
|
||||
}
|
|
@ -50,6 +50,7 @@ const (
|
|||
|
||||
// Args are the command line arguments
|
||||
type Args struct {
|
||||
MetricsPort int
|
||||
NoPublish bool
|
||||
Oneshot bool
|
||||
KubeConfigFile string
|
||||
|
@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error {
|
|||
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
|
||||
}
|
||||
|
||||
// Register to metrics server
|
||||
if w.args.MetricsPort > 0 {
|
||||
go runMetricsServer(w.args.MetricsPort)
|
||||
registerVersion(version.Get())
|
||||
defer stopMetricsServer()
|
||||
}
|
||||
|
||||
var resScan resourcemonitor.ResourcesScanner
|
||||
|
||||
resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
|
||||
|
@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error {
|
|||
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "scan failed")
|
||||
scanErrors.Inc()
|
||||
continue
|
||||
}
|
||||
zones = resAggr.Aggregate(scanResponse.PodResources)
|
||||
|
|
Loading…
Reference in a new issue