From 98c3b0750d51ec4296cef13495f5221c91713426 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Mon, 9 Oct 2023 13:06:07 +0300 Subject: [PATCH] nfd-gc: add metrics Implements three metrics for nfd-gc: - nfd_gc_build_info: version information of nfd-gc. - nfd_gc_objects_deleted_total: total number of NodeFeature and NodeResourceTopology objects deleted by nfd-gc. - nfd_gc_object_delete_failures_total: number of errors encountered when deleting NodeFeature and NodeResourceTopology objects. --- cmd/nfd-gc/main.go | 2 + deployment/base/gc/gc.yaml | 3 ++ .../templates/nfd-gc.yaml | 3 ++ .../helm/node-feature-discovery/values.yaml | 2 + docs/deployment/helm.md | 1 + docs/deployment/metrics.md | 3 ++ pkg/nfd-gc/metrics.go | 54 +++++++++++++++++++ pkg/nfd-gc/nfd-gc.go | 24 +++++++-- 8 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 pkg/nfd-gc/metrics.go diff --git a/cmd/nfd-gc/main.go b/cmd/nfd-gc/main.go index a47488d16..6093ac761 100644 --- a/cmd/nfd-gc/main.go +++ b/cmd/nfd-gc/main.go @@ -83,6 +83,8 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args { "interval between cleanup of obsolete api objects") flagset.StringVar(&args.Kubeconfig, "kubeconfig", "", "Kubeconfig to use") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") klog.InitFlags(flagset) diff --git a/deployment/base/gc/gc.yaml b/deployment/base/gc/gc.yaml index bbae4aa5c..42591b60b 100644 --- a/deployment/base/gc/gc.yaml +++ b/deployment/base/gc/gc.yaml @@ -21,3 +21,6 @@ spec: imagePullPolicy: Always command: - "nfd-gc" + ports: + - name: metrics + containerPort: 8081 diff --git a/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml b/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml index d803eef40..1e0e12327 100644 --- a/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml +++ b/deployment/helm/node-feature-discovery/templates/nfd-gc.yaml @@ -58,6 +58,9 @@ spec: drop: [ "ALL" ] readOnlyRootFilesystem: 
true runAsNonRoot: true + ports: + - name: metrics + containerPort: {{ .Values.gc.metricsPort | default "8081"}} {{- with .Values.gc.nodeSelector }} nodeSelector: diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index d735b9100..f48b177a6 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -495,6 +495,8 @@ gc: # cpu: 100m # memory: 128Mi + metricsPort: 8081 + nodeSelector: {} tolerations: [] annotations: {} diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index b0c7528bc..d6b288796 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -203,6 +203,7 @@ API's you need to install the prometheus operator in your cluster. | `gc.interval` | string | 1h | Time between periodic garbage collector runs | `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings | `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) +| `gc.metricsPort` | integer | 8081 | Port on which to serve Prometheus metrics | `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) | `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) | `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 3fb6c8849..8680b9c29 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -17,6 +17,7 @@ The 
exposed metrics are | ------------------------------------------------- | --------- | --------------------------------------- | `nfd_master_build_info` | Gauge | Version from which nfd-master was built | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built +| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built | `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built | `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC | `nfd_node_updates_total` | Counter | Number of nodes updated @@ -28,6 +29,8 @@ The exposed metrics are | `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. +| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. +| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. ## Via Kustomize diff --git a/pkg/nfd-gc/metrics.go b/pkg/nfd-gc/metrics.go new file mode 100644 index 000000000..bc884808d --- /dev/null +++ b/pkg/nfd-gc/metrics.go @@ -0,0 +1,54 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nfdgarbagecollector + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_gc_build_info" + objectsDeletedQuery = "nfd_gc_objects_deleted_total" + objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total" +) + +var ( + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) + objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: objectsDeletedQuery, + Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."}, + []string{"kind"}, + ) + objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: objectDeleteErrorsQuery, + Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."}, + []string{"kind"}, + ) +) + +// registerVersion sets the nfd-gc build info gauge to the current time; the version argument is unused because the version is attached to the gauge as a constant label. 
+func registerVersion(version string) { + buildInfo.SetToCurrentTime() +} diff --git a/pkg/nfd-gc/nfd-gc.go b/pkg/nfd-gc/nfd-gc.go index 707c4af83..ac48805ab 100644 --- a/pkg/nfd-gc/nfd-gc.go +++ b/pkg/nfd-gc/nfd-gc.go @@ -34,13 +34,15 @@ import ( "sigs.k8s.io/node-feature-discovery/pkg/apihelper" nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1" nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" + "sigs.k8s.io/node-feature-discovery/pkg/utils" + "sigs.k8s.io/node-feature-discovery/pkg/version" ) // Args are the command line arguments type Args struct { - GCPeriod time.Duration - - Kubeconfig string + GCPeriod time.Duration + Kubeconfig string + MetricsPort int } type NfdGarbageCollector interface { @@ -74,29 +76,35 @@ func New(args *Args) (NfdGarbageCollector, error) { } func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) { + kind := "NodeFeature" if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil { if errors.IsNotFound(err) { klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name)) return } else { klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name)) + objectDeleteErrors.WithLabelValues(kind).Inc() return } } klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name)) + objectsDeleted.WithLabelValues(kind).Inc() } func (n *nfdGarbageCollector) deleteNRT(nodeName string) { + kind := "NodeResourceTopology" if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil { if errors.IsNotFound(err) { klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName) return } else { klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName) + 
objectDeleteErrors.WithLabelValues(kind).Inc() return } } klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName) + objectsDeleted.WithLabelValues(kind).Inc() } func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) { @@ -208,6 +216,16 @@ func (n *nfdGarbageCollector) startNodeInformer() error { // Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed func (n *nfdGarbageCollector) Run() error { + if n.args.MetricsPort > 0 { + m := utils.CreateMetricsServer(n.args.MetricsPort, + buildInfo, + objectsDeleted, + objectDeleteErrors) + go m.Run() + registerVersion(version.Get()) + defer m.Stop() + } + if err := n.startNodeInformer(); err != nil { return err }