From a8a29e6df22c4391609fb4d49c0f5d75f9a13d36 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 1 Aug 2023 15:50:03 +0300 Subject: [PATCH] metrics: add nfd_nodefeaturerule_processing_errors_total counter Add a counter for errors encountered when processing NodeFeatureRules. Another simple counter without any additional prometheus labels - nfd-master logs can provide further details. --- docs/deployment/metrics.md | 1 + pkg/nfd-master/metrics.go | 16 +++++++++++----- pkg/nfd-master/nfd-master.go | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index a6db3bd3b..06c6c3192 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -20,6 +20,7 @@ The exposed metrics are | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_node_update_failures_total` | Counter | Number of nodes update failures | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects +| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node ## Via Kustomize diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 20bdbc7b8..63bb25b3e 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,10 +28,11 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nodeUpdateFailuresQuery = "nfd_node_update_failures_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nfrProcessingTimeQuery = 
"nfd_nodefeaturerule_processing_duration_seconds" + nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" ) var ( @@ -63,6 +64,10 @@ var ( "node", }, ) + nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nfrProcessingErrorsQuery, + Help: "Number of errors encountered while processing NodeFeatureRule objects.", + }) ) // registerVersion exposes the Operator build version. @@ -76,7 +81,8 @@ func runMetricsServer(port int) { r.MustRegister(buildInfo, nodeUpdates, nodeUpdateFailures, - nfrProcessingTime) + nfrProcessingTime, + nfrProcessingErrors) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 731d0c1f6..5357e228c 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -991,6 +991,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha ruleOut, err := rule.Execute(features) if err != nil { klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName) + nfrProcessingErrors.Inc() continue } taints = append(taints, ruleOut.Taints...)