diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index a6db3bd3b..06c6c3192 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -20,6 +20,7 @@ The exposed metrics are | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_node_update_failures_total` | Counter | Number of nodes update failures | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects +| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node ## Via Kustomize diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 20bdbc7b8..63bb25b3e 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,10 +28,11 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nodeUpdateFailuresQuery = "nfd_node_update_failures_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" ) var ( @@ -63,6 +64,10 @@ var ( "node", }, ) + nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nfrProcessingErrorsQuery, + Help: "Number of errors encountered while processing NodeFeatureRule objects.", + }) ) // registerVersion exposes the Operator build version. 
@@ -76,7 +81,8 @@ func runMetricsServer(port int) { r.MustRegister(buildInfo, nodeUpdates, nodeUpdateFailures, - nfrProcessingTime) + nfrProcessingTime, + nfrProcessingErrors) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 731d0c1f6..5357e228c 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -991,6 +991,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha ruleOut, err := rule.Execute(features) if err != nil { klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName) + nfrProcessingErrors.Inc() continue } taints = append(taints, ruleOut.Taints...)