diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 38a4011fe..a6db3bd3b 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -18,6 +18,7 @@ The exposed metrics are | `nfd_master_build_info` | Gauge | Version from which nfd-master was built | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | `nfd_node_updates_total` | Counter | Number of nodes updated +| `nfd_node_update_failures_total` | Counter | Number of node update failures | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 9b409720d..20bdbc7b8 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,9 +28,10 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" ) var ( @@ -47,6 +48,10 @@ var ( Name: nodeUpdatesQuery, Help: "Number of nodes updated by the master.", }) + nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeUpdateFailuresQuery, + Help: "Number of node update failures.", + }) nfrProcessingTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: nfrProcessingTimeQuery, @@ -70,6 +75,7 @@ func runMetricsServer(port int) { r := prometheus.NewRegistry() r.MustRegister(buildInfo, nodeUpdates, + nodeUpdateFailures, nfrProcessingTime) mux := http.NewServeMux() diff --git a/pkg/nfd-master/nfd-master.go 
b/pkg/nfd-master/nfd-master.go index c194cd176..731d0c1f6 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error { // Prune labels and extended resources err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{}) if err != nil { + nodeUpdateFailures.Inc() return fmt.Errorf("failed to prune node %q: %v", node.Name, err) } @@ -675,6 +676,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se // Create labels et al if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil { + nodeUpdateFailures.Inc() return &pb.SetLabelsReply{}, err } } diff --git a/pkg/nfd-master/node-updater-pool.go b/pkg/nfd-master/node-updater-pool.go index 8a19a91db..59429c34b 100644 --- a/pkg/nfd-master/node-updater-pool.go +++ b/pkg/nfd-master/node-updater-pool.go @@ -53,6 +53,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI return true } else { klog.ErrorS(err, "failed to update node", "nodeName", nodeName) + nodeUpdateFailures.Inc() } } queue.Forget(nodeName)