diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index acc962e51..6e80c41c4 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -17,8 +17,14 @@ The exposed metrics are | ------------------------------------------------- | --------- | --------------------------------------- | `nfd_master_build_info` | Gauge | Version from which nfd-master was built | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built +| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master | `nfd_node_updates_total` | Counter | Number of nodes updated +| `nfd_node_update_failures_total` | Counter | Number of node update failures +| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master +| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master +| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects +| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. 
diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 9b409720d..c64842a7a 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,9 +28,15 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdateRequestsQuery = "nfd_node_update_requests_total" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total" + nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total" + nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total" + nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" ) var ( @@ -43,10 +49,30 @@ var ( "version": version.Get(), }, }) + nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeUpdateRequestsQuery, + Help: "Number of node update requests processed by the master.", + }) nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{ Name: nodeUpdatesQuery, Help: "Number of nodes updated by the master.", }) + nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeUpdateFailuresQuery, + Help: "Number of node update failures.", + }) + nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeLabelsRejectedQuery, + Help: "Number of node labels that were rejected by nfd-master.", + }) + nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeERsRejectedQuery, + Help: "Number of node extended resources that were rejected by nfd-master.", + }) + nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeTaintsRejectedQuery, + 
Help: "Number of node taints that were rejected by nfd-master.", + }) nfrProcessingTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: nfrProcessingTimeQuery, @@ -58,6 +84,10 @@ var ( "node", }, ) + nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nfrProcessingErrorsQuery, + Help: "Number of errors encountered while processing NodeFeatureRule objects.", + }) ) // registerVersion exposes the Operator build version. @@ -68,9 +98,16 @@ func registerVersion(version string) { // runMetricsServer starts a http server to expose metrics func runMetricsServer(port int) { r := prometheus.NewRegistry() - r.MustRegister(buildInfo, + r.MustRegister( + buildInfo, + nodeUpdateRequests, nodeUpdates, - nfrProcessingTime) + nodeUpdateFailures, + nodeLabelsRejected, + nodeERsRejected, + nodeTaintsRejected, + nfrProcessingTime, + nfrProcessingErrors) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index c194cd176..9f283398c 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error { // Prune labels and extended resources err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{}) if err != nil { + nodeUpdateFailures.Inc() return fmt.Errorf("failed to prune node %q: %v", node.Name, err) } @@ -509,6 +510,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea if value, err := m.filterFeatureLabel(name, value, features); err != nil { klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value) + nodeLabelsRejected.Inc() } else { outLabels[name] = value } @@ -522,6 +524,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea if value, ok := outLabels[extendedResourceName]; ok { if _, err := strconv.Atoi(value); err != nil { 
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value) + nodeERsRejected.Inc() continue // non-numeric label can't be used } @@ -602,6 +605,7 @@ func filterTaints(taints []corev1.Taint) []corev1.Taint { for _, taint := range taints { if err := filterTaint(&taint); err != nil { klog.ErrorS(err, "ignoring taint", "taint", taint) + nodeTaintsRejected.Inc() } else { outTaints = append(outTaints, taint) } @@ -650,6 +654,7 @@ func isNamespaceDenied(labelNs string, wildcardDeniedNs map[string]struct{}, nor // SetLabels implements LabelerServer func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) { + nodeUpdateRequests.Inc() err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName) if err != nil { klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName) @@ -675,6 +680,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se // Create labels et al if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil { + nodeUpdateFailures.Inc() return &pb.SetLabelsReply{}, err } } @@ -784,6 +790,7 @@ func filterExtendedResources(features *nfdv1alpha1.Features, extendedResources E capacity, err := filterExtendedResource(name, value, features) if err != nil { klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value) + nodeERsRejected.Inc() } else { outExtendedResources[name] = capacity } @@ -989,6 +996,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha ruleOut, err := rule.Execute(features) if err != nil { klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName) + nfrProcessingErrors.Inc() continue } taints = append(taints, ruleOut.Taints...) 
diff --git a/pkg/nfd-master/node-updater-pool.go b/pkg/nfd-master/node-updater-pool.go index 8a19a91db..55ce2a712 100644 --- a/pkg/nfd-master/node-updater-pool.go +++ b/pkg/nfd-master/node-updater-pool.go @@ -46,6 +46,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI defer queue.Done(nodeName) + nodeUpdateRequests.Inc() if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil { if queue.NumRequeues(nodeName) < 5 { klog.InfoS("retrying node update", "nodeName", nodeName) @@ -53,6 +54,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI return true } else { klog.ErrorS(err, "failed to update node", "nodeName", nodeName) + nodeUpdateFailures.Inc() } } queue.Forget(nodeName)