diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 61d3baaca..d0edd59a7 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -13,24 +13,25 @@ By default NFD Master and Worker expose metrics on port 8081. The exposed metrics are -| Metric | Type | Description | -| ------------------------------------------------- | --------- | ------------------------------------------------------- | -| `nfd_master_build_info` | Gauge | Version from which nfd-master was built | -| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | -| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built | -| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built | -| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC | -| `nfd_node_updates_total` | Counter | Number of nodes updated | -| `nfd_node_update_failures_total` | Counter | Number of nodes update failures | -| `nfd_node_labels_rejected_total` | Counter | Number of nodes labels rejected by nfd-master | -| `nfd_node_extendedresources_rejected_total` | Counter | Number of nodes extended resources rejected by nfd-master | -| `nfd_node_taints_rejected_total` | Counter | Number of nodes taints rejected by nfd-master | -| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | -| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects | -| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | -| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. | -| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. 
| -| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. | +| Metric | Type | Description | +| -------------------------------------------------------- | --------- | -------------------------------------------------------------------------- | +| `nfd_master_build_info` | Gauge | Version from which nfd-master was built | +| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | +| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built | +| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built | +| `nfd_master_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC | +| `nfd_master_node_updates_total` | Counter | Number of nodes updated | +| `nfd_master_node_feature_group_update_requests_total` | Counter | Number of cluster feature update requests processed by the master | +| `nfd_master_node_update_failures_total` | Counter | Number of node update failures | +| `nfd_master_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master | +| `nfd_master_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master | +| `nfd_master_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master | +| `nfd_master_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | +| `nfd_master_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects | +| `nfd_worker_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | +| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. 
| +| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. | +| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. | ## Kustomize diff --git a/examples/grafana-dashboard.json b/examples/grafana-dashboard.json index d086a138a..e9cd95dec 100644 --- a/examples/grafana-dashboard.json +++ b/examples/grafana-dashboard.json @@ -391,7 +391,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "nfd_node_updates_total", + "expr": "nfd_master_node_updates_total", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -586,7 +586,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "nfd_node_update_failures_total", + "expr": "nfd_master_node_update_failures_total", "legendFormat": "total", "range": true, "refId": "A" @@ -679,7 +679,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "nfd_nodefeaturerule_processing_errors_total", + "expr": "nfd_master_nodefeaturerule_processing_errors_total", "hide": false, "legendFormat": "total", "range": true, @@ -940,7 +940,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "sum by(le) (nfd_feature_discovery_duration_seconds_bucket)", + "expr": "sum by(le) (nfd_worker_feature_discovery_duration_seconds_bucket)", "format": "heatmap", "legendFormat": "__auto", "range": true, @@ -1007,7 +1007,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "sum by(le) (nfd_nodefeaturerule_processing_duration_seconds_bucket)", + "expr": "sum by(le) (nfd_master_nodefeaturerule_processing_duration_seconds_bucket)", "format": "heatmap", "legendFormat": "__auto", "range": true, @@ -1101,7 +1101,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "nfd_node_labels_rejected_total", + "expr": "nfd_master_node_labels_rejected_total", "legendFormat": "total", "range": true, "refId": "A" @@ -1194,7 +1194,7 @@ "uid": "prometheus" }, "editorMode": 
"builder", - "expr": "nfd_node_extendedresources_rejected_total", + "expr": "nfd_master_node_extendedresources_rejected_total", "hide": false, "legendFormat": "total", "range": true, @@ -1288,7 +1288,7 @@ "uid": "prometheus" }, "editorMode": "builder", - "expr": "nfd_node_taints_rejected_total", + "expr": "nfd_master_node_taints_rejected_total", "hide": false, "legendFormat": "total", "range": true, diff --git a/pkg/nfd-gc/metrics.go b/pkg/nfd-gc/metrics.go index bc884808d..f10ddc02f 100644 --- a/pkg/nfd-gc/metrics.go +++ b/pkg/nfd-gc/metrics.go @@ -23,27 +23,35 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_gc_build_info" - objectsDeletedQuery = "nfd_gc_objects_deleted_total" - objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total" + buildInfoQuery = "build_info" + objectsDeletedQuery = "objects_deleted_total" + objectDeleteErrorsQuery = "object_delete_failures_total" +) + +const ( + // nfdGCPrefix - subsystem name used by nfd gc. 
+ nfdGCPrefix = "nfd_gc" ) var ( buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: buildInfoQuery, - Help: "Version from which Node Feature Discovery was built.", + Subsystem: nfdGCPrefix, + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", ConstLabels: map[string]string{ "version": version.Get(), }, }) objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: objectsDeletedQuery, - Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."}, + Subsystem: nfdGCPrefix, + Name: objectsDeletedQuery, + Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."}, []string{"kind"}, ) objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: objectDeleteErrorsQuery, - Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."}, + Subsystem: nfdGCPrefix, + Name: objectDeleteErrorsQuery, + Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."}, []string{"kind"}, ) ) diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 20335f6f0..7951432ca 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -23,59 +23,73 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdateRequestsQuery = "nfd_node_update_requests_total" - nodeUpdatesQuery = "nfd_node_updates_total" - nodeFeatureGroupUpdateRequestsQuery = "nfd_node_feature_group_update_requests_total" - nodeUpdateFailuresQuery = "nfd_node_update_failures_total" - nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total" - nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total" - nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" - nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" + 
buildInfoQuery = "build_info" + nodeUpdateRequestsQuery = "node_update_requests_total" + nodeUpdatesQuery = "node_updates_total" + nodeFeatureGroupUpdateRequestsQuery = "node_feature_group_update_requests_total" + nodeUpdateFailuresQuery = "node_update_failures_total" + nodeLabelsRejectedQuery = "node_labels_rejected_total" + nodeERsRejectedQuery = "node_extendedresources_rejected_total" + nodeTaintsRejectedQuery = "node_taints_rejected_total" + nfrProcessingTimeQuery = "nodefeaturerule_processing_duration_seconds" + nfrProcessingErrorsQuery = "nodefeaturerule_processing_errors_total" +) + +const ( + // nfdMasterPrefix - subsystem name used by nfd master. + nfdMasterPrefix = "nfd_master" ) var ( buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: buildInfoQuery, - Help: "Version from which Node Feature Discovery was built.", + Subsystem: nfdMasterPrefix, + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", ConstLabels: map[string]string{ "version": version.Get(), }, }) nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeUpdateRequestsQuery, - Help: "Number of node update requests processed by the master.", + Subsystem: nfdMasterPrefix, + Name: nodeUpdateRequestsQuery, + Help: "Number of node update requests processed by the master.", }) nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeFeatureGroupUpdateRequestsQuery, - Help: "Number of cluster feature update requests processed by the master.", + Subsystem: nfdMasterPrefix, + Name: nodeFeatureGroupUpdateRequestsQuery, + Help: "Number of cluster feature update requests processed by the master.", }) nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeUpdatesQuery, - Help: "Number of nodes updated by the master.", + Subsystem: nfdMasterPrefix, + Name: nodeUpdatesQuery, + Help: "Number of nodes updated by the master.", }) nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{ 
- Name: nodeUpdateFailuresQuery, - Help: "Number of node update failures.", + Subsystem: nfdMasterPrefix, + Name: nodeUpdateFailuresQuery, + Help: "Number of node update failures.", }) nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeLabelsRejectedQuery, - Help: "Number of node labels that were rejected by nfd-master.", + Subsystem: nfdMasterPrefix, + Name: nodeLabelsRejectedQuery, + Help: "Number of node labels that were rejected by nfd-master.", }) nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeERsRejectedQuery, - Help: "Number of node extended resources that were rejected by nfd-master.", + Subsystem: nfdMasterPrefix, + Name: nodeERsRejectedQuery, + Help: "Number of node extended resources that were rejected by nfd-master.", }) nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nodeTaintsRejectedQuery, - Help: "Number of node taints that were rejected by nfd-master.", + Subsystem: nfdMasterPrefix, + Name: nodeTaintsRejectedQuery, + Help: "Number of node taints that were rejected by nfd-master.", }) nfrProcessingTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: nfrProcessingTimeQuery, - Help: "Time processing time of NodeFeatureRule objects.", - Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01}, + Subsystem: nfdMasterPrefix, + Name: nfrProcessingTimeQuery, + Help: "Time processing time of NodeFeatureRule objects.", + Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01}, }, []string{ "name", @@ -83,8 +97,9 @@ var ( }, ) nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ - Name: nfrProcessingErrorsQuery, - Help: "Number of errors encountered while processing NodeFeatureRule objects.", + Subsystem: nfdMasterPrefix, + Name: nfrProcessingErrorsQuery, + Help: "Number of errors encountered while processing NodeFeatureRule objects.", }) ) diff --git a/pkg/nfd-topology-updater/metrics.go 
b/pkg/nfd-topology-updater/metrics.go index 978667bb2..ab62c6e62 100644 --- a/pkg/nfd-topology-updater/metrics.go +++ b/pkg/nfd-topology-updater/metrics.go @@ -23,21 +23,28 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_topology_updater_build_info" - scanErrorsQuery = "nfd_topology_updater_scan_errors_total" + buildInfoQuery = "build_info" + scanErrorsQuery = "scan_errors_total" +) + +const ( + // nfdTopologyUpdaterPrefix - subsystem name used by nfd topology updater. + nfdTopologyUpdaterPrefix = "nfd_topology_updater" ) var ( buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: buildInfoQuery, - Help: "Version from which Node Feature Discovery was built.", + Subsystem: nfdTopologyUpdaterPrefix, + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", ConstLabels: map[string]string{ "version": version.Get(), }, }) scanErrors = prometheus.NewCounter(prometheus.CounterOpts{ - Name: scanErrorsQuery, - Help: "Number of errors in scanning resource allocation of pods.", + Subsystem: nfdTopologyUpdaterPrefix, + Name: scanErrorsQuery, + Help: "Number of errors in scanning resource allocation of pods.", }) ) diff --git a/pkg/nfd-worker/metrics.go b/pkg/nfd-worker/metrics.go index 206c4d577..37be314ac 100644 --- a/pkg/nfd-worker/metrics.go +++ b/pkg/nfd-worker/metrics.go @@ -23,22 +23,29 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_worker_build_info" - featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds" + buildInfoQuery = "build_info" + featureDiscoveryDurationQuery = "feature_discovery_duration_seconds" +) + +const ( + // nfdWorkerPrefix - subsystem name used by nfd worker. 
+ nfdWorkerPrefix = "nfd_worker" ) var ( featureDiscoveryDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: featureDiscoveryDurationQuery, - Help: "Time taken to discover features", - Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1}, + Subsystem: nfdWorkerPrefix, + Name: featureDiscoveryDurationQuery, + Help: "Time taken to discover features", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1}, }, []string{"node"}, ) buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: buildInfoQuery, - Help: "Version from which Node Feature Discovery was built.", + Subsystem: nfdWorkerPrefix, + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", ConstLabels: map[string]string{ "version": version.Get(), },