1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

chore: add metrics system prefix

This commit is contained in:
googs1025 2024-11-17 11:55:25 +08:00
parent 835832729f
commit e631a52374
6 changed files with 117 additions and 79 deletions

View file

@ -13,24 +13,25 @@ By default NFD Master and Worker expose metrics on port 8081.
The exposed metrics are The exposed metrics are
| Metric | Type | Description | | Metric | Type | Description |
| ------------------------------------------------- | --------- | ------------------------------------------------------- | | -------------------------------------------------------- | --------- | -------------------------------------------------------------------------- |
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built | | `nfd_master_build_info` | Gauge | Version from which nfd-master was built |
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built |
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built | | `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built |
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built | | `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built |
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC | | `nfd_master_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC |
| `nfd_node_updates_total` | Counter | Number of nodes updated | | `nfd_master_node_updates_total` | Counter | Number of nodes updated |
| `nfd_node_update_failures_total` | Counter | Number of node update failures | | `nfd_master_node_feature_group_update_requests_total` | Counter | Number of cluster feature update requests processed by the master |
| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master | | `nfd_master_node_update_failures_total` | Counter | Number of node update failures |
| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master | | `nfd_master_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master |
| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master | | `nfd_master_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master |
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | | `nfd_master_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master |
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects | | `nfd_master_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects |
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | | `nfd_master_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects |
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. | | `nfd_worker_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node |
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. | | `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. |
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. | | `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. |
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. |
## Kustomize ## Kustomize

View file

@ -391,7 +391,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_node_updates_total", "expr": "nfd_master_node_updates_total",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": true, "includeNullMetadata": true,
@ -586,7 +586,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_node_update_failures_total", "expr": "nfd_master_node_update_failures_total",
"legendFormat": "total", "legendFormat": "total",
"range": true, "range": true,
"refId": "A" "refId": "A"
@ -679,7 +679,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_nodefeaturerule_processing_errors_total", "expr": "nfd_master_nodefeaturerule_processing_errors_total",
"hide": false, "hide": false,
"legendFormat": "total", "legendFormat": "total",
"range": true, "range": true,
@ -940,7 +940,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "sum by(le) (nfd_feature_discovery_duration_seconds_bucket)", "expr": "sum by(le) (nfd_worker_feature_discovery_duration_seconds_bucket)",
"format": "heatmap", "format": "heatmap",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
@ -1007,7 +1007,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "sum by(le) (nfd_nodefeaturerule_processing_duration_seconds_bucket)", "expr": "sum by(le) (nfd_master_nodefeaturerule_processing_duration_seconds_bucket)",
"format": "heatmap", "format": "heatmap",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
@ -1101,7 +1101,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_node_labels_rejected_total", "expr": "nfd_master_node_labels_rejected_total",
"legendFormat": "total", "legendFormat": "total",
"range": true, "range": true,
"refId": "A" "refId": "A"
@ -1194,7 +1194,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_node_extendedresources_rejected_total", "expr": "nfd_master_node_extendedresources_rejected_total",
"hide": false, "hide": false,
"legendFormat": "total", "legendFormat": "total",
"range": true, "range": true,
@ -1288,7 +1288,7 @@
"uid": "prometheus" "uid": "prometheus"
}, },
"editorMode": "builder", "editorMode": "builder",
"expr": "nfd_node_taints_rejected_total", "expr": "nfd_master_node_taints_rejected_total",
"hide": false, "hide": false,
"legendFormat": "total", "legendFormat": "total",
"range": true, "range": true,

View file

@ -23,27 +23,35 @@ import (
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const ( const (
buildInfoQuery = "nfd_gc_build_info" buildInfoQuery = "build_info"
objectsDeletedQuery = "nfd_gc_objects_deleted_total" objectsDeletedQuery = "objects_deleted_total"
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total" objectDeleteErrorsQuery = "object_delete_failures_total"
)
const (
// nfdGCPrefix - subsystem name used by nfd gc.
nfdGCPrefix = "nfd_gc"
) )
var ( var (
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery, Subsystem: nfdGCPrefix,
Help: "Version from which Node Feature Discovery was built.", Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{ ConstLabels: map[string]string{
"version": version.Get(), "version": version.Get(),
}, },
}) })
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{ objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: objectsDeletedQuery, Subsystem: nfdGCPrefix,
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."}, Name: objectsDeletedQuery,
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
[]string{"kind"}, []string{"kind"},
) )
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: objectDeleteErrorsQuery, Subsystem: nfdGCPrefix,
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."}, Name: objectDeleteErrorsQuery,
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
[]string{"kind"}, []string{"kind"},
) )
) )

View file

@ -23,59 +23,73 @@ import (
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const ( const (
buildInfoQuery = "nfd_master_build_info" buildInfoQuery = "build_info"
nodeUpdateRequestsQuery = "nfd_node_update_requests_total" nodeUpdateRequestsQuery = "node_update_requests_total"
nodeUpdatesQuery = "nfd_node_updates_total" nodeUpdatesQuery = "node_updates_total"
nodeFeatureGroupUpdateRequestsQuery = "nfd_node_feature_group_update_requests_total" nodeFeatureGroupUpdateRequestsQuery = "node_feature_group_update_requests_total"
nodeUpdateFailuresQuery = "nfd_node_update_failures_total" nodeUpdateFailuresQuery = "node_update_failures_total"
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total" nodeLabelsRejectedQuery = "node_labels_rejected_total"
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total" nodeERsRejectedQuery = "node_extendedresources_rejected_total"
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total" nodeTaintsRejectedQuery = "node_taints_rejected_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" nfrProcessingTimeQuery = "nodefeaturerule_processing_duration_seconds"
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" nfrProcessingErrorsQuery = "nodefeaturerule_processing_errors_total"
)
const (
// nfdMasterPrefix - subsystem name used by nfd master.
nfdMasterPrefix = "nfd_master"
) )
var ( var (
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery, Subsystem: nfdMasterPrefix,
Help: "Version from which Node Feature Discovery was built.", Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{ ConstLabels: map[string]string{
"version": version.Get(), "version": version.Get(),
}, },
}) })
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateRequestsQuery, Subsystem: nfdMasterPrefix,
Help: "Number of node update requests processed by the master.", Name: nodeUpdateRequestsQuery,
Help: "Number of node update requests processed by the master.",
}) })
nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeFeatureGroupUpdateRequestsQuery, Subsystem: nfdMasterPrefix,
Help: "Number of cluster feature update requests processed by the master.", Name: nodeFeatureGroupUpdateRequestsQuery,
Help: "Number of cluster feature update requests processed by the master.",
}) })
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{ nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdatesQuery, Subsystem: nfdMasterPrefix,
Help: "Number of nodes updated by the master.", Name: nodeUpdatesQuery,
Help: "Number of nodes updated by the master.",
}) })
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{ nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateFailuresQuery, Subsystem: nfdMasterPrefix,
Help: "Number of node update failures.", Name: nodeUpdateFailuresQuery,
Help: "Number of node update failures.",
}) })
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{ nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeLabelsRejectedQuery, Subsystem: nfdMasterPrefix,
Help: "Number of node labels that were rejected by nfd-master.", Name: nodeLabelsRejectedQuery,
Help: "Number of node labels that were rejected by nfd-master.",
}) })
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{ nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeERsRejectedQuery, Subsystem: nfdMasterPrefix,
Help: "Number of node extended resources that were rejected by nfd-master.", Name: nodeERsRejectedQuery,
Help: "Number of node extended resources that were rejected by nfd-master.",
}) })
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{ nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeTaintsRejectedQuery, Subsystem: nfdMasterPrefix,
Help: "Number of node taints that were rejected by nfd-master.", Name: nodeTaintsRejectedQuery,
Help: "Number of node taints that were rejected by nfd-master.",
}) })
nfrProcessingTime = prometheus.NewHistogramVec( nfrProcessingTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{ prometheus.HistogramOpts{
Name: nfrProcessingTimeQuery, Subsystem: nfdMasterPrefix,
Help: "Time processing time of NodeFeatureRule objects.", Name: nfrProcessingTimeQuery,
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01}, Help: "Time processing time of NodeFeatureRule objects.",
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01},
}, },
[]string{ []string{
"name", "name",
@ -83,8 +97,9 @@ var (
}, },
) )
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: nfrProcessingErrorsQuery, Subsystem: nfdMasterPrefix,
Help: "Number of errors encountered while processing NodeFeatureRule objects.", Name: nfrProcessingErrorsQuery,
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
}) })
) )

View file

@ -23,21 +23,28 @@ import (
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const ( const (
buildInfoQuery = "nfd_topology_updater_build_info" buildInfoQuery = "build_info"
scanErrorsQuery = "nfd_topology_updater_scan_errors_total" scanErrorsQuery = "scan_errors_total"
)
const (
// nfdTopologyUpdaterPrefix - subsystem name used by nfd topology updater.
nfdTopologyUpdaterPrefix = "nfd_topology_updater"
) )
var ( var (
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery, Subsystem: nfdTopologyUpdaterPrefix,
Help: "Version from which Node Feature Discovery was built.", Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{ ConstLabels: map[string]string{
"version": version.Get(), "version": version.Get(),
}, },
}) })
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{ scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: scanErrorsQuery, Subsystem: nfdTopologyUpdaterPrefix,
Help: "Number of errors in scanning resource allocation of pods.", Name: scanErrorsQuery,
Help: "Number of errors in scanning resource allocation of pods.",
}) })
) )

View file

@ -23,22 +23,29 @@ import (
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const ( const (
buildInfoQuery = "nfd_worker_build_info" buildInfoQuery = "build_info"
featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds" featureDiscoveryDurationQuery = "feature_discovery_duration_seconds"
)
const (
// nfdWorkerPrefix - subsystem name used by nfd worker.
nfdWorkerPrefix = "nfd_worker"
) )
var ( var (
featureDiscoveryDuration = prometheus.NewHistogramVec( featureDiscoveryDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{ prometheus.HistogramOpts{
Name: featureDiscoveryDurationQuery, Subsystem: nfdWorkerPrefix,
Help: "Time taken to discover features", Name: featureDiscoveryDurationQuery,
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1}, Help: "Time taken to discover features",
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
}, },
[]string{"node"}, []string{"node"},
) )
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery, Subsystem: nfdWorkerPrefix,
Help: "Version from which Node Feature Discovery was built.", Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{ ConstLabels: map[string]string{
"version": version.Get(), "version": version.Get(),
}, },