mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
chore: add metrics system prefix
This commit is contained in:
parent
835832729f
commit
e631a52374
6 changed files with 117 additions and 79 deletions
|
@ -13,24 +13,25 @@ By default NFD Master and Worker expose metrics on port 8081.
|
||||||
|
|
||||||
The exposed metrics are
|
The exposed metrics are
|
||||||
|
|
||||||
| Metric | Type | Description |
|
| Metric | Type | Description |
|
||||||
| ------------------------------------------------- | --------- | ------------------------------------------------------- |
|
| -------------------------------------------------------- | --------- | -------------------------------------------------------------------------- |
|
||||||
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built |
|
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built |
|
||||||
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built |
|
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built |
|
||||||
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built |
|
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built |
|
||||||
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built |
|
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built |
|
||||||
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC |
|
| `nfd_master_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC |
|
||||||
| `nfd_node_updates_total` | Counter | Number of nodes updated |
|
| `nfd_master_node_updates_total` | Counter | Number of nodes updated |
|
||||||
| `nfd_node_update_failures_total` | Counter | Number of nodes update failures |
|
| `nfd_master_node_feature_group_update_requests_total` | Counter | Number of cluster feature update requests processed by the master |
|
||||||
| `nfd_node_labels_rejected_total` | Counter | Number of nodes labels rejected by nfd-master |
|
| `nfd_master_node_update_failures_total` | Counter | Number of node update failures |
|
||||||
| `nfd_node_extendedresources_rejected_total` | Counter | Number of nodes extended resources rejected by nfd-master |
|
| `nfd_master_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master |
|
||||||
| `nfd_node_taints_rejected_total` | Counter | Number of nodes taints rejected by nfd-master |
|
| `nfd_master_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master |
|
||||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects |
|
| `nfd_master_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master |
|
||||||
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects |
|
| `nfd_master_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects |
|
||||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node |
|
| `nfd_master_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects |
|
||||||
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. |
|
| `nfd_worker_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node |
|
||||||
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. |
|
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. |
|
||||||
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. |
|
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected. |
|
||||||
|
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects. |
|
||||||
|
|
||||||
## Kustomize
|
## Kustomize
|
||||||
|
|
||||||
|
|
|
@ -391,7 +391,7 @@
|
||||||
},
|
},
|
||||||
"disableTextWrap": false,
|
"disableTextWrap": false,
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_node_updates_total",
|
"expr": "nfd_master_node_updates_total",
|
||||||
"fullMetaSearch": false,
|
"fullMetaSearch": false,
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"includeNullMetadata": true,
|
"includeNullMetadata": true,
|
||||||
|
@ -586,7 +586,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_node_update_failures_total",
|
"expr": "nfd_master_node_update_failures_total",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"range": true,
|
"range": true,
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
|
@ -679,7 +679,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_nodefeaturerule_processing_errors_total",
|
"expr": "nfd_master_nodefeaturerule_processing_errors_total",
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"range": true,
|
"range": true,
|
||||||
|
@ -940,7 +940,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "sum by(le) (nfd_feature_discovery_duration_seconds_bucket)",
|
"expr": "sum by(le) (nfd_worker_feature_discovery_duration_seconds_bucket)",
|
||||||
"format": "heatmap",
|
"format": "heatmap",
|
||||||
"legendFormat": "__auto",
|
"legendFormat": "__auto",
|
||||||
"range": true,
|
"range": true,
|
||||||
|
@ -1007,7 +1007,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "sum by(le) (nfd_nodefeaturerule_processing_duration_seconds_bucket)",
|
"expr": "sum by(le) (nfd_master_nodefeaturerule_processing_duration_seconds_bucket)",
|
||||||
"format": "heatmap",
|
"format": "heatmap",
|
||||||
"legendFormat": "__auto",
|
"legendFormat": "__auto",
|
||||||
"range": true,
|
"range": true,
|
||||||
|
@ -1101,7 +1101,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_node_labels_rejected_total",
|
"expr": "nfd_master_node_labels_rejected_total",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"range": true,
|
"range": true,
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
|
@ -1194,7 +1194,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_node_extendedresources_rejected_total",
|
"expr": "nfd_master_node_extendedresources_rejected_total",
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"range": true,
|
"range": true,
|
||||||
|
@ -1288,7 +1288,7 @@
|
||||||
"uid": "prometheus"
|
"uid": "prometheus"
|
||||||
},
|
},
|
||||||
"editorMode": "builder",
|
"editorMode": "builder",
|
||||||
"expr": "nfd_node_taints_rejected_total",
|
"expr": "nfd_master_node_taints_rejected_total",
|
||||||
"hide": false,
|
"hide": false,
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"range": true,
|
"range": true,
|
||||||
|
|
|
@ -23,27 +23,35 @@ import (
|
||||||
|
|
||||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
const (
|
const (
|
||||||
buildInfoQuery = "nfd_gc_build_info"
|
buildInfoQuery = "build_info"
|
||||||
objectsDeletedQuery = "nfd_gc_objects_deleted_total"
|
objectsDeletedQuery = "objects_deleted_total"
|
||||||
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
|
objectDeleteErrorsQuery = "object_delete_failures_total"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// nfdGCPrefix - subsystem name used by nfd gc.
|
||||||
|
nfdGCPrefix = "nfd_gc"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Name: buildInfoQuery,
|
Subsystem: nfdGCPrefix,
|
||||||
Help: "Version from which Node Feature Discovery was built.",
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
ConstLabels: map[string]string{
|
ConstLabels: map[string]string{
|
||||||
"version": version.Get(),
|
"version": version.Get(),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
|
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
Name: objectsDeletedQuery,
|
Subsystem: nfdGCPrefix,
|
||||||
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
|
Name: objectsDeletedQuery,
|
||||||
|
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
|
||||||
[]string{"kind"},
|
[]string{"kind"},
|
||||||
)
|
)
|
||||||
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
|
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
Name: objectDeleteErrorsQuery,
|
Subsystem: nfdGCPrefix,
|
||||||
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
|
Name: objectDeleteErrorsQuery,
|
||||||
|
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
|
||||||
[]string{"kind"},
|
[]string{"kind"},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
@ -23,59 +23,73 @@ import (
|
||||||
|
|
||||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
const (
|
const (
|
||||||
buildInfoQuery = "nfd_master_build_info"
|
buildInfoQuery = "build_info"
|
||||||
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
|
nodeUpdateRequestsQuery = "node_update_requests_total"
|
||||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
nodeUpdatesQuery = "node_updates_total"
|
||||||
nodeFeatureGroupUpdateRequestsQuery = "nfd_node_feature_group_update_requests_total"
|
nodeFeatureGroupUpdateRequestsQuery = "node_feature_group_update_requests_total"
|
||||||
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
|
nodeUpdateFailuresQuery = "node_update_failures_total"
|
||||||
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
|
nodeLabelsRejectedQuery = "node_labels_rejected_total"
|
||||||
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
|
nodeERsRejectedQuery = "node_extendedresources_rejected_total"
|
||||||
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
|
nodeTaintsRejectedQuery = "node_taints_rejected_total"
|
||||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
nfrProcessingTimeQuery = "nodefeaturerule_processing_duration_seconds"
|
||||||
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
|
nfrProcessingErrorsQuery = "nodefeaturerule_processing_errors_total"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// nfdMasterPrefix - subsystem name used by nfd master.
|
||||||
|
nfdMasterPrefix = "nfd_master"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Name: buildInfoQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Version from which Node Feature Discovery was built.",
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
ConstLabels: map[string]string{
|
ConstLabels: map[string]string{
|
||||||
"version": version.Get(),
|
"version": version.Get(),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeUpdateRequestsQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of node update requests processed by the master.",
|
Name: nodeUpdateRequestsQuery,
|
||||||
|
Help: "Number of node update requests processed by the master.",
|
||||||
})
|
})
|
||||||
nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeFeatureGroupUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeFeatureGroupUpdateRequestsQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of cluster feature update requests processed by the master.",
|
Name: nodeFeatureGroupUpdateRequestsQuery,
|
||||||
|
Help: "Number of cluster feature update requests processed by the master.",
|
||||||
})
|
})
|
||||||
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeUpdatesQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of nodes updated by the master.",
|
Name: nodeUpdatesQuery,
|
||||||
|
Help: "Number of nodes updated by the master.",
|
||||||
})
|
})
|
||||||
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeUpdateFailuresQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of node update failures.",
|
Name: nodeUpdateFailuresQuery,
|
||||||
|
Help: "Number of node update failures.",
|
||||||
})
|
})
|
||||||
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeLabelsRejectedQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of node labels that were rejected by nfd-master.",
|
Name: nodeLabelsRejectedQuery,
|
||||||
|
Help: "Number of node labels that were rejected by nfd-master.",
|
||||||
})
|
})
|
||||||
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeERsRejectedQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of node extended resources that were rejected by nfd-master.",
|
Name: nodeERsRejectedQuery,
|
||||||
|
Help: "Number of node extended resources that were rejected by nfd-master.",
|
||||||
})
|
})
|
||||||
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeTaintsRejectedQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of node taints that were rejected by nfd-master.",
|
Name: nodeTaintsRejectedQuery,
|
||||||
|
Help: "Number of node taints that were rejected by nfd-master.",
|
||||||
})
|
})
|
||||||
nfrProcessingTime = prometheus.NewHistogramVec(
|
nfrProcessingTime = prometheus.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
Name: nfrProcessingTimeQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Time processing time of NodeFeatureRule objects.",
|
Name: nfrProcessingTimeQuery,
|
||||||
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01},
|
Help: "Time processing time of NodeFeatureRule objects.",
|
||||||
|
Buckets: []float64{0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01},
|
||||||
},
|
},
|
||||||
[]string{
|
[]string{
|
||||||
"name",
|
"name",
|
||||||
|
@ -83,8 +97,9 @@ var (
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nfrProcessingErrorsQuery,
|
Subsystem: nfdMasterPrefix,
|
||||||
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
|
Name: nfrProcessingErrorsQuery,
|
||||||
|
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -23,21 +23,28 @@ import (
|
||||||
|
|
||||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
const (
|
const (
|
||||||
buildInfoQuery = "nfd_topology_updater_build_info"
|
buildInfoQuery = "build_info"
|
||||||
scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
|
scanErrorsQuery = "scan_errors_total"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// nfdTopologyUpdaterPrefix - subsystem name used by nfd topology updater.
|
||||||
|
nfdTopologyUpdaterPrefix = "nfd_topology_updater"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Name: buildInfoQuery,
|
Subsystem: nfdTopologyUpdaterPrefix,
|
||||||
Help: "Version from which Node Feature Discovery was built.",
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
ConstLabels: map[string]string{
|
ConstLabels: map[string]string{
|
||||||
"version": version.Get(),
|
"version": version.Get(),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: scanErrorsQuery,
|
Subsystem: nfdTopologyUpdaterPrefix,
|
||||||
Help: "Number of errors in scanning resource allocation of pods.",
|
Name: scanErrorsQuery,
|
||||||
|
Help: "Number of errors in scanning resource allocation of pods.",
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -23,22 +23,29 @@ import (
|
||||||
|
|
||||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
const (
|
const (
|
||||||
buildInfoQuery = "nfd_worker_build_info"
|
buildInfoQuery = "build_info"
|
||||||
featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds"
|
featureDiscoveryDurationQuery = "feature_discovery_duration_seconds"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// nfdWorkerPrefix - subsystem name used by nfd worker.
|
||||||
|
nfdWorkerPrefix = "nfd_worker"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
featureDiscoveryDuration = prometheus.NewHistogramVec(
|
featureDiscoveryDuration = prometheus.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
Name: featureDiscoveryDurationQuery,
|
Subsystem: nfdWorkerPrefix,
|
||||||
Help: "Time taken to discover features",
|
Name: featureDiscoveryDurationQuery,
|
||||||
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
|
Help: "Time taken to discover features",
|
||||||
|
Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
|
||||||
},
|
},
|
||||||
[]string{"node"},
|
[]string{"node"},
|
||||||
)
|
)
|
||||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Name: buildInfoQuery,
|
Subsystem: nfdWorkerPrefix,
|
||||||
Help: "Version from which Node Feature Discovery was built.",
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
ConstLabels: map[string]string{
|
ConstLabels: map[string]string{
|
||||||
"version": version.Get(),
|
"version": version.Get(),
|
||||||
},
|
},
|
||||||
|
|
Loading…
Reference in a new issue