mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-30 19:54:46 +00:00
metrics: add nfd_nodefeaturerule_processing_errors_total counter
Add a counter for errors encountered when processing NodeFeatureRules. Another simple counter without any additional prometheus labels - nfd-master logs can provide further details.
This commit is contained in:
parent
b90f2c318e
commit
a8a29e6df2
3 changed files with 13 additions and 5 deletions
|
@ -20,6 +20,7 @@ The exposed metrics are
|
|||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||
| `nfd_node_update_failures_total` | Counter | Number of node update failures
|
||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
||||
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
|
||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||
|
||||
## Via Kustomize
|
||||
|
|
|
@ -28,10 +28,11 @@ import (
|
|||
|
||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||
const (
|
||||
buildInfoQuery = "nfd_master_build_info"
|
||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
||||
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
|
||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
||||
buildInfoQuery = "nfd_master_build_info"
|
||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
||||
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
|
||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
||||
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -63,6 +64,10 @@ var (
|
|||
"node",
|
||||
},
|
||||
)
|
||||
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: nfrProcessingErrorsQuery,
|
||||
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
|
||||
})
|
||||
)
|
||||
|
||||
// registerVersion exposes the Operator build version.
|
||||
|
@ -76,7 +81,8 @@ func runMetricsServer(port int) {
|
|||
r.MustRegister(buildInfo,
|
||||
nodeUpdates,
|
||||
nodeUpdateFailures,
|
||||
nfrProcessingTime)
|
||||
nfrProcessingTime,
|
||||
nfrProcessingErrors)
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
|
||||
|
|
|
@ -991,6 +991,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
|
|||
ruleOut, err := rule.Execute(features)
|
||||
if err != nil {
|
||||
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
|
||||
nfrProcessingErrors.Inc()
|
||||
continue
|
||||
}
|
||||
taints = append(taints, ruleOut.Taints...)
|
||||
|
|
Loading…
Add table
Reference in a new issue