mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-28 10:47:23 +00:00
metrics: add nfd_node_update_failures_total counter
Add a new counter for tracking node update failures from nfd-master. This tracks both normal feature updates and the --prune sub-command. This is a simple counter without any additional labels - nfd-master logs can be used for further diagnostics.
This commit is contained in:
parent
9ed191808d
commit
b90f2c318e
4 changed files with 13 additions and 3 deletions
|
@ -18,6 +18,7 @@ The exposed metrics are
|
|||
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
||||
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
||||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||
| `nfd_node_update_failures_total` | Counter | Number of node update failures
|
||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||
|
||||
|
|
|
@ -28,9 +28,10 @@ import (
|
|||
|
||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||
const (
|
||||
buildInfoQuery = "nfd_master_build_info"
|
||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
||||
buildInfoQuery = "nfd_master_build_info"
|
||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
||||
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
|
||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -47,6 +48,10 @@ var (
|
|||
Name: nodeUpdatesQuery,
|
||||
Help: "Number of nodes updated by the master.",
|
||||
})
|
||||
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: nodeUpdateFailuresQuery,
|
||||
Help: "Number of node update failures.",
|
||||
})
|
||||
nfrProcessingTime = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: nfrProcessingTimeQuery,
|
||||
|
@ -70,6 +75,7 @@ func runMetricsServer(port int) {
|
|||
r := prometheus.NewRegistry()
|
||||
r.MustRegister(buildInfo,
|
||||
nodeUpdates,
|
||||
nodeUpdateFailures,
|
||||
nfrProcessingTime)
|
||||
|
||||
mux := http.NewServeMux()
|
||||
|
|
|
@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error {
|
|||
// Prune labels and extended resources
|
||||
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
|
||||
if err != nil {
|
||||
nodeUpdateFailures.Inc()
|
||||
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
|
||||
}
|
||||
|
||||
|
@ -675,6 +676,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se
|
|||
|
||||
// Create labels et al
|
||||
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
|
||||
nodeUpdateFailures.Inc()
|
||||
return &pb.SetLabelsReply{}, err
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,6 +53,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
|
|||
return true
|
||||
} else {
|
||||
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
|
||||
nodeUpdateFailures.Inc()
|
||||
}
|
||||
}
|
||||
queue.Forget(nodeName)
|
||||
|
|
Loading…
Add table
Reference in a new issue