mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-16 21:38:23 +00:00
Merge pull request #1290 from marquiz/devel/metrics-new
metrics: additional metrics for nfd-master
This commit is contained in:
commit
6d95e59cd0
4 changed files with 58 additions and 5 deletions
|
@ -17,8 +17,14 @@ The exposed metrics are
|
||||||
| ------------------------------------------------- | --------- | ---------------------------------------
|
| ------------------------------------------------- | --------- | ---------------------------------------
|
||||||
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
||||||
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
||||||
|
| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master
|
||||||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||||
|
| `nfd_node_update_failures_total` | Counter | Number of node update failures
|
||||||
|
| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master
|
||||||
|
| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master
|
||||||
|
| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master
|
||||||
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
|
||||||
|
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
|
||||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||||
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
||||||
|
|
||||||
|
|
|
@ -28,9 +28,15 @@ import (
|
||||||
|
|
||||||
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
const (
|
const (
|
||||||
buildInfoQuery = "nfd_master_build_info"
|
buildInfoQuery = "nfd_master_build_info"
|
||||||
nodeUpdatesQuery = "nfd_node_updates_total"
|
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
|
||||||
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
nodeUpdatesQuery = "nfd_node_updates_total"
|
||||||
|
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
|
||||||
|
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
|
||||||
|
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
|
||||||
|
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
|
||||||
|
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
|
||||||
|
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -43,10 +49,30 @@ var (
|
||||||
"version": version.Get(),
|
"version": version.Get(),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nodeUpdateRequestsQuery,
|
||||||
|
Help: "Number of node update requests processed by the master.",
|
||||||
|
})
|
||||||
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
|
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: nodeUpdatesQuery,
|
Name: nodeUpdatesQuery,
|
||||||
Help: "Number of nodes updated by the master.",
|
Help: "Number of nodes updated by the master.",
|
||||||
})
|
})
|
||||||
|
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nodeUpdateFailuresQuery,
|
||||||
|
Help: "Number of node update failures.",
|
||||||
|
})
|
||||||
|
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nodeLabelsRejectedQuery,
|
||||||
|
Help: "Number of node labels that were rejected by nfd-master.",
|
||||||
|
})
|
||||||
|
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nodeERsRejectedQuery,
|
||||||
|
Help: "Number of node extended resources that were rejected by nfd-master.",
|
||||||
|
})
|
||||||
|
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nodeTaintsRejectedQuery,
|
||||||
|
Help: "Number of node taints that were rejected by nfd-master.",
|
||||||
|
})
|
||||||
nfrProcessingTime = prometheus.NewHistogramVec(
|
nfrProcessingTime = prometheus.NewHistogramVec(
|
||||||
prometheus.HistogramOpts{
|
prometheus.HistogramOpts{
|
||||||
Name: nfrProcessingTimeQuery,
|
Name: nfrProcessingTimeQuery,
|
||||||
|
@ -58,6 +84,10 @@ var (
|
||||||
"node",
|
"node",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: nfrProcessingErrorsQuery,
|
||||||
|
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
|
||||||
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
// registerVersion exposes the Operator build version.
|
// registerVersion exposes the Operator build version.
|
||||||
|
@ -68,9 +98,16 @@ func registerVersion(version string) {
|
||||||
// runMetricsServer starts a http server to expose metrics
|
// runMetricsServer starts a http server to expose metrics
|
||||||
func runMetricsServer(port int) {
|
func runMetricsServer(port int) {
|
||||||
r := prometheus.NewRegistry()
|
r := prometheus.NewRegistry()
|
||||||
r.MustRegister(buildInfo,
|
r.MustRegister(
|
||||||
|
buildInfo,
|
||||||
|
nodeUpdateRequests,
|
||||||
nodeUpdates,
|
nodeUpdates,
|
||||||
nfrProcessingTime)
|
nodeUpdateFailures,
|
||||||
|
nodeLabelsRejected,
|
||||||
|
nodeERsRejected,
|
||||||
|
nodeTaintsRejected,
|
||||||
|
nfrProcessingTime,
|
||||||
|
nfrProcessingErrors)
|
||||||
|
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
|
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
|
||||||
|
|
|
@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error {
|
||||||
// Prune labels and extended resources
|
// Prune labels and extended resources
|
||||||
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
|
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
nodeUpdateFailures.Inc()
|
||||||
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
|
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -509,6 +510,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea
|
||||||
|
|
||||||
if value, err := m.filterFeatureLabel(name, value, features); err != nil {
|
if value, err := m.filterFeatureLabel(name, value, features); err != nil {
|
||||||
klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value)
|
klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value)
|
||||||
|
nodeLabelsRejected.Inc()
|
||||||
} else {
|
} else {
|
||||||
outLabels[name] = value
|
outLabels[name] = value
|
||||||
}
|
}
|
||||||
|
@ -522,6 +524,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea
|
||||||
if value, ok := outLabels[extendedResourceName]; ok {
|
if value, ok := outLabels[extendedResourceName]; ok {
|
||||||
if _, err := strconv.Atoi(value); err != nil {
|
if _, err := strconv.Atoi(value); err != nil {
|
||||||
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value)
|
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value)
|
||||||
|
nodeERsRejected.Inc()
|
||||||
continue // non-numeric label can't be used
|
continue // non-numeric label can't be used
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -602,6 +605,7 @@ func filterTaints(taints []corev1.Taint) []corev1.Taint {
|
||||||
for _, taint := range taints {
|
for _, taint := range taints {
|
||||||
if err := filterTaint(&taint); err != nil {
|
if err := filterTaint(&taint); err != nil {
|
||||||
klog.ErrorS(err, "ignoring taint", "taint", taint)
|
klog.ErrorS(err, "ignoring taint", "taint", taint)
|
||||||
|
nodeTaintsRejected.Inc()
|
||||||
} else {
|
} else {
|
||||||
outTaints = append(outTaints, taint)
|
outTaints = append(outTaints, taint)
|
||||||
}
|
}
|
||||||
|
@ -650,6 +654,7 @@ func isNamespaceDenied(labelNs string, wildcardDeniedNs map[string]struct{}, nor
|
||||||
|
|
||||||
// SetLabels implements LabelerServer
|
// SetLabels implements LabelerServer
|
||||||
func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) {
|
func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) {
|
||||||
|
nodeUpdateRequests.Inc()
|
||||||
err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName)
|
err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName)
|
klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName)
|
||||||
|
@ -675,6 +680,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se
|
||||||
|
|
||||||
// Create labels et al
|
// Create labels et al
|
||||||
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
|
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
|
||||||
|
nodeUpdateFailures.Inc()
|
||||||
return &pb.SetLabelsReply{}, err
|
return &pb.SetLabelsReply{}, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -784,6 +790,7 @@ func filterExtendedResources(features *nfdv1alpha1.Features, extendedResources E
|
||||||
capacity, err := filterExtendedResource(name, value, features)
|
capacity, err := filterExtendedResource(name, value, features)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value)
|
klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value)
|
||||||
|
nodeERsRejected.Inc()
|
||||||
} else {
|
} else {
|
||||||
outExtendedResources[name] = capacity
|
outExtendedResources[name] = capacity
|
||||||
}
|
}
|
||||||
|
@ -989,6 +996,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
|
||||||
ruleOut, err := rule.Execute(features)
|
ruleOut, err := rule.Execute(features)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
|
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
|
||||||
|
nfrProcessingErrors.Inc()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
taints = append(taints, ruleOut.Taints...)
|
taints = append(taints, ruleOut.Taints...)
|
||||||
|
|
|
@ -46,6 +46,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
|
||||||
|
|
||||||
defer queue.Done(nodeName)
|
defer queue.Done(nodeName)
|
||||||
|
|
||||||
|
nodeUpdateRequests.Inc()
|
||||||
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
|
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
|
||||||
if queue.NumRequeues(nodeName) < 5 {
|
if queue.NumRequeues(nodeName) < 5 {
|
||||||
klog.InfoS("retrying node update", "nodeName", nodeName)
|
klog.InfoS("retrying node update", "nodeName", nodeName)
|
||||||
|
@ -53,6 +54,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI
|
||||||
return true
|
return true
|
||||||
} else {
|
} else {
|
||||||
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
|
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
|
||||||
|
nodeUpdateFailures.Inc()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
queue.Forget(nodeName)
|
queue.Forget(nodeName)
|
||||||
|
|
Loading…
Add table
Reference in a new issue