mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2025-03-16 21:38:23 +00:00
nfd-gc: add metrics
Implements three metrics for nfd-gc: - nfd_gc_build_info: version information of nfd-gc. - nfd_gc_objects_deleted_total: total number of NodeFeature and NodeResourceTopology objects deleted by nfd-gc. - nfd_gc_object_delete_failures_total: number of errors encountered when deleting NodeFeature and NodeResourceTopology objects.
This commit is contained in:
parent
44b26e39e4
commit
98c3b0750d
8 changed files with 89 additions and 3 deletions
|
@ -83,6 +83,8 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
|
||||||
"interval between cleanup of obsolete api objects")
|
"interval between cleanup of obsolete api objects")
|
||||||
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
|
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
|
||||||
"Kubeconfig to use")
|
"Kubeconfig to use")
|
||||||
|
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
||||||
|
"Port on which to expose metrics.")
|
||||||
|
|
||||||
klog.InitFlags(flagset)
|
klog.InitFlags(flagset)
|
||||||
|
|
||||||
|
|
|
@ -21,3 +21,6 @@ spec:
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
command:
|
command:
|
||||||
- "nfd-gc"
|
- "nfd-gc"
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: 8081
|
||||||
|
|
|
@ -58,6 +58,9 @@ spec:
|
||||||
drop: [ "ALL" ]
|
drop: [ "ALL" ]
|
||||||
readOnlyRootFilesystem: true
|
readOnlyRootFilesystem: true
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: {{ .Values.gc.metricsPort | default "8081"}}
|
||||||
|
|
||||||
{{- with .Values.gc.nodeSelector }}
|
{{- with .Values.gc.nodeSelector }}
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
|
|
|
@ -495,6 +495,8 @@ gc:
|
||||||
# cpu: 100m
|
# cpu: 100m
|
||||||
# memory: 128Mi
|
# memory: 128Mi
|
||||||
|
|
||||||
|
metricsPort: 8081
|
||||||
|
|
||||||
nodeSelector: {}
|
nodeSelector: {}
|
||||||
tolerations: []
|
tolerations: []
|
||||||
annotations: {}
|
annotations: {}
|
||||||
|
|
|
@ -203,6 +203,7 @@ API's you need to install the prometheus operator in your cluster.
|
||||||
| `gc.interval` | string | 1h | Time between periodic garbage collector runs
|
| `gc.interval` | string | 1h | Time between periodic garbage collector runs
|
||||||
| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings
|
| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings
|
||||||
| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
|
| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
|
||||||
|
| `gc.metricsPort` | integer | 8081 | Port on which to serve Prometheus metrics
|
||||||
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector)
|
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector)
|
||||||
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
|
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
|
||||||
| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)
|
| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)
|
||||||
|
|
|
@ -17,6 +17,7 @@ The exposed metrics are
|
||||||
| ------------------------------------------------- | --------- | ---------------------------------------
|
| ------------------------------------------------- | --------- | ---------------------------------------
|
||||||
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
||||||
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
||||||
|
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built
|
||||||
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built
|
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built
|
||||||
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC
|
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC
|
||||||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||||
|
@ -28,6 +29,8 @@ The exposed metrics are
|
||||||
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
|
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
|
||||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||||
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
||||||
|
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected.
|
||||||
|
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects.
|
||||||
|
|
||||||
## Via Kustomize
|
## Via Kustomize
|
||||||
|
|
||||||
|
|
54
pkg/nfd-gc/metrics.go
Normal file
54
pkg/nfd-gc/metrics.go
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
/*
|
||||||
|
Copyright 2023 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package nfdgarbagecollector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
|
const (
|
||||||
|
buildInfoQuery = "nfd_gc_build_info"
|
||||||
|
objectsDeletedQuery = "nfd_gc_objects_deleted_total"
|
||||||
|
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
|
Name: buildInfoQuery,
|
||||||
|
Help: "Version from which Node Feature Discovery was built.",
|
||||||
|
ConstLabels: map[string]string{
|
||||||
|
"version": version.Get(),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Name: objectsDeletedQuery,
|
||||||
|
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
|
||||||
|
[]string{"kind"},
|
||||||
|
)
|
||||||
|
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Name: objectDeleteErrorsQuery,
|
||||||
|
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
|
||||||
|
[]string{"kind"},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
// registerVersion exposes the nfd-gc build version by setting the build info gauge to the current time.
|
||||||
|
func registerVersion(version string) {
|
||||||
|
buildInfo.SetToCurrentTime()
|
||||||
|
}
|
|
@ -34,13 +34,15 @@ import (
|
||||||
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
|
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
|
||||||
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
|
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
|
||||||
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
|
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
|
||||||
|
"sigs.k8s.io/node-feature-discovery/pkg/utils"
|
||||||
|
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Args are the command line arguments
|
// Args are the command line arguments
|
||||||
type Args struct {
|
type Args struct {
|
||||||
GCPeriod time.Duration
|
GCPeriod time.Duration
|
||||||
|
Kubeconfig string
|
||||||
Kubeconfig string
|
MetricsPort int
|
||||||
}
|
}
|
||||||
|
|
||||||
type NfdGarbageCollector interface {
|
type NfdGarbageCollector interface {
|
||||||
|
@ -74,29 +76,35 @@ func New(args *Args) (NfdGarbageCollector, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
|
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
|
||||||
|
kind := "NodeFeature"
|
||||||
if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
|
if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
|
||||||
if errors.IsNotFound(err) {
|
if errors.IsNotFound(err) {
|
||||||
klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name))
|
klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name))
|
||||||
return
|
return
|
||||||
} else {
|
} else {
|
||||||
klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name))
|
klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name))
|
||||||
|
objectDeleteErrors.WithLabelValues(kind).Inc()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name))
|
klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name))
|
||||||
|
objectsDeleted.WithLabelValues(kind).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
|
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
|
||||||
|
kind := "NodeResourceTopology"
|
||||||
if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil {
|
if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil {
|
||||||
if errors.IsNotFound(err) {
|
if errors.IsNotFound(err) {
|
||||||
klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
|
klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
|
||||||
return
|
return
|
||||||
} else {
|
} else {
|
||||||
klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
|
klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
|
||||||
|
objectDeleteErrors.WithLabelValues(kind).Inc()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
|
klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
|
||||||
|
objectsDeleted.WithLabelValues(kind).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
|
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
|
||||||
|
@ -208,6 +216,16 @@ func (n *nfdGarbageCollector) startNodeInformer() error {
|
||||||
|
|
||||||
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
|
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
|
||||||
func (n *nfdGarbageCollector) Run() error {
|
func (n *nfdGarbageCollector) Run() error {
|
||||||
|
if n.args.MetricsPort > 0 {
|
||||||
|
m := utils.CreateMetricsServer(n.args.MetricsPort,
|
||||||
|
buildInfo,
|
||||||
|
objectsDeleted,
|
||||||
|
objectDeleteErrors)
|
||||||
|
go m.Run()
|
||||||
|
registerVersion(version.Get())
|
||||||
|
defer m.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
if err := n.startNodeInformer(); err != nil {
|
if err := n.startNodeInformer(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue