1
0
Fork 0
mirror of https://github.com/kubernetes-sigs/node-feature-discovery.git synced 2024-12-14 11:57:51 +00:00

nfd-gc: add metrics

Implements three metrics for nfd-gc:

- nfd_gc_build_info: version information of nfd-gc.
- nfd_gc_objects_deleted_total: total number of NodeFeature and
  NodeResourceTopology objects deleted by nfd-gc.
- nfd_gc_object_delete_failures_total: number of errors encountered when
  deleting NodeFeature and NodeResourceTopology objects.
This commit is contained in:
Markus Lehtonen 2023-10-09 13:06:07 +03:00
parent 44b26e39e4
commit 98c3b0750d
8 changed files with 89 additions and 3 deletions

View file

@ -83,6 +83,8 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
"interval between cleanup of obsolete api objects")
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
"Kubeconfig to use")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
klog.InitFlags(flagset)

View file

@ -21,3 +21,6 @@ spec:
imagePullPolicy: Always
command:
- "nfd-gc"
ports:
- name: metrics
containerPort: 8081

View file

@ -58,6 +58,9 @@ spec:
drop: [ "ALL" ]
readOnlyRootFilesystem: true
runAsNonRoot: true
ports:
- name: metrics
containerPort: {{ .Values.gc.metricsPort | default "8081"}}
{{- with .Values.gc.nodeSelector }}
nodeSelector:

View file

@ -495,6 +495,8 @@ gc:
# cpu: 100m
# memory: 128Mi
metricsPort: 8081
nodeSelector: {}
tolerations: []
annotations: {}

View file

@ -203,6 +203,7 @@ API's you need to install the prometheus operator in your cluster.
| `gc.interval` | string | 1h | Time between periodic garbage collector runs
| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings
| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
| `gc.metricsPort` | integer | 8081 | Port on which to serve Prometheus metrics
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector)
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)

View file

@ -17,6 +17,7 @@ The exposed metrics are
| ------------------------------------------------- | --------- | ---------------------------------------
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC
| `nfd_node_updates_total` | Counter | Number of nodes updated
@ -28,6 +29,8 @@ The exposed metrics are
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected.
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects.
## Via Kustomize

54
pkg/nfd-gc/metrics.go Normal file
View file

@ -0,0 +1,54 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nfdgarbagecollector
import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
// Names of the Prometheus metrics exposed by nfd-gc.
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
// buildInfoQuery names the gauge that carries the build version as a label.
buildInfoQuery = "nfd_gc_build_info"
// objectsDeletedQuery names the counter of objects garbage collected.
objectsDeletedQuery = "nfd_gc_objects_deleted_total"
// objectDeleteErrorsQuery names the counter of failed object deletions.
objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
)
// Prometheus collectors published by nfd-gc.
var (
	// buildInfo is a gauge whose constant "version" label holds the string
	// returned by version.Get().
	buildInfo = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name:        buildInfoQuery,
			Help:        "Version from which Node Feature Discovery was built.",
			ConstLabels: prometheus.Labels{"version": version.Get()},
		},
	)

	// objectsDeleted counts deleted objects, partitioned by the "kind" label.
	objectsDeleted = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: objectsDeletedQuery,
			Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected.",
		},
		[]string{"kind"},
	)

	// objectDeleteErrors counts failed deletions, partitioned by the "kind" label.
	objectDeleteErrors = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: objectDeleteErrorsQuery,
			Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects.",
		},
		[]string{"kind"},
	)
)
// registerVersion publishes the nfd-gc build info metric by setting the
// buildInfo gauge to the current time. The version string itself is exposed
// through the gauge's constant "version" label, so the version argument is
// not read in the body.
// NOTE(review): the parameter name shadows the imported version package
// inside this function.
func registerVersion(version string) {
buildInfo.SetToCurrentTime()
}

View file

@ -34,13 +34,15 @@ import (
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
"sigs.k8s.io/node-feature-discovery/pkg/utils"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)
// Args holds the command line arguments of nfd-gc.
type Args struct {
GCPeriod time.Duration
Kubeconfig string
// NOTE(review): the two fields above are repeated below; this looks like the
// removed and added sides of a diff hunk shown together — confirm which copy
// is current.

// GCPeriod is presumably the interval between periodic garbage collection
// runs — confirm against the flag definition.
GCPeriod time.Duration
// Kubeconfig is the kubeconfig to use (bound to the "kubeconfig" flag).
Kubeconfig string
// MetricsPort is the port on which metrics are exposed (bound to the
// "metrics" flag). Run only starts the metrics server when it is > 0.
MetricsPort int
}
type NfdGarbageCollector interface {
@ -74,29 +76,35 @@ func New(args *Args) (NfdGarbageCollector, error) {
}
// deleteNodeFeature issues a delete for the NodeFeature object identified by
// namespace and name and records the outcome:
//   - success: logged and counted in objectsDeleted;
//   - object not found: logged at verbosity 2, no metric is touched;
//   - any other error: logged and counted in objectDeleteErrors.
//
// Both counters are labelled with kind "NodeFeature".
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
	const kind = "NodeFeature"
	ref := klog.KRef(namespace, name)

	err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{})
	switch {
	case err == nil:
		klog.InfoS("NodeFeature object has been deleted", "nodefeature", ref)
		objectsDeleted.WithLabelValues(kind).Inc()
	case errors.IsNotFound(err):
		// Already gone: neither a deletion nor a failure.
		klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", ref)
	default:
		klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", ref)
		objectDeleteErrors.WithLabelValues(kind).Inc()
	}
}
// deleteNRT issues a delete for the NodeResourceTopology object named
// nodeName and records the outcome:
//   - success: logged and counted in objectsDeleted;
//   - object not found: logged at verbosity 2, no metric is touched;
//   - any other error: logged and counted in objectDeleteErrors.
//
// Both counters are labelled with kind "NodeResourceTopology".
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
	const kind = "NodeResourceTopology"

	err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{})
	switch {
	case err == nil:
		klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
		objectsDeleted.WithLabelValues(kind).Inc()
	case errors.IsNotFound(err):
		// Already gone: neither a deletion nor a failure.
		klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
	default:
		klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
		objectDeleteErrors.WithLabelValues(kind).Inc()
	}
}
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
@ -208,6 +216,16 @@ func (n *nfdGarbageCollector) startNodeInformer() error {
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
func (n *nfdGarbageCollector) Run() error {
if n.args.MetricsPort > 0 {
m := utils.CreateMetricsServer(n.args.MetricsPort,
buildInfo,
objectsDeleted,
objectDeleteErrors)
go m.Run()
registerVersion(version.Get())
defer m.Stop()
}
if err := n.startNodeInformer(); err != nil {
return err
}