mirror of
https://github.com/kubernetes-sigs/node-feature-discovery.git
synced 2024-12-14 11:57:51 +00:00
nfd-gc: add metrics
Implements three metrics for nfd-gc: - nfd_gc_build_info: version information of nfd-gc. - nfd_gc_objects_deleted_total: total number of NodeFeature and NodeResourceTopology objects deleted by nfd-gc. - nfd_gc_object_delete_failures_total: number of errors encountered when deleting NodeFeature and NodeResourceTopology objects.
This commit is contained in:
parent
44b26e39e4
commit
98c3b0750d
8 changed files with 89 additions and 3 deletions
|
@ -83,6 +83,8 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
|
|||
"interval between cleanup of obsolete api objects")
|
||||
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
|
||||
"Kubeconfig to use")
|
||||
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
|
||||
"Port on which to expose metrics.")
|
||||
|
||||
klog.InitFlags(flagset)
|
||||
|
||||
|
|
|
@ -21,3 +21,6 @@ spec:
|
|||
imagePullPolicy: Always
|
||||
command:
|
||||
- "nfd-gc"
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 8081
|
||||
|
|
|
@ -58,6 +58,9 @@ spec:
|
|||
drop: [ "ALL" ]
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: {{ .Values.gc.metricsPort | default "8081"}}
|
||||
|
||||
{{- with .Values.gc.nodeSelector }}
|
||||
nodeSelector:
|
||||
|
|
|
@ -495,6 +495,8 @@ gc:
|
|||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
|
||||
metricsPort: 8081
|
||||
|
||||
nodeSelector: {}
|
||||
tolerations: []
|
||||
annotations: {}
|
||||
|
|
|
@ -203,6 +203,7 @@ API's you need to install the prometheus operator in your cluster.
|
|||
| `gc.interval` | string | 1h | Time between periodic garbage collector runs
|
||||
| `gc.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings
|
||||
| `gc.resources` | dict | {} | Garbage collector pod [resources management](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/)
|
||||
| `gc.metricsPort` | integer | 8081 | Port on which to serve Prometheus metrics
|
||||
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector)
|
||||
| `gc.tolerations` | array | [] | Garbage collector pod [node tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
|
||||
| `gc.annotations` | dict | {} | Garbage collector pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/)
|
||||
|
|
|
@ -17,6 +17,7 @@ The exposed metrics are
|
|||
| ------------------------------------------------- | --------- | ---------------------------------------
|
||||
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
|
||||
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
|
||||
| `nfd_gc_build_info` | Gauge | Version from which nfd-gc was built
|
||||
| `nfd_topology_updater_build_info` | Gauge | Version from which nfd-topology-updater was built
|
||||
| `nfd_node_update_requests_total` | Counter | Number of node update requests received by the master over gRPC
|
||||
| `nfd_node_updates_total` | Counter | Number of nodes updated
|
||||
|
@ -28,6 +29,8 @@ The exposed metrics are
|
|||
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
|
||||
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
|
||||
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.
|
||||
| `nfd_gc_objects_deleted_total` | Counter | Number of NodeFeature and NodeResourceTopology objects garbage collected.
|
||||
| `nfd_gc_object_delete_failures_total` | Counter | Number of errors in deleting NodeFeature and NodeResourceTopology objects.
|
||||
|
||||
## Via Kustomize
|
||||
|
||||
|
|
54
pkg/nfd-gc/metrics.go
Normal file
54
pkg/nfd-gc/metrics.go
Normal file
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
Copyright 2023 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nfdgarbagecollector
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||
)
|
||||
|
||||
// Names under which the nfd-gc metrics are exported.
//
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
	// buildInfoQuery is the name of the build information gauge.
	buildInfoQuery = "nfd_gc_build_info"
	// objectsDeletedQuery is the name of the counter of deleted objects.
	objectsDeletedQuery = "nfd_gc_objects_deleted_total"
	// objectDeleteErrorsQuery is the name of the counter of failed deletions.
	objectDeleteErrorsQuery = "nfd_gc_object_delete_failures_total"
)
|
||||
|
||||
var (
|
||||
buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: buildInfoQuery,
|
||||
Help: "Version from which Node Feature Discovery was built.",
|
||||
ConstLabels: map[string]string{
|
||||
"version": version.Get(),
|
||||
},
|
||||
})
|
||||
objectsDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: objectsDeletedQuery,
|
||||
Help: "Number of NodeFeature and NodeResourceTopology objects garbage collected."},
|
||||
[]string{"kind"},
|
||||
)
|
||||
objectDeleteErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: objectDeleteErrorsQuery,
|
||||
Help: "Number of errors in deleting NodeFeature and NodeResourceTopology objects."},
|
||||
[]string{"kind"},
|
||||
)
|
||||
)
|
||||
|
||||
// registerVersion exposes the Operator build version.
|
||||
func registerVersion(version string) {
|
||||
buildInfo.SetToCurrentTime()
|
||||
}
|
|
@ -34,13 +34,15 @@ import (
|
|||
"sigs.k8s.io/node-feature-discovery/pkg/apihelper"
|
||||
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
|
||||
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/utils"
|
||||
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
||||
)
|
||||
|
||||
// Args are the command line arguments
|
||||
type Args struct {
|
||||
GCPeriod time.Duration
|
||||
|
||||
Kubeconfig string
|
||||
GCPeriod time.Duration
|
||||
Kubeconfig string
|
||||
MetricsPort int
|
||||
}
|
||||
|
||||
type NfdGarbageCollector interface {
|
||||
|
@ -74,29 +76,35 @@ func New(args *Args) (NfdGarbageCollector, error) {
|
|||
}
|
||||
|
||||
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
|
||||
kind := "NodeFeature"
|
||||
if err := n.nfdClient.NfdV1alpha1().NodeFeatures(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
|
||||
if errors.IsNotFound(err) {
|
||||
klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name))
|
||||
return
|
||||
} else {
|
||||
klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name))
|
||||
objectDeleteErrors.WithLabelValues(kind).Inc()
|
||||
return
|
||||
}
|
||||
}
|
||||
klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name))
|
||||
objectsDeleted.WithLabelValues(kind).Inc()
|
||||
}
|
||||
|
||||
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
|
||||
kind := "NodeResourceTopology"
|
||||
if err := n.topoClient.TopologyV1alpha2().NodeResourceTopologies().Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil {
|
||||
if errors.IsNotFound(err) {
|
||||
klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
|
||||
return
|
||||
} else {
|
||||
klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
|
||||
objectDeleteErrors.WithLabelValues(kind).Inc()
|
||||
return
|
||||
}
|
||||
}
|
||||
klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
|
||||
objectsDeleted.WithLabelValues(kind).Inc()
|
||||
}
|
||||
|
||||
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
|
||||
|
@ -208,6 +216,16 @@ func (n *nfdGarbageCollector) startNodeInformer() error {
|
|||
|
||||
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
|
||||
func (n *nfdGarbageCollector) Run() error {
|
||||
if n.args.MetricsPort > 0 {
|
||||
m := utils.CreateMetricsServer(n.args.MetricsPort,
|
||||
buildInfo,
|
||||
objectsDeleted,
|
||||
objectDeleteErrors)
|
||||
go m.Run()
|
||||
registerVersion(version.Get())
|
||||
defer m.Stop()
|
||||
}
|
||||
|
||||
if err := n.startNodeInformer(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue