2023-01-03 13:23:46 +01:00
|
|
|
/*
|
|
|
|
Copyright 2023 The Kubernetes Authors.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
package nfdgarbagecollector
|
2023-01-03 13:23:46 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2024-07-26 10:29:15 +03:00
|
|
|
"fmt"
|
2023-01-03 13:23:46 +01:00
|
|
|
"time"
|
|
|
|
|
2024-07-26 15:12:09 +03:00
|
|
|
topologyv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
|
2023-01-03 13:23:46 +01:00
|
|
|
corev1 "k8s.io/api/core/v1"
|
|
|
|
"k8s.io/apimachinery/pkg/api/errors"
|
|
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
2023-08-17 18:16:36 +03:00
|
|
|
"k8s.io/apimachinery/pkg/labels"
|
2024-07-26 17:05:14 +03:00
|
|
|
"k8s.io/apimachinery/pkg/runtime/schema"
|
2023-01-03 13:23:46 +01:00
|
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
2024-07-26 15:12:09 +03:00
|
|
|
metadataclient "k8s.io/client-go/metadata"
|
|
|
|
"k8s.io/client-go/metadata/metadatainformer"
|
2023-01-03 13:23:46 +01:00
|
|
|
"k8s.io/client-go/tools/cache"
|
|
|
|
"k8s.io/klog/v2"
|
|
|
|
|
2024-02-27 14:42:23 +01:00
|
|
|
nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/api/nfd/v1alpha1"
|
2023-10-09 13:06:07 +03:00
|
|
|
"sigs.k8s.io/node-feature-discovery/pkg/utils"
|
|
|
|
"sigs.k8s.io/node-feature-discovery/pkg/version"
|
2023-01-03 13:23:46 +01:00
|
|
|
)
|
|
|
|
|
2024-07-26 15:12:09 +03:00
|
|
|
var (
|
|
|
|
gvrNF = nfdv1alpha1.SchemeGroupVersion.WithResource("nodefeatures")
|
|
|
|
gvrNRT = topologyv1alpha2.SchemeGroupVersion.WithResource("noderesourcetopologies")
|
|
|
|
gvrNode = corev1.SchemeGroupVersion.WithResource("nodes")
|
|
|
|
)
|
|
|
|
|
2023-01-03 13:23:46 +01:00
|
|
|
// Args holds the command line arguments of the garbage collector.
type Args struct {
	// GCPeriod is the interval between two periodic garbage collection rounds.
	GCPeriod time.Duration
	// Kubeconfig is handed to utils.GetKubeconfig to build the client configuration.
	Kubeconfig string
	// MetricsPort is the port of the metrics server. The server is only
	// started when the value is greater than zero.
	MetricsPort int
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
// NfdGarbageCollector is the interface of the NFD garbage collector.
type NfdGarbageCollector interface {
	// Run starts the garbage collector. It returns an error if startup fails.
	Run() error
	// Stop signals the garbage collector to shut down.
	Stop()
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
type nfdGarbageCollector struct {
|
2024-07-26 15:12:09 +03:00
|
|
|
args *Args
|
|
|
|
stopChan chan struct{}
|
|
|
|
client metadataclient.Interface
|
|
|
|
factory metadatainformer.SharedInformerFactory
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
func New(args *Args) (NfdGarbageCollector, error) {
|
2024-01-22 13:42:20 +02:00
|
|
|
kubeconfig, err := utils.GetKubeconfig(args.Kubeconfig)
|
2023-01-03 13:23:46 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2024-07-26 15:12:09 +03:00
|
|
|
cli := metadataclient.NewForConfigOrDie(kubeconfig)
|
2023-01-03 13:23:46 +01:00
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
return &nfdGarbageCollector{
|
2024-07-26 15:12:09 +03:00
|
|
|
args: args,
|
|
|
|
stopChan: make(chan struct{}),
|
|
|
|
client: cli,
|
|
|
|
factory: metadatainformer.NewSharedInformerFactory(cli, 0),
|
2023-01-03 13:23:46 +01:00
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
2023-08-17 20:01:14 +03:00
|
|
|
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
|
2023-10-09 13:06:07 +03:00
|
|
|
kind := "NodeFeature"
|
2024-07-26 15:12:09 +03:00
|
|
|
if err := n.client.Resource(gvrNF).Namespace(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
|
2023-08-17 20:01:14 +03:00
|
|
|
if errors.IsNotFound(err) {
|
|
|
|
klog.V(2).InfoS("NodeFeature not found, omitting deletion", "nodefeature", klog.KRef(namespace, name))
|
|
|
|
return
|
|
|
|
} else {
|
|
|
|
klog.ErrorS(err, "failed to delete NodeFeature object", "nodefeature", klog.KRef(namespace, name))
|
2023-10-09 13:06:07 +03:00
|
|
|
objectDeleteErrors.WithLabelValues(kind).Inc()
|
2023-08-17 20:01:14 +03:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
klog.InfoS("NodeFeature object has been deleted", "nodefeature", klog.KRef(namespace, name))
|
2023-10-09 13:06:07 +03:00
|
|
|
objectsDeleted.WithLabelValues(kind).Inc()
|
2023-08-17 20:01:14 +03:00
|
|
|
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) deleteNRT(nodeName string) {
|
2023-10-09 13:06:07 +03:00
|
|
|
kind := "NodeResourceTopology"
|
2024-07-26 15:12:09 +03:00
|
|
|
if err := n.client.Resource(gvrNRT).Delete(context.TODO(), nodeName, metav1.DeleteOptions{}); err != nil {
|
2023-01-03 13:23:46 +01:00
|
|
|
if errors.IsNotFound(err) {
|
2023-05-03 11:32:53 +03:00
|
|
|
klog.V(2).InfoS("NodeResourceTopology not found, omitting deletion", "nodeName", nodeName)
|
2023-01-03 13:23:46 +01:00
|
|
|
return
|
|
|
|
} else {
|
2023-05-03 11:32:53 +03:00
|
|
|
klog.ErrorS(err, "failed to delete NodeResourceTopology object", "nodeName", nodeName)
|
2023-10-09 13:06:07 +03:00
|
|
|
objectDeleteErrors.WithLabelValues(kind).Inc()
|
2023-01-03 13:23:46 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
2023-05-03 11:32:53 +03:00
|
|
|
klog.InfoS("NodeResourceTopology object has been deleted", "nodeName", nodeName)
|
2023-10-09 13:06:07 +03:00
|
|
|
objectsDeleted.WithLabelValues(kind).Inc()
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) deleteNodeHandler(object interface{}) {
|
2023-01-03 13:23:46 +01:00
|
|
|
// handle a case when we are starting up and need to clear stale NRT resources
|
|
|
|
obj := object
|
|
|
|
if deletedFinalStateUnknown, ok := object.(cache.DeletedFinalStateUnknown); ok {
|
2023-05-03 11:32:53 +03:00
|
|
|
klog.V(2).InfoS("found stale NodeResourceTopology object", "object", object)
|
2023-01-03 13:23:46 +01:00
|
|
|
obj = deletedFinalStateUnknown.Obj
|
|
|
|
}
|
|
|
|
|
2024-07-26 15:12:09 +03:00
|
|
|
meta, ok := obj.(*metav1.PartialObjectMetadata)
|
2023-01-03 13:23:46 +01:00
|
|
|
if !ok {
|
2024-07-26 15:12:09 +03:00
|
|
|
klog.InfoS("cannot convert object to metav1.ObjectMeta", "object", object)
|
2023-01-03 13:23:46 +01:00
|
|
|
return
|
|
|
|
}
|
2024-07-26 15:12:09 +03:00
|
|
|
nodeName := meta.ObjectMeta.GetName()
|
2023-01-03 13:23:46 +01:00
|
|
|
|
2024-07-26 15:12:09 +03:00
|
|
|
n.deleteNRT(nodeName)
|
2023-08-17 20:01:14 +03:00
|
|
|
|
|
|
|
// Delete all NodeFeature objects (from all namespaces) targeting the deleted node
|
2024-07-26 15:12:09 +03:00
|
|
|
nfListOptions := metav1.ListOptions{LabelSelector: nfdv1alpha1.NodeFeatureObjNodeNameLabel + "=" + nodeName}
|
|
|
|
if nfs, err := n.client.Resource(gvrNF).List(context.TODO(), nfListOptions); err != nil {
|
2023-08-17 20:01:14 +03:00
|
|
|
klog.ErrorS(err, "failed to list NodeFeature objects")
|
|
|
|
} else {
|
|
|
|
for _, nf := range nfs.Items {
|
|
|
|
n.deleteNodeFeature(nf.Namespace, nf.Name)
|
|
|
|
}
|
|
|
|
}
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
2023-08-17 18:09:14 +03:00
|
|
|
// garbageCollect removes all stale API objects
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) garbageCollect() {
|
2023-08-17 18:09:14 +03:00
|
|
|
klog.InfoS("performing garbage collection")
|
2024-07-26 15:12:09 +03:00
|
|
|
objs, err := n.factory.ForResource(gvrNode).Lister().List(labels.Everything())
|
2023-08-17 18:16:36 +03:00
|
|
|
if err != nil {
|
|
|
|
klog.ErrorS(err, "failed to list Node objects")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
nodeNames := sets.NewString()
|
2024-07-26 15:12:09 +03:00
|
|
|
for _, obj := range objs {
|
|
|
|
meta := obj.(*metav1.PartialObjectMetadata).ObjectMeta
|
|
|
|
nodeNames.Insert(meta.Name)
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
2024-07-26 17:05:14 +03:00
|
|
|
listAndHandle := func(gvr schema.GroupVersionResource, handler func(metav1.PartialObjectMetadata)) {
|
|
|
|
opts := metav1.ListOptions{
|
|
|
|
Limit: 200,
|
|
|
|
}
|
|
|
|
for {
|
|
|
|
rsp, err := n.client.Resource(gvr).List(context.TODO(), opts)
|
|
|
|
if errors.IsNotFound(err) {
|
|
|
|
klog.V(2).InfoS("resource does not exist", "resource", gvr)
|
|
|
|
break
|
|
|
|
} else if err != nil {
|
|
|
|
klog.ErrorS(err, "failed to list objects", "resource", gvr)
|
|
|
|
break
|
2023-08-17 20:01:14 +03:00
|
|
|
}
|
2024-07-26 17:05:14 +03:00
|
|
|
for _, item := range rsp.Items {
|
|
|
|
handler(item)
|
2023-08-17 20:01:14 +03:00
|
|
|
}
|
2024-07-26 17:05:14 +03:00
|
|
|
|
|
|
|
if rsp.ListMeta.Continue == "" {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
opts.Continue = rsp.ListMeta.Continue
|
2023-08-17 20:01:14 +03:00
|
|
|
}
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
2024-07-26 17:05:14 +03:00
|
|
|
// Handle NodeFeature objects
|
|
|
|
listAndHandle(gvrNF, func(meta metav1.PartialObjectMetadata) {
|
|
|
|
nodeName, ok := meta.GetLabels()[nfdv1alpha1.NodeFeatureObjNodeNameLabel]
|
|
|
|
if !ok {
|
|
|
|
klog.InfoS("node name label missing from NodeFeature object", "nodefeature", klog.KObj(&meta))
|
|
|
|
}
|
|
|
|
if !nodeNames.Has(nodeName) {
|
|
|
|
n.deleteNodeFeature(meta.Namespace, meta.Name)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
|
2023-08-17 20:01:14 +03:00
|
|
|
// Handle NodeResourceTopology objects
|
2024-07-26 17:05:14 +03:00
|
|
|
listAndHandle(gvrNRT, func(meta metav1.PartialObjectMetadata) {
|
|
|
|
if !nodeNames.Has(meta.Name) {
|
|
|
|
n.deleteNRT(meta.Name)
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
2024-07-26 17:05:14 +03:00
|
|
|
})
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// periodicGC runs garbage collector at every gcPeriod to make sure we haven't missed any node
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) periodicGC(gcPeriod time.Duration) {
|
2023-08-17 17:58:16 +03:00
|
|
|
// Do initial round of garbage collection at startup time
|
|
|
|
n.garbageCollect()
|
|
|
|
|
2023-01-03 13:23:46 +01:00
|
|
|
gcTrigger := time.NewTicker(gcPeriod)
|
2023-07-04 13:57:10 +08:00
|
|
|
defer gcTrigger.Stop()
|
2023-01-03 13:23:46 +01:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-gcTrigger.C:
|
2023-08-17 18:09:14 +03:00
|
|
|
n.garbageCollect()
|
2023-01-03 13:23:46 +01:00
|
|
|
case <-n.stopChan:
|
2023-05-03 11:32:53 +03:00
|
|
|
klog.InfoS("shutting down periodic Garbage Collector")
|
2023-01-03 13:23:46 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) startNodeInformer() error {
|
2024-07-26 15:12:09 +03:00
|
|
|
nodeInformer := n.factory.ForResource(gvrNode).Informer()
|
2023-01-03 13:23:46 +01:00
|
|
|
|
|
|
|
if _, err := nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
|
|
|
|
DeleteFunc: n.deleteNodeHandler,
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// start informers
|
|
|
|
n.factory.Start(n.stopChan)
|
2024-07-26 10:29:15 +03:00
|
|
|
|
|
|
|
start := time.Now()
|
|
|
|
ret := n.factory.WaitForCacheSync(n.stopChan)
|
|
|
|
for res, ok := range ret {
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("node informer cache failed to sync (%s)", res)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
klog.InfoS("node informer cache synced", "duration", time.Since(start))
|
2023-01-03 13:23:46 +01:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Run is a blocking function that removes stale NRT objects when Node is deleted and runs periodic GC to make sure any obsolete objects are removed
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) Run() error {
|
2023-10-09 13:06:07 +03:00
|
|
|
if n.args.MetricsPort > 0 {
|
|
|
|
m := utils.CreateMetricsServer(n.args.MetricsPort,
|
|
|
|
buildInfo,
|
|
|
|
objectsDeleted,
|
|
|
|
objectDeleteErrors)
|
|
|
|
go m.Run()
|
|
|
|
registerVersion(version.Get())
|
|
|
|
defer m.Stop()
|
|
|
|
}
|
|
|
|
|
2023-08-17 17:48:21 +03:00
|
|
|
if err := n.startNodeInformer(); err != nil {
|
2023-01-03 13:23:46 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
// run periodic GC
|
2023-10-09 11:48:49 +03:00
|
|
|
n.periodicGC(n.args.GCPeriod)
|
2023-01-03 13:23:46 +01:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-08-17 16:53:06 +03:00
|
|
|
func (n *nfdGarbageCollector) Stop() {
|
2023-08-18 16:23:36 +03:00
|
|
|
close(n.stopChan)
|
2023-01-03 13:23:46 +01:00
|
|
|
}
|