From 3e0f90fe66591530024eb6b88bba75e4b49610eb Mon Sep 17 00:00:00 2001 From: Adam Janikowski <12255597+ajanikow@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:28:18 +0100 Subject: [PATCH] [Bugfix] Wait for ImageStatus (#1602) --- CHANGELOG.md | 1 + README.md | 5 ++++- cmd/cmd.go | 39 ++++++++++++++++++++++++++++++++++---- pkg/util/k8sutil/images.go | 15 ++++++++++++--- 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c2c81c8b..afba1c05d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - (Feature) Extract Scheduler API - (Bugfix) Fix Image Discovery - (Bugfix) Fix Resources Copy mechanism to prevent invalid pod creation +- (Bugfix) Wait for ImageStatus in ImageDiscover ## [1.2.38](https://github.com/arangodb/kube-arangodb/tree/1.2.38) (2024-02-22) - (Feature) Extract GRPC Server diff --git a/README.md b/README.md index 368f72df2..e7e9cc7a6 100644 --- a/README.md +++ b/README.md @@ -161,12 +161,15 @@ Flags: --deployment.feature.upgrade-version-check-v2 Enable initContainer with pre version check based by Operator - Required ArangoDB 3.8.0 or higher --features-config-map-name string Name of the Feature Map ConfigMap (default "arangodb-operator-feature-config-map") -h, --help help for arangodb_operator + --image.discovery.status Discover Operator Image from Pod Status by default. When disabled Pod Spec is used. (default true) + --image.discovery.timeout duration Timeout for image discovery process (default 1m0s) --internal.scaling-integration Enable Scaling Integration --kubernetes.burst int Burst for the k8s API (default 30) --kubernetes.max-batch-size int Size of batch during objects read (default 256) --kubernetes.qps float32 Number of queries per second for k8s API (default 15) --log.format string Set log format. Allowed values: 'pretty', 'JSON'. If empty, default format is used (default "pretty") - --log.level stringArray Set log levels in format or =. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [debug]) + --log.level stringArray Set log levels in format or =. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [info]) + --log.sampling If true, operator will try to minimize duplication of logging events (default true) --memory-limit uint Define memory limit for hard shutdown and the dump of goroutines. Used for testing --metrics.excluded-prefixes stringArray List of the excluded metrics prefixes --mode.single Enable single mode in Operator. WARNING: There should be only one replica of Operator, otherwise Operator can take unexpected actions diff --git a/cmd/cmd.go b/cmd/cmd.go index 6058de8f0..5c7f31ef9 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -161,6 +161,10 @@ var ( backupArangoD time.Duration backupUploadArangoD time.Duration } + operatorImageDiscovery struct { + timeout time.Duration + defaultStatusDiscovery bool + } operatorReconciliationRetry struct { delay time.Duration count int @@ -241,6 +245,8 @@ func init() { f.IntVar(&operatorBackup.concurrentUploads, "backup-concurrent-uploads", globals.DefaultBackupConcurrentUploads, "Number of concurrent uploads per deployment") f.Uint64Var(&memoryLimit.hardLimit, "memory-limit", 0, "Define memory limit for hard shutdown and the dump of goroutines. Used for testing") f.StringArrayVar(&metricsOptions.excludedMetricPrefixes, "metrics.excluded-prefixes", nil, "List of the excluded metrics prefixes") + f.BoolVar(&operatorImageDiscovery.defaultStatusDiscovery, "image.discovery.status", true, "Discover Operator Image from Pod Status by default. When disabled Pod Spec is used.") + f.DurationVar(&operatorImageDiscovery.timeout, "image.discovery.timeout", time.Minute, "Timeout for image discovery process") if err := features.Init(&cmdMain); err != nil { panic(err.Error()) } @@ -584,6 +590,20 @@ func newOperatorConfigAndDeps(id, namespace, name string) (operator.Config, oper // getMyPodInfo looks up the image & service account of the pod with given name in given namespace // Returns image, serviceAccount, error. func getMyPodInfo(kubecli kubernetes.Interface, namespace, name string) (string, string, error) { + if image, sa, ok := getMyPodInfoWrap(kubecli, namespace, name, getMyImageInfoFunc(operatorImageDiscovery.defaultStatusDiscovery)); ok { + return image, sa, nil + } + + logger.Warn("Unable to discover image, fallback to second method") + + if image, sa, ok := getMyPodInfoWrap(kubecli, namespace, name, getMyImageInfoFunc(!operatorImageDiscovery.defaultStatusDiscovery)); ok { + return image, sa, nil + } + + return "", "", errors.Errorf("Unable to discover image") +} + +func getMyPodInfoWrap(kubecli kubernetes.Interface, namespace, name string, imageFunc func(in *core.Pod) (string, bool)) (string, string, bool) { var image, sa string op := func() error { pod, err := kubecli.CoreV1().Pods(namespace).Get(context.Background(), name, meta.GetOptions{}) @@ -595,15 +615,26 @@ func getMyPodInfo(kubecli kubernetes.Interface, namespace, name string) (string, return errors.WithStack(err) } sa = pod.Spec.ServiceAccountName - if image, err = k8sutil.GetArangoDBImageIDFromPod(pod, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName)); err != nil { + if i, ok := imageFunc(pod); !ok { return errors.Wrap(err, "failed to get image ID from pod") + } else { + image = i } return nil } - if err := retry.Retry(op, time.Minute*5); err != nil { - return "", "", errors.WithStack(err) + if err := retry.Retry(op, operatorImageDiscovery.timeout/2); err == nil { + return image, sa, true + } + return "", "", false +} + +func getMyImageInfoFunc(status bool) func(pod *core.Pod) (string, bool) { + return func(pod *core.Pod) (string, bool) { + if status { + return k8sutil.GetArangoDBImageIDFromContainerStatuses(pod.Status.ContainerStatuses, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName)) + } + return k8sutil.GetArangoDBImageIDFromContainers(pod.Spec.Containers, shared.ServerContainerName, shared.OperatorContainerName, constants.MyContainerNameEnv.GetOrDefault(shared.OperatorContainerName)) } - return image, sa, nil } func createRecorder(kubecli kubernetes.Interface, name, namespace string) record.EventRecorder { diff --git a/pkg/util/k8sutil/images.go b/pkg/util/k8sutil/images.go index 5bcc89378..aab27e310 100644 --- a/pkg/util/k8sutil/images.go +++ b/pkg/util/k8sutil/images.go @@ -53,13 +53,16 @@ func GetArangoDBImageIDFromPod(pod *core.Pod, names ...string) (string, error) { if image, ok := GetArangoDBImageIDFromContainerStatuses(pod.Status.ContainerStatuses, names...); ok { return image, nil } + if image, ok := GetArangoDBImageIDFromContainers(pod.Spec.Containers, names...); ok { return image, nil } if cs := pod.Status.ContainerStatuses; len(cs) > 0 { if image := cs[0].ImageID; image != "" { - return ConvertImageID2Image(image), nil + if disc := ConvertImageID2Image(image); disc != "" { + return disc, nil + } } } if cs := pod.Spec.Containers; len(cs) > 0 { @@ -75,7 +78,11 @@ func GetArangoDBImageIDFromPod(pod *core.Pod, names ...string) (string, error) { func GetArangoDBImageIDFromContainerStatuses(containers []core.ContainerStatus, names ...string) (string, bool) { for _, name := range names { if id := container.GetContainerStatusIDByName(containers, name); id != -1 { - return ConvertImageID2Image(containers[id].ImageID), true + if image := containers[id].ImageID; image != "" { + if disc := ConvertImageID2Image(image); disc != "" { + return disc, true + } + } } } @@ -86,7 +93,9 @@ func GetArangoDBImageIDFromContainerStatuses(containers []core.ContainerStatus, func GetArangoDBImageIDFromContainers(containers []core.Container, names ...string) (string, bool) { for _, name := range names { if id := container.GetContainerIDByName(containers, name); id != -1 { - return containers[id].Image, true + if image := containers[id].Image; image != "" { + return image, true + } } }