diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71e6beb2f..6d24bd9da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 ## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
 - Add support for spec.ClusterDomain to be able to use FQDN in ArangoDB cluster communication
+- Add Version Check feature with extended upgrade checks
+- Fix recovery from upgrade failures
 
 ## [1.1.3](https://github.com/arangodb/kube-arangodb/tree/1.1.3) (2020-12-16)
 - Add v2alpha1 API for ArangoDeployment and ArangoDeploymentReplication
diff --git a/README.md b/README.md
index 6f624d7ea..ee698a027 100644
--- a/README.md
+++ b/README.md
@@ -54,28 +54,30 @@ covers individual newer features separately.
 
 Feature-wise production readiness table:
 
-| Feature | Operator Version | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
-|-----------------------------------------|------------------|------------------|-----------------------|------------|---------|------------------------------------------|--------------------------------------------------------------------------|
-| Pod Disruption Budgets | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
-| Pod Disruption Budgets | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
-| Volume Resizing | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
-| Volume Resizing | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
-| Disabling of liveness probes | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
-| Disabling of liveness probes | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
-| Volume Claim Templates | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
-| Volume Claim Templates | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | N/A |
-| Prometheus Metrics Exporter | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | Prometheus required |
-| Prometheus Metrics Exporter | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | Prometheus required |
-| Sidecar Containers | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
-| Sidecar Containers | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | N/A |
-| Operator Single Mode | 1.0.4 | Any | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
-| TLS SNI Support | 1.0.3 | >= 3.7.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
-| TLS Runtime Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.tls-rotation | N/A |
-| TLS Runtime Rotation Support | 1.1.0 | > 3.7.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
-| JWT Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.jwt-rotation | N/A |
-| JWT Rotation Support | 1.1.0 | > 3.7.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
-| Encryption Key Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.encryption-rotation | N/A |
-| Operator Maintenance Management Support | 1.0.7 | >= 3.5.0 | Community, Enterprise | Alpha | False | --deployment.feature.maintenance | N/A |
+| Feature | Operator Version | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
+|-----------------------------------------|------------------|------------------|-----------------------|------------|---------|--------------------------------------------|--------------------------------------------------------------------------|
+| Pod Disruption Budgets | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
+| Pod Disruption Budgets | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
+| Volume Resizing | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
+| Volume Resizing | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
+| Disabling of liveness probes | 0.3.10 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
+| Disabling of liveness probes | 0.3.11 | Any | Community, Enterprise | Production | True | N/A | N/A |
+| Volume Claim Templates | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
+| Volume Claim Templates | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | N/A |
+| Prometheus Metrics Exporter | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | Prometheus required |
+| Prometheus Metrics Exporter | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | Prometheus required |
+| Sidecar Containers | 0.3.11 | Any | Community, Enterprise | Alpha | True | N/A | N/A |
+| Sidecar Containers | 1.0.0 | Any | Community, Enterprise | Production | True | N/A | N/A |
+| Operator Single Mode | 1.0.4 | Any | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
+| TLS SNI Support | 1.0.3 | >= 3.7.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
+| TLS Runtime Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.tls-rotation | N/A |
+| TLS Runtime Rotation Support | 1.1.0 | > 3.7.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
+| JWT Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.jwt-rotation | N/A |
+| JWT Rotation Support | 1.1.0 | > 3.7.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
+| Encryption Key Rotation Support | 1.0.4 | > 3.7.0 | Enterprise | Alpha | False | --deployment.feature.encryption-rotation | N/A |
+| Encryption Key Rotation Support | 1.1.0 | > 3.7.0 | Enterprise | Production | True | --deployment.feature.encryption-rotation | N/A |
+| Version Check | 1.1.4 | >= 3.5.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check | N/A |
+| Operator Maintenance Management Support | 1.0.7 | >= 3.5.0 | Community, Enterprise | Alpha | False | --deployment.feature.maintenance | N/A |
 
 ## Release notes for 0.3.16
diff --git a/pkg/apis/deployment/v1/conditions.go b/pkg/apis/deployment/v1/conditions.go
index 213340b3b..0e1ebc1a9 100644
--- a/pkg/apis/deployment/v1/conditions.go
+++ b/pkg/apis/deployment/v1/conditions.go
@@ -63,6 +63,8 @@ const (
 	ConditionTypeUpToDate ConditionType = "UpToDate"
 	// ConditionTypeMarkedToRemove indicates that the member is marked to be removed.
 	ConditionTypeMarkedToRemove ConditionType = "MarkedToRemove"
+	// ConditionTypeUpgradeFailed indicates that the member upgrade failed.
+	ConditionTypeUpgradeFailed ConditionType = "UpgradeFailed"
 )
 
 // Condition represents one current condition of a deployment or deployment member.
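Review sketch (editor's note, not part of the patch): the intended lifecycle of the new `UpgradeFailed` condition, using only the `Conditions` helpers that appear later in this diff. The pod inspector sets it when the `upgrade` init container fails, the upgrade action reads it to trigger a rollback, and pod creation clears it again.

```go
package deployment

import (
	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
)

// upgradeFailedLifecycle is illustrative only; the real logic is spread
// across pod_inspector.go, action_upgrade_member.go and pod_creator.go.
func upgradeFailedLifecycle(m *api.MemberStatus) {
	// 1. Pod inspector: mark the failed upgrade attempt.
	m.Conditions.Update(api.ConditionTypeUpgradeFailed, true, "Upgrade Failed", "")

	// 2. Upgrade action: detect the failure and roll back to the previous image.
	if m.Conditions.IsTrue(api.ConditionTypeUpgradeFailed) {
		if m.OldImage != nil {
			m.Image = m.OldImage.DeepCopy()
		}
		m.Conditions.Remove(api.ConditionTypeUpgradeFailed)
	}
}
```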
diff --git a/pkg/apis/deployment/v1/image_info.go b/pkg/apis/deployment/v1/image_info.go
index 3730080f8..4c01bc6ef 100644
--- a/pkg/apis/deployment/v1/image_info.go
+++ b/pkg/apis/deployment/v1/image_info.go
@@ -22,7 +22,11 @@
 
 package v1
 
-import driver "github.com/arangodb/go-driver"
+import (
+	"fmt"
+
+	driver "github.com/arangodb/go-driver"
+)
 
 // ImageInfo contains an ID of an image and the ArangoDB version inside the image.
 type ImageInfo struct {
@@ -32,6 +36,20 @@ type ImageInfo struct {
 	Enterprise bool `json:"enterprise,omitempty"` // If set, this is an enterprise image
 }
 
+func (i *ImageInfo) String() string {
+	if i == nil {
+		return "undefined"
+	}
+
+	e := "Community"
+
+	if i.Enterprise {
+		e = "Enterprise"
+	}
+
+	return fmt.Sprintf("ArangoDB %s %s (%s)", e, string(i.ArangoDBVersion), i.Image)
+}
+
 // ImageInfoList is a list of image infos
 type ImageInfoList []ImageInfo
diff --git a/pkg/apis/deployment/v1/member_status.go b/pkg/apis/deployment/v1/member_status.go
index 180a5c207..0b1065a07 100644
--- a/pkg/apis/deployment/v1/member_status.go
+++ b/pkg/apis/deployment/v1/member_status.go
@@ -69,6 +69,8 @@ type MemberStatus struct {
 	ImageID string `json:"image-id,omitempty"`
 	// Image holds image details
 	Image *ImageInfo `json:"image,omitempty"`
+	// OldImage holds old image details
+	OldImage *ImageInfo `json:"old-image,omitempty"`
 	// Upgrade define if upgrade should be enforced during next execution
 	Upgrade bool `json:"upgrade,omitempty"`
 	// Endpoint definition how member should be reachable
@@ -89,6 +91,7 @@ func (s MemberStatus) Equal(other MemberStatus) bool {
 		s.ArangoVersion == other.ArangoVersion &&
 		s.ImageID == other.ImageID &&
 		s.Image.Equal(other.Image) &&
+		s.OldImage.Equal(other.OldImage) &&
 		s.Upgrade == other.Upgrade &&
 		util.CompareStringPointers(s.Endpoint, other.Endpoint)
 }
diff --git a/pkg/apis/deployment/v1/server_group_init_containers.go b/pkg/apis/deployment/v1/server_group_init_containers.go
index 983eb3487..262dbb3a2 100644
--- a/pkg/apis/deployment/v1/server_group_init_containers.go
+++ b/pkg/apis/deployment/v1/server_group_init_containers.go
@@ -31,14 +31,15 @@ import (
 )
 
 const (
-	ServerGroupReservedInitContainerNameLifecycle = "init-lifecycle"
-	ServerGroupReservedInitContainerNameUUID      = "uuid"
-	ServerGroupReservedInitContainerNameUpgrade   = "upgrade"
+	ServerGroupReservedInitContainerNameLifecycle    = "init-lifecycle"
+	ServerGroupReservedInitContainerNameUUID         = "uuid"
+	ServerGroupReservedInitContainerNameUpgrade      = "upgrade"
+	ServerGroupReservedInitContainerNameVersionCheck = "version-check"
 )
 
 func IsReservedServerGroupInitContainerName(name string) bool {
 	switch name {
-	case ServerGroupReservedInitContainerNameLifecycle, ServerGroupReservedInitContainerNameUUID, ServerGroupReservedInitContainerNameUpgrade:
+	case ServerGroupReservedInitContainerNameLifecycle, ServerGroupReservedInitContainerNameUUID, ServerGroupReservedInitContainerNameUpgrade, ServerGroupReservedInitContainerNameVersionCheck:
 		return true
 	default:
 		return false
diff --git a/pkg/apis/deployment/v1/zz_generated.deepcopy.go b/pkg/apis/deployment/v1/zz_generated.deepcopy.go
index f14f9301f..14b54f9b0 100644
--- a/pkg/apis/deployment/v1/zz_generated.deepcopy.go
+++ b/pkg/apis/deployment/v1/zz_generated.deepcopy.go
@@ -850,6 +850,11 @@ func (in *MemberStatus) DeepCopyInto(out *MemberStatus) {
 		*out = new(ImageInfo)
 		**out = **in
 	}
+	if in.OldImage != nil {
+		in, out := &in.OldImage, &out.OldImage
+		*out = new(ImageInfo)
+		**out = **in
+	}
 	if in.Endpoint != nil {
 		in, out := &in.Endpoint, &out.Endpoint
 		*out = new(string)
diff --git a/pkg/apis/deployment/v2alpha1/conditions.go b/pkg/apis/deployment/v2alpha1/conditions.go
index da18827bd..c6f51bf5e 100644
--- a/pkg/apis/deployment/v2alpha1/conditions.go
+++ b/pkg/apis/deployment/v2alpha1/conditions.go
@@ -63,6 +63,8 @@ const (
 	ConditionTypeUpToDate ConditionType = "UpToDate"
 	// ConditionTypeMarkedToRemove indicates that the member is marked to be removed.
 	ConditionTypeMarkedToRemove ConditionType = "MarkedToRemove"
+	// ConditionTypeUpgradeFailed indicates that the member upgrade failed.
+	ConditionTypeUpgradeFailed ConditionType = "UpgradeFailed"
 )
 
 // Condition represents one current condition of a deployment or deployment member.
diff --git a/pkg/apis/deployment/v2alpha1/image_info.go b/pkg/apis/deployment/v2alpha1/image_info.go
index d7842c331..251c39687 100644
--- a/pkg/apis/deployment/v2alpha1/image_info.go
+++ b/pkg/apis/deployment/v2alpha1/image_info.go
@@ -22,7 +22,11 @@
 
 package v2alpha1
 
-import driver "github.com/arangodb/go-driver"
+import (
+	"fmt"
+
+	driver "github.com/arangodb/go-driver"
+)
 
 // ImageInfo contains an ID of an image and the ArangoDB version inside the image.
 type ImageInfo struct {
@@ -32,6 +36,20 @@ type ImageInfo struct {
 	Enterprise bool `json:"enterprise,omitempty"` // If set, this is an enterprise image
 }
 
+func (i *ImageInfo) String() string {
+	if i == nil {
+		return "undefined"
+	}
+
+	e := "Community"
+
+	if i.Enterprise {
+		e = "Enterprise"
+	}
+
+	return fmt.Sprintf("ArangoDB %s %s (%s)", e, string(i.ArangoDBVersion), i.Image)
+}
+
 // ImageInfoList is a list of image infos
 type ImageInfoList []ImageInfo
diff --git a/pkg/apis/deployment/v2alpha1/member_status.go b/pkg/apis/deployment/v2alpha1/member_status.go
index 5099c4cbe..595b15eee 100644
--- a/pkg/apis/deployment/v2alpha1/member_status.go
+++ b/pkg/apis/deployment/v2alpha1/member_status.go
@@ -69,6 +69,8 @@ type MemberStatus struct {
 	ImageID string `json:"image-id,omitempty"`
 	// Image holds image details
 	Image *ImageInfo `json:"image,omitempty"`
+	// OldImage holds old image details
+	OldImage *ImageInfo `json:"old-image,omitempty"`
 	// Upgrade define if upgrade should be enforced during next execution
 	Upgrade bool `json:"upgrade,omitempty"`
 	// Endpoint definition how member should be reachable
@@ -89,6 +91,7 @@ func (s MemberStatus) Equal(other MemberStatus) bool {
 		s.ArangoVersion == other.ArangoVersion &&
 		s.ImageID == other.ImageID &&
 		s.Image.Equal(other.Image) &&
+		s.OldImage.Equal(other.OldImage) &&
 		s.Upgrade == other.Upgrade &&
 		util.CompareStringPointers(s.Endpoint, other.Endpoint)
 }
diff --git a/pkg/apis/deployment/v2alpha1/server_group_init_containers.go b/pkg/apis/deployment/v2alpha1/server_group_init_containers.go
index 62f1a11ab..73332e55d 100644
--- a/pkg/apis/deployment/v2alpha1/server_group_init_containers.go
+++ b/pkg/apis/deployment/v2alpha1/server_group_init_containers.go
@@ -31,14 +31,15 @@ import (
 )
 
 const (
-	ServerGroupReservedInitContainerNameLifecycle = "init-lifecycle"
-	ServerGroupReservedInitContainerNameUUID      = "uuid"
-	ServerGroupReservedInitContainerNameUpgrade   = "upgrade"
+	ServerGroupReservedInitContainerNameLifecycle    = "init-lifecycle"
+	ServerGroupReservedInitContainerNameUUID         = "uuid"
+	ServerGroupReservedInitContainerNameUpgrade      = "upgrade"
+	ServerGroupReservedInitContainerNameVersionCheck = "version-check"
 )
 
 func IsReservedServerGroupInitContainerName(name string) bool {
 	switch name {
-	case ServerGroupReservedInitContainerNameLifecycle, ServerGroupReservedInitContainerNameUUID, ServerGroupReservedInitContainerNameUpgrade:
+	case ServerGroupReservedInitContainerNameLifecycle, ServerGroupReservedInitContainerNameUUID, ServerGroupReservedInitContainerNameUpgrade, ServerGroupReservedInitContainerNameVersionCheck:
 		return true
 	default:
 		return false
diff --git a/pkg/apis/deployment/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/deployment/v2alpha1/zz_generated.deepcopy.go
index 5b359e6c4..6184e6c99 100644
--- a/pkg/apis/deployment/v2alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/deployment/v2alpha1/zz_generated.deepcopy.go
@@ -850,6 +850,11 @@ func (in *MemberStatus) DeepCopyInto(out *MemberStatus) {
 		*out = new(ImageInfo)
 		**out = **in
 	}
+	if in.OldImage != nil {
+		in, out := &in.OldImage, &out.OldImage
+		*out = new(ImageInfo)
+		**out = **in
+	}
 	if in.Endpoint != nil {
 		in, out := &in.Endpoint, &out.Endpoint
 		*out = new(string)
diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go
index ed0719b4f..614f2bf8d 100644
--- a/pkg/deployment/deployment_inspector.go
+++ b/pkg/deployment/deployment_inspector.go
@@ -129,6 +129,7 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval
 func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterval util.Interval, cachedStatus inspector.Inspector) (nextInterval util.Interval, inspectError error) {
 	t := time.Now()
+
 	defer func() {
 		d.deps.Log.Info().Msgf("Reconciliation loop took %s", time.Since(t))
 	}()
diff --git a/pkg/deployment/features/upgrade.go b/pkg/deployment/features/upgrade.go
new file mode 100644
index 000000000..a5e59a687
--- /dev/null
+++ b/pkg/deployment/features/upgrade.go
@@ -0,0 +1,39 @@
+//
+// DISCLAIMER
+//
+// Copyright 2020 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+// Author Adam Janikowski
+//
+
+package features
+
+func init() {
+	registerFeature(upgradeVersionCheck)
+}
+
+var upgradeVersionCheck Feature = &feature{
+	name:               "upgrade-version-check",
+	description:        "Enable initContainer with pre-upgrade version check",
+	version:            "3.5.0",
+	enterpriseRequired: false,
+	enabledByDefault:   false,
+}
+
+func UpgradeVersionCheck() Feature {
+	return upgradeVersionCheck
+}
diff --git a/pkg/deployment/pod/upgrade_version_check.go b/pkg/deployment/pod/upgrade_version_check.go
new file mode 100644
index 000000000..f02b29710
--- /dev/null
+++ b/pkg/deployment/pod/upgrade_version_check.go
@@ -0,0 +1,63 @@
+//
+// DISCLAIMER
+//
+// Copyright 2020 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+// Author Adam Janikowski
+//
+
+package pod
+
+import (
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/features"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+	core "k8s.io/api/core/v1"
+)
+
+func UpgradeVersionCheck() Builder {
+	return upgradeVersionCheck{}
+}
+
+type upgradeVersionCheck struct{}
+
+func (u upgradeVersionCheck) Args(i Input) k8sutil.OptionPairs {
+	if features.UpgradeVersionCheck().Enabled() {
+		switch i.Group {
+		case api.ServerGroupAgents, api.ServerGroupDBServers, api.ServerGroupSingle:
+			return k8sutil.NewOptionPair(k8sutil.OptionPair{
+				Key:   "--database.check-version",
+				Value: "true",
+			})
+		}
+	}
+
+	return nil
+}
+
+func (u upgradeVersionCheck) Volumes(i Input) ([]core.Volume, []core.VolumeMount) {
+	return nil, nil
+}
+
+func (u upgradeVersionCheck) Envs(i Input) []core.EnvVar {
+	return nil
+}
+
+func (u upgradeVersionCheck) Verify(i Input, cachedStatus inspector.Inspector) error {
+	return nil
+}
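Review sketch (editor's note, not part of the patch): the `Builder` contract that `upgradeVersionCheck` satisfies, as implied by the four methods above. The actual interface lives elsewhere in `pkg/deployment/pod` and is reconstructed here for review only; treat the exact definition as an assumption.

```go
package pod

import (
	core "k8s.io/api/core/v1"

	"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

// Builder (reconstructed): each feature contributes arangod arguments,
// volumes, environment variables, and a pre-flight verification step to the
// pod spec. The version-check builder only contributes an argument, and only
// when the upgrade-version-check feature flag is enabled.
type Builder interface {
	Args(i Input) k8sutil.OptionPairs
	Volumes(i Input) ([]core.Volume, []core.VolumeMount)
	Envs(i Input) []core.EnvVar
	Verify(i Input, cachedStatus inspector.Inspector) error
}
```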
diff --git a/pkg/deployment/reconcile/action_set_current_image.go b/pkg/deployment/reconcile/action_set_current_image.go
index ba5dcba86..816612d1c 100644
--- a/pkg/deployment/reconcile/action_set_current_image.go
+++ b/pkg/deployment/reconcile/action_set_current_image.go
@@ -80,6 +80,9 @@ func (a *setCurrentMemberImageAction) CheckProgress(ctx context.Context) (bool,
 			return false
 		}
 
+		if !m.Image.Equal(&imageInfo) {
+			m.OldImage = m.Image.DeepCopy()
+		}
 		m.Image = &imageInfo
 
 		if err := s.Members.Update(m, g); err != nil {
diff --git a/pkg/deployment/reconcile/action_upgrade_member.go b/pkg/deployment/reconcile/action_upgrade_member.go
index 341c33fa5..1a844c07e 100644
--- a/pkg/deployment/reconcile/action_upgrade_member.go
+++ b/pkg/deployment/reconcile/action_upgrade_member.go
@@ -110,7 +110,32 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, bool, er
 		log.Error().Msg("No such member")
 		return true, false, nil
 	}
+
 	isUpgrading := m.Phase == api.MemberPhaseUpgrading
+
+	if isUpgrading {
+		if m.Conditions.IsTrue(api.ConditionTypeTerminated) {
+			if m.Conditions.IsTrue(api.ConditionTypeUpgradeFailed) {
+				a.log.Error().Msgf("Upgrade of member failed")
+			}
+
+			// Invalidate plan
+			m.Phase = ""
+			m.Conditions.Remove(api.ConditionTypeTerminated)
+			m.Conditions.Remove(api.ConditionTypeUpgradeFailed)
+
+			if m.OldImage != nil {
+				m.Image = m.OldImage.DeepCopy()
+			}
+
+			if err := a.actionCtx.UpdateMember(m); err != nil {
+				return false, true, nil
+			}
+
+			log.Error().Msgf("Upgrade failed")
+			return false, true, nil
+		}
+	}
+
 	log = log.With().
 		Str("pod-name", m.PodName).
 		Bool("is-upgrading", isUpgrading).Logger()
@@ -128,6 +153,9 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, bool, er
 	m.Phase = api.MemberPhaseCreated
 	m.RecentTerminations = nil // Since we're upgrading, we do not care about old terminations.
 	m.CleanoutJobID = ""
+	if !m.OldImage.Equal(m.Image) && isUpgrading {
+		m.OldImage = m.Image.DeepCopy()
+	}
 	if err := a.actionCtx.UpdateMember(m); err != nil {
 		return false, false, errors.WithStack(err)
 	}
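Review sketch (editor's note, not part of the patch): the recovery invariant the block above introduces. When an upgrade attempt terminates, the member is rolled back to the image it ran before the upgrade and the plan is aborted. Test name and fixture values are invented for illustration.

```go
package reconcile

import (
	"testing"

	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
)

func TestUpgradeFailureRollsBackImage(t *testing.T) {
	m := api.MemberStatus{
		Phase:    api.MemberPhaseUpgrading,
		Image:    &api.ImageInfo{Image: "arangodb/arangodb:3.7.1"},
		OldImage: &api.ImageInfo{Image: "arangodb/arangodb:3.7.0"},
	}

	// Mirrors the recovery branch in CheckProgress: roll back to the image
	// the member ran before the failed upgrade attempt.
	if m.OldImage != nil {
		m.Image = m.OldImage.DeepCopy()
	}

	if m.Image.Image != "arangodb/arangodb:3.7.0" {
		t.Fatalf("expected rollback to old image, got %s", m.Image.Image)
	}
}
```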
diff --git a/pkg/deployment/reconcile/action_wait_for_member_up.go b/pkg/deployment/reconcile/action_wait_for_member_up.go
index a51474ebd..6f4d2b069 100644
--- a/pkg/deployment/reconcile/action_wait_for_member_up.go
+++ b/pkg/deployment/reconcile/action_wait_for_member_up.go
@@ -24,6 +24,7 @@ package reconcile
 
 import (
 	"context"
+	"time"
 
 	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 
@@ -139,7 +140,10 @@ func (a *actionWaitForMemberUp) checkProgressAgent(ctx context.Context) (bool, b
 		a.Endpoints()
 	}
 
-	if err := agency.AreAgentsHealthy(ctx, clients); err != nil {
+	shortCtx, c := context.WithTimeout(ctx, 3*time.Second)
+	defer c()
+
+	if err := agency.AreAgentsHealthy(shortCtx, clients); err != nil {
 		log.Debug().Err(err).Msg("Not all agents are ready")
 		return false, false, nil
 	}
diff --git a/pkg/deployment/reconcile/plan_builder.go b/pkg/deployment/reconcile/plan_builder.go
index 95b62fe86..700245c33 100644
--- a/pkg/deployment/reconcile/plan_builder.go
+++ b/pkg/deployment/reconcile/plan_builder.go
@@ -113,7 +113,6 @@ func createPlan(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIOb
 	currentPlan api.Plan, spec api.DeploymentSpec,
 	status api.DeploymentStatus, cachedStatus inspector.Inspector,
 	builderCtx PlanBuilderContext) (api.Plan, bool) {
-
 	if !currentPlan.IsEmpty() {
 		// Plan already exists, complete that first
 		return currentPlan, false
@@ -313,6 +312,11 @@ type planBuilder func(ctx context.Context,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
 	cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan
 
+type planBuilderCondition func(ctx context.Context,
+	log zerolog.Logger, apiObject k8sutil.APIObject,
+	spec api.DeploymentSpec, status api.DeploymentStatus,
+	cachedStatus inspector.Inspector, context PlanBuilderContext) bool
+
 type planBuilderSubPlan func(ctx context.Context,
 	log zerolog.Logger, apiObject k8sutil.APIObject,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
@@ -335,6 +339,7 @@ func NewWithPlanBuilder(ctx context.Context,
 
 type WithPlanBuilder interface {
 	Apply(p planBuilder) api.Plan
+	ApplyWithCondition(c planBuilderCondition, p planBuilder) api.Plan
 	ApplySubPlan(p planBuilderSubPlan, plans ...planBuilder) api.Plan
 }
 
@@ -348,6 +353,14 @@ type withPlanBuilder struct {
 	context PlanBuilderContext
 }
 
+func (w withPlanBuilder) ApplyWithCondition(c planBuilderCondition, p planBuilder) api.Plan {
+	if !c(w.ctx, w.log, w.apiObject, w.spec, w.status, w.cachedStatus, w.context) {
+		return api.Plan{}
+	}
+
+	return w.Apply(p)
+}
+
 func (w withPlanBuilder) ApplySubPlan(p planBuilderSubPlan, plans ...planBuilder) api.Plan {
 	return p(w.ctx, w.log, w.apiObject, w.spec, w.status, w.cachedStatus, w.context, w, plans...)
 }
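Review sketch (editor's note, not part of the patch): how the new `ApplyWithCondition` might be used to gate an existing `planBuilder` behind a cheap predicate, so the possibly expensive builder only runs when it can produce a plan. Both function literals are invented for illustration; only the signatures come from this patch.

```go
package reconcile

import (
	"context"

	"github.com/rs/zerolog"

	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
	"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

func exampleConditionalPlan(w WithPlanBuilder) api.Plan {
	return w.ApplyWithCondition(
		func(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIObject,
			spec api.DeploymentSpec, status api.DeploymentStatus,
			cachedStatus inspector.Inspector, context PlanBuilderContext) bool {
			// Gate: only run the builder for TLS-enabled deployments.
			return spec.TLS.IsSecure()
		},
		func(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIObject,
			spec api.DeploymentSpec, status api.DeploymentStatus,
			cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan {
			// A real planBuilder would return actions here.
			return api.Plan{}
		},
	)
}
```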
diff --git a/pkg/deployment/reconcile/plan_builder_jwt.go b/pkg/deployment/reconcile/plan_builder_jwt.go
index 19ee0a711..689b05e14 100644
--- a/pkg/deployment/reconcile/plan_builder_jwt.go
+++ b/pkg/deployment/reconcile/plan_builder_jwt.go
@@ -26,6 +26,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"time"
 
 	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 
@@ -220,12 +221,16 @@ func createJWTStatusUpdateRequired(ctx context.Context,
 func areJWTTokensUpToDate(ctx context.Context,
 	log zerolog.Logger, apiObject k8sutil.APIObject,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
-	cachedStatus inspector.Inspector, context PlanBuilderContext,
+	cachedStatus inspector.Inspector, planCtx PlanBuilderContext,
 	folder *core.Secret) (plan api.Plan, failed bool) {
+	gCtx, c := context.WithTimeout(ctx, 2*time.Second)
+	defer c()
 
 	status.Members.ForeachServerGroup(func(group api.ServerGroup, list api.MemberStatusList) error {
 		for _, m := range list {
-			if updateRequired, failedMember := isJWTTokenUpToDate(ctx, log, apiObject, spec, status, cachedStatus, context, group, m, folder); failedMember {
+			nCtx, c := context.WithTimeout(gCtx, 500*time.Millisecond)
+			defer c()
+			if updateRequired, failedMember := isJWTTokenUpToDate(nCtx, log, apiObject, spec, status, cachedStatus, planCtx, group, m, folder); failedMember {
 				failed = true
 				continue
 			} else if updateRequired {
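Review sketch (editor's note, not part of the patch): the rename of the `context` parameter to `planCtx` above is what frees the `context` identifier for the standard-library package. One caveat worth noting: `defer c()` inside the member loop keeps every 500ms timer alive until `areJWTTokensUpToDate` returns. If that ever becomes a concern, the per-iteration scope can be made explicit, as in this sketch:

```go
package reconcile

import (
	"context"
	"time"

	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
)

// forEachMemberWithTimeout releases each per-member timer as soon as the
// iteration finishes, instead of accumulating deferred cancels until the
// enclosing function returns. Helper name is invented for illustration.
func forEachMemberWithTimeout(gCtx context.Context, list api.MemberStatusList,
	f func(ctx context.Context, m api.MemberStatus)) {
	for _, m := range list {
		func() {
			nCtx, cancel := context.WithTimeout(gCtx, 500*time.Millisecond)
			defer cancel()
			f(nCtx, m)
		}()
	}
}
```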
diff --git a/pkg/deployment/reconcile/plan_builder_tls.go b/pkg/deployment/reconcile/plan_builder_tls.go
index dab89301e..2a94dbd59 100644
--- a/pkg/deployment/reconcile/plan_builder_tls.go
+++ b/pkg/deployment/reconcile/plan_builder_tls.go
@@ -290,7 +290,7 @@ func createCACleanPlan(ctx context.Context,
 func createKeyfileRenewalPlanDefault(ctx context.Context,
 	log zerolog.Logger, apiObject k8sutil.APIObject,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
-	cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan {
+	cachedStatus inspector.Inspector, planCtx PlanBuilderContext) api.Plan {
 	if !spec.TLS.IsSecure() {
 		return nil
 	}
@@ -306,7 +306,11 @@ func createKeyfileRenewalPlanDefault(ctx context.Context,
 			if !plan.IsEmpty() {
 				return nil
 			}
-			if renew, recreate := keyfileRenewalRequired(ctx, log, apiObject, spec, status, cachedStatus, context, group, member, api.TLSRotateModeRecreate); renew {
+
+			lCtx, c := context.WithTimeout(ctx, 500*time.Millisecond)
+			defer c()
+
+			if renew, recreate := keyfileRenewalRequired(lCtx, log, apiObject, spec, status, cachedStatus, planCtx, group, member, api.TLSRotateModeRecreate); renew {
 				log.Info().Msg("Renewal of keyfile required - Recreate")
 				if recreate {
 					plan = append(plan, api.NewAction(api.ActionTypeCleanTLSKeyfileCertificate, group, member.ID, "Remove server keyfile and enforce renewal"))
@@ -324,7 +328,7 @@ func createKeyfileRenewalPlanDefault(ctx context.Context,
 func createKeyfileRenewalPlanInPlace(ctx context.Context,
 	log zerolog.Logger, apiObject k8sutil.APIObject,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
-	cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan {
+	cachedStatus inspector.Inspector, planCtx PlanBuilderContext) api.Plan {
 	if !spec.TLS.IsSecure() {
 		return nil
 	}
@@ -337,7 +341,10 @@ func createKeyfileRenewalPlanInPlace(ctx context.Context,
 		}
 
 		for _, member := range members {
-			if renew, recreate := keyfileRenewalRequired(ctx, log, apiObject, spec, status, cachedStatus, context, group, member, api.TLSRotateModeInPlace); renew {
+			lCtx, c := context.WithTimeout(ctx, 500*time.Millisecond)
+			defer c()
+
+			if renew, recreate := keyfileRenewalRequired(lCtx, log, apiObject, spec, status, cachedStatus, planCtx, group, member, api.TLSRotateModeInPlace); renew {
 				log.Info().Msg("Renewal of keyfile required - InPlace")
 				if recreate {
 					plan = append(plan, api.NewAction(api.ActionTypeCleanTLSKeyfileCertificate, group, member.ID, "Remove server keyfile and enforce renewal"))
@@ -355,16 +362,19 @@ func createKeyfileRenewalPlanInPlace(ctx context.Context,
 func createKeyfileRenewalPlan(ctx context.Context,
 	log zerolog.Logger, apiObject k8sutil.APIObject,
 	spec api.DeploymentSpec, status api.DeploymentStatus,
-	cachedStatus inspector.Inspector, context PlanBuilderContext) api.Plan {
+	cachedStatus inspector.Inspector, planCtx PlanBuilderContext) api.Plan {
 	if !spec.TLS.IsSecure() {
 		return nil
 	}
 
+	gCtx, c := context.WithTimeout(ctx, 2*time.Second)
+	defer c()
+
 	switch createKeyfileRenewalPlanMode(spec, status) {
 	case api.TLSRotateModeInPlace:
-		return createKeyfileRenewalPlanInPlace(ctx, log, apiObject, spec, status, cachedStatus, context)
+		return createKeyfileRenewalPlanInPlace(gCtx, log, apiObject, spec, status, cachedStatus, planCtx)
 	default:
-		return createKeyfileRenewalPlanDefault(ctx, log, apiObject, spec, status, cachedStatus, context)
+		return createKeyfileRenewalPlanDefault(gCtx, log, apiObject, spec, status, cachedStatus, planCtx)
 	}
 }
 
@@ -420,6 +430,8 @@ func checkServerValidCertRequest(ctx context.Context, context PlanBuilderContext
 		return nil, err
 	}
 
+	req = req.WithContext(ctx)
+
 	if auth != nil && auth.Type() == driver.AuthenticationTypeRaw {
 		if h := auth.Get("value"); h != "" {
 			req.Header.Add("Authorization", h)
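Review sketch (editor's note, not part of the patch): the one-line `req.WithContext(ctx)` fix above is what makes the certificate probe honor the 2s/500ms deadlines set by the renewal plan builders. The pattern in isolation, with standard-library types only (URL and client are placeholders):

```go
package reconcile

import (
	"context"
	"net/http"
)

// probeWithDeadline binds the request to ctx so client.Do returns promptly
// once the caller's deadline expires, instead of blocking on a slow server.
func probeWithDeadline(ctx context.Context, client *http.Client, url string) (*http.Response, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}

	return client.Do(req.WithContext(ctx))
}
```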
diff --git a/pkg/deployment/resources/pod_creator.go b/pkg/deployment/resources/pod_creator.go
index 42611ed86..769e01914 100644
--- a/pkg/deployment/resources/pod_creator.go
+++ b/pkg/deployment/resources/pod_creator.go
@@ -61,7 +61,7 @@ func versionHasAdvertisedEndpoint(v driver.Version) bool {
 	return v.CompareTo("3.4.0") >= 0
 }
 
-// createArangodArgs creates command line arguments for an arangod server in the given group.
+// createArangodArgsWithUpgrade creates command line arguments for an arangod server upgrade in the given group.
 func createArangodArgsWithUpgrade(input pod.Input, additionalOptions ...k8sutil.OptionPair) []string {
 	return createArangodArgs(input, pod.AutoUpgrade().Args(input)...)
 }
@@ -484,8 +484,8 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string,
 	m.PodUID = uid
 	m.PodSpecVersion = sha
 	m.Endpoint = util.NewString(k8sutil.CreatePodDNSNameWithDomain(apiObject, spec.ClusterDomain, role, m.ID))
-	m.ArangoVersion = status.CurrentImage.ArangoDBVersion
-	m.ImageID = status.CurrentImage.ImageID
+	m.ArangoVersion = m.Image.ArangoDBVersion
+	m.ImageID = m.Image.ImageID
 
 	// Check for missing side cars in
 	m.SideCarSpecs = make(map[string]core.Container)
@@ -543,6 +543,8 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string,
 	m.Conditions.Remove(api.ConditionTypeTerminating)
 	m.Conditions.Remove(api.ConditionTypeAgentRecoveryNeeded)
 	m.Conditions.Remove(api.ConditionTypeAutoUpgrade)
+	m.Conditions.Remove(api.ConditionTypeUpgradeFailed)
+	m.Upgrade = false
 	if err := status.Members.Update(m, group); err != nil {
 		return errors.WithStack(err)
 	}
diff --git a/pkg/deployment/resources/pod_creator_arangod.go b/pkg/deployment/resources/pod_creator_arangod.go
index 4d7637773..7ef8b7bba 100644
--- a/pkg/deployment/resources/pod_creator_arangod.go
+++ b/pkg/deployment/resources/pod_creator_arangod.go
@@ -426,7 +426,7 @@ func (m *MemberArangoDPod) GetInitContainers() ([]core.Container, error) {
 
 	{
 		// Upgrade container - run in background
-		if m.autoUpgrade {
+		if m.autoUpgrade || m.status.Upgrade {
 			args := createArangodArgsWithUpgrade(m.AsInput())
 
 			c, err := k8sutil.NewContainer(args, m.GetContainerCreator())
@@ -443,6 +443,28 @@ func (m *MemberArangoDPod) GetInitContainers() ([]core.Container, error) {
 
 			initContainers = append(initContainers, c)
 		}
+
+		// VersionCheck Container
+		{
+			versionArgs := pod.UpgradeVersionCheck().Args(m.AsInput())
+			if len(versionArgs) > 0 {
+				args := createArangodArgs(m.AsInput(), versionArgs...)
+
+				c, err := k8sutil.NewContainer(args, m.GetContainerCreator())
+				if err != nil {
+					return nil, err
+				}
+
+				_, c.VolumeMounts = m.GetVolumes()
+
+				c.Name = api.ServerGroupReservedInitContainerNameVersionCheck
+				c.Lifecycle = nil
+				c.LivenessProbe = nil
+				c.ReadinessProbe = nil
+
+				initContainers = append(initContainers, c)
+			}
+		}
 	}
 
 	return initContainers, nil
diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go
index b09ae4f00..d0a36bd96 100644
--- a/pkg/deployment/resources/pod_inspector.go
+++ b/pkg/deployment/resources/pod_inspector.go
@@ -113,20 +113,58 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspector.Insp
 			// Pod has terminated with at least 1 container with a non-zero exit code.
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
 			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
-				if c, ok := k8sutil.GetContainerStatusByName(pod, k8sutil.ServerContainerName); ok {
-					if t := c.State.Terminated; t != nil {
-						log.Warn().Str("member", memberStatus.ID).
-							Str("pod", pod.GetName()).
-							Str("uid", string(pod.GetUID())).
-							Int32("exit-code", t.ExitCode).
-							Str("reason", t.Reason).
-							Str("message", t.Message).
-							Int32("signal", t.Signal).
-							Time("started", t.StartedAt.Time).
-							Time("finished", t.FinishedAt.Time).
-							Msgf("Pod failed in unexpected way")
-					}
-				}
+				if containers := k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses); len(containers) > 0 {
+					for _, container := range containers {
+						switch container {
+						case api.ServerGroupReservedInitContainerNameVersionCheck:
+							if c, ok := k8sutil.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, container); ok {
+								if t := c.State.Terminated; t != nil && t.ExitCode == 11 {
+									memberStatus.Upgrade = true
+									updateMemberStatusNeeded = true
+								}
+							}
+						case api.ServerGroupReservedInitContainerNameUpgrade:
+							memberStatus.Conditions.Update(api.ConditionTypeUpgradeFailed, true, "Upgrade Failed", "")
+						}
+
+						if c, ok := k8sutil.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, container); ok {
+							if t := c.State.Terminated; t != nil && t.ExitCode != 0 {
+								log.Warn().Str("member", memberStatus.ID).
+									Str("pod", pod.GetName()).
+									Str("container", container).
+									Str("uid", string(pod.GetUID())).
+									Int32("exit-code", t.ExitCode).
+									Str("reason", t.Reason).
+									Str("message", t.Message).
+									Int32("signal", t.Signal).
+									Time("started", t.StartedAt.Time).
+									Time("finished", t.FinishedAt.Time).
+									Msgf("Pod failed in unexpected way: Init Container failed")
+							}
+						}
+					}
+				}
+
+				if containers := k8sutil.GetFailedContainerNames(pod.Status.ContainerStatuses); len(containers) > 0 {
+					for _, container := range containers {
+						if c, ok := k8sutil.GetAnyContainerStatusByName(pod.Status.ContainerStatuses, container); ok {
+							if t := c.State.Terminated; t != nil && t.ExitCode != 0 {
+								log.Warn().Str("member", memberStatus.ID).
+									Str("pod", pod.GetName()).
+									Str("container", container).
+									Str("uid", string(pod.GetUID())).
+									Int32("exit-code", t.ExitCode).
+									Str("reason", t.Reason).
+									Str("message", t.Message).
+									Int32("signal", t.Signal).
+									Time("started", t.StartedAt.Time).
+									Time("finished", t.FinishedAt.Time).
+									Msgf("Pod failed in unexpected way: Core Container failed")
+							}
+						}
+					}
+				}
+
 				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
diff --git a/pkg/util/k8sutil/container.go b/pkg/util/k8sutil/container.go
index c94122267..4b804739c 100644
--- a/pkg/util/k8sutil/container.go
+++ b/pkg/util/k8sutil/container.go
@@ -22,33 +22,57 @@
 
 package k8sutil
 
-import v1 "k8s.io/api/core/v1"
+import core "k8s.io/api/core/v1"
 
 // GetContainerByName returns the container in the given pod with the given name.
 // Returns false if not found.
-func GetContainerByName(p *v1.Pod, name string) (v1.Container, bool) {
+func GetContainerByName(p *core.Pod, name string) (core.Container, bool) {
 	for _, c := range p.Spec.Containers {
 		if c.Name == name {
 			return c, true
 		}
 	}
-	return v1.Container{}, false
+	return core.Container{}, false
 }
 
 // GetContainerStatusByName returns the container status in the given pod with the given name.
 // Returns false if not found.
-func GetContainerStatusByName(p *v1.Pod, name string) (v1.ContainerStatus, bool) {
+func GetContainerStatusByName(p *core.Pod, name string) (core.ContainerStatus, bool) {
 	for _, c := range p.Status.ContainerStatuses {
 		if c.Name == name {
 			return c, true
 		}
 	}
-	return v1.ContainerStatus{}, false
+	return core.ContainerStatus{}, false
+}
+
+// GetAnyContainerStatusByName returns the container status in the given ContainerStatus list with the given name.
+// Returns false if not found.
+func GetAnyContainerStatusByName(containers []core.ContainerStatus, name string) (core.ContainerStatus, bool) {
+	for _, c := range containers {
+		if c.Name == name {
+			return c, true
+		}
+	}
+	return core.ContainerStatus{}, false
+}
+
+// GetFailedContainerNames returns the names of failed containers from the provided list of statuses.
+func GetFailedContainerNames(containers []core.ContainerStatus) []string {
+	var failedContainers []string
+
+	for _, c := range containers {
+		if IsContainerFailed(&c) {
+			failedContainers = append(failedContainers, c.Name)
+		}
+	}
+
+	return failedContainers
 }
 
 // IsResourceRequirementsChanged returns true if the resource requirements have changed.
-func IsResourceRequirementsChanged(wanted, given v1.ResourceRequirements) bool {
-	checkList := func(wanted, given v1.ResourceList) bool {
+func IsResourceRequirementsChanged(wanted, given core.ResourceRequirements) bool {
+	checkList := func(wanted, given core.ResourceList) bool {
 		for k, v := range wanted {
 			if gv, ok := given[k]; !ok {
 				return true
diff --git a/pkg/util/k8sutil/pods.go b/pkg/util/k8sutil/pods.go
index 37b50deab..a512477e4 100644
--- a/pkg/util/k8sutil/pods.go
+++ b/pkg/util/k8sutil/pods.go
@@ -121,6 +121,18 @@ func IsPodFailed(pod *core.Pod) bool {
 	}
 }
 
+// IsContainerFailed returns true if the given container
+// has terminated with a non-zero exit code.
+func IsContainerFailed(container *core.ContainerStatus) bool {
+	if c := container.State.Terminated; c != nil {
+		if c.ExitCode != 0 {
+			return true
+		}
+	}
+
+	return false
+}
+
 // IsPodScheduled returns true if the pod has been scheduled.
 func IsPodScheduled(pod *core.Pod) bool {
 	condition := getPodCondition(&pod.Status, core.PodScheduled)
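Review sketch (editor's note, not part of the patch): how the new k8sutil helpers compose. Exit code 11 from the version-check init container is the signal the pod inspector above turns into `memberStatus.Upgrade = true`; every other non-zero exit code is just logged. The function below is invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
	core "k8s.io/api/core/v1"
)

// reportFailedInitContainers walks the failed init containers of a pod using
// the helpers added in this patch and prints why each one terminated.
func reportFailedInitContainers(pod *core.Pod) {
	for _, name := range k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses) {
		if s, ok := k8sutil.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, name); ok {
			if t := s.State.Terminated; t != nil {
				fmt.Printf("init container %s failed: exit code %d, reason %s\n", name, t.ExitCode, t.Reason)
			}
		}
	}
}
```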