1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00
kube-arangodb/pkg/deployment/deployment_inspector.go

478 lines
17 KiB
Go
Raw Normal View History

//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package deployment
import (
"context"
"time"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
2021-01-08 14:35:38 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/errors"
2020-11-27 12:49:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
operatorErrors "github.com/arangodb/kube-arangodb/pkg/util/errors"
2020-03-11 07:57:03 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2019-11-04 07:49:24 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2022-03-10 11:29:55 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/acs"
2018-08-31 14:08:21 +00:00
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/arangodb/kube-arangodb/pkg/upgrade"
2018-08-25 10:08:44 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2018-03-15 15:33:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2022-03-10 11:29:55 +00:00
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-03-15 15:33:28 +00:00
)
2018-08-31 14:08:21 +00:00
// inspectDeploymentDurationGauges is the gauge vector, labelled by deployment
// name, into which inspectDeployment records how long a single inspection took.
var inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(
	metricsComponent,
	"inspect_deployment_duration",
	"Amount of time taken by a single inspection of a deployment (in sec)",
	metrics.DeploymentName,
)
2018-03-15 15:33:28 +00:00
// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
// - the deployment has changed
// - any of the underlying resources has changed
// - once in a while
// Returns the delay until this function should be called again.
2018-08-30 14:57:08 +00:00
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval {
log := d.deps.Log
2018-08-31 14:08:21 +00:00
start := time.Now()
2021-04-26 08:30:06 +00:00
ctxReconciliation, cancelReconciliation := globals.GetGlobalTimeouts().Reconciliation().WithTimeout(context.Background())
2021-04-26 08:30:06 +00:00
defer cancelReconciliation()
defer func() {
d.deps.Log.Info().Msgf("Inspect loop took %s", time.Since(start))
}()
2018-03-15 15:33:28 +00:00
nextInterval := lastInterval
hasError := false
2021-04-26 08:30:06 +00:00
deploymentName := d.GetName()
2018-08-31 14:08:21 +00:00
defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start)
2018-03-15 15:33:28 +00:00
err := d.currentState.Refresh(ctxReconciliation)
if err != nil {
log.Error().Err(err).Msg("Unable to get resources")
return minInspectionInterval // Retry ASAP
}
// Check deployment still exists
2021-04-26 08:30:06 +00:00
var updated *api.ArangoDeployment
err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctxReconciliation, func(ctxChild context.Context) error {
2021-04-26 08:30:06 +00:00
var err error
2022-03-10 11:29:55 +00:00
updated, err = d.deps.Client.Arango().DatabaseV1().ArangoDeployments(d.GetNamespace()).Get(ctxChild, deploymentName, meta.GetOptions{})
2021-04-26 08:30:06 +00:00
return err
})
if k8sutil.IsNotFound(err) {
// Deployment is gone
log.Info().Msg("Deployment is gone")
d.Delete()
return nextInterval
} else if updated != nil && updated.GetDeletionTimestamp() != nil {
// Deployment is marked for deletion
if err := d.runDeploymentFinalizers(ctxReconciliation, d.GetCachedStatus()); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("ArangoDeployment finalizer inspection failed", err, d.apiObject))
}
} else {
2020-01-27 06:33:12 +00:00
// Check if maintenance annotation is set
if updated != nil && updated.Annotations != nil {
2020-03-11 07:57:03 +00:00
if v, ok := updated.Annotations[deployment.ArangoDeploymentPodMaintenanceAnnotation]; ok && v == "true" {
2020-01-27 06:33:12 +00:00
// Disable checks if we will enter maintenance mode
log.Info().Str("deployment", deploymentName).Msg("Deployment in maintenance mode")
return nextInterval
}
}
// Is the deployment in failed state, if so, give up.
2018-06-12 09:09:42 +00:00
if d.GetPhase() == api.DeploymentPhaseFailed {
log.Debug().Msg("Deployment is in Failed state.")
return nextInterval
}
2020-11-27 12:49:28 +00:00
d.apiObject = updated
d.GetMembersState().RefreshState(ctxReconciliation, updated.Status.Members.AsList())
d.GetMembersState().Log(d.deps.Log)
if err := d.WithStatusUpdateErr(ctxReconciliation, func(s *api.DeploymentStatus) (bool, error) {
if changed, err := upgrade.RunUpgrade(*updated, s, d.GetCachedStatus()); err != nil {
return false, err
} else {
return changed, nil
}
}); err != nil {
d.CreateEvent(k8sutil.NewErrorEvent("Upgrade failed", err, d.apiObject))
nextInterval = minInspectionInterval
d.recentInspectionErrors++
return nextInterval.ReduceTo(maxInspectionInterval)
}
inspectNextInterval, err := d.inspectDeploymentWithError(ctxReconciliation, nextInterval)
2021-04-26 08:30:06 +00:00
if err != nil {
if !operatorErrors.IsReconcile(err) {
nextInterval = inspectNextInterval
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Reconcilation failed", err, d.apiObject))
} else {
nextInterval = minInspectionInterval
}
}
}
// Update next interval (on errors)
if hasError {
if d.recentInspectionErrors == 0 {
nextInterval = minInspectionInterval
d.recentInspectionErrors++
}
} else {
d.recentInspectionErrors = 0
}
return nextInterval.ReduceTo(maxInspectionInterval)
}
2018-03-20 12:10:52 +00:00
func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterval util.Interval) (nextInterval util.Interval, inspectError error) {
t := time.Now()
2021-02-10 08:17:52 +00:00
defer func() {
d.deps.Log.Info().Msgf("Reconciliation loop took %s", time.Since(t))
}()
// Ensure that spec and status checksum are same
spec := d.GetSpec()
status, _ := d.getStatus()
2018-03-15 15:33:28 +00:00
nextInterval = lastInterval
inspectError = nil
checksum, err := spec.Checksum()
if err != nil {
return minInspectionInterval, errors.Wrapf(err, "Calculation of spec failed")
} else {
condition, exists := status.Conditions.Get(api.ConditionTypeUpToDate)
if checksum != status.AppliedVersion && (!exists || condition.IsTrue()) {
2022-01-15 22:55:08 +00:00
if err = d.updateConditionWithHash(ctx, api.ConditionTypeUpToDate, false, "Spec Changed", "Spec Object changed. Waiting until plan will be applied", checksum); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil // Retry ASAP
2018-09-07 11:18:39 +00:00
}
}
2018-09-07 11:18:39 +00:00
if err := acs.Inspect(ctx, d.apiObject, d.deps.Client, d.GetCachedStatus()); err != nil {
2022-03-10 11:29:55 +00:00
d.deps.Log.Warn().Err(err).Msgf("Unable to handle ACS objects")
}
2020-12-15 11:41:14 +00:00
// Cleanup terminated pods on the beginning of loop
if x, err := d.resources.CleanupTerminatedPods(ctx, d.GetCachedStatus()); err != nil {
2020-12-15 11:41:14 +00:00
return minInspectionInterval, errors.Wrapf(err, "Pod cleanup failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
if err := d.resources.EnsureArangoMembers(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "ArangoMember creation failed")
}
if err := d.resources.EnsureServices(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Service creation failed")
}
if err := d.resources.EnsureSecrets(ctx, d.deps.Log, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Secret creation failed")
}
// Inspect secret hashes
if err := d.resources.ValidateSecretHashes(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Secret hash validation failed")
}
2018-03-15 15:33:28 +00:00
// Check for LicenseKeySecret
if err := d.resources.ValidateLicenseKeySecret(d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "License Key Secret invalid")
}
// Is the deployment in a good state?
if status.Conditions.IsTrue(api.ConditionTypeSecretsChanged) {
2021-01-08 14:35:38 +00:00
return minInspectionInterval, errors.Newf("Secrets changed")
}
// Ensure we have image info
if retrySoon, exists, err := d.ensureImages(ctx, d.apiObject, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Image detection failed")
2020-06-26 06:53:24 +00:00
} else if retrySoon || !exists {
return minInspectionInterval, nil
}
// Inspection of generated resources needed
if x, err := d.resources.InspectPods(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod inspection failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
if x, err := d.resources.InspectPVCs(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PVC inspection failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
// Check members for resilience
2021-04-26 08:30:06 +00:00
if err := d.resilience.CheckMemberFailure(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Member failure detection failed")
}
// Immediate actions
2021-04-26 08:30:06 +00:00
if err := d.reconciler.CheckDeployment(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Reconciler immediate actions failed")
}
if interval, err := d.ensureResources(ctx, nextInterval, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Reconciler resource recreation failed")
} else {
nextInterval = interval
}
2021-12-06 10:31:17 +00:00
inspectDeploymentAgencyFetches.WithLabelValues(d.GetName()).Inc()
if offset, err := d.RefreshAgencyCache(ctx); err != nil {
inspectDeploymentAgencyErrors.WithLabelValues(d.GetName()).Inc()
d.deps.Log.Err(err).Msgf("Unable to refresh agency")
} else {
inspectDeploymentAgencyIndex.WithLabelValues(d.GetName()).Set(float64(offset))
}
// Refresh maintenance lock
d.refreshMaintenanceTTL(ctx)
// Create scale/update plan
2020-11-27 12:49:28 +00:00
if _, ok := d.apiObject.Annotations[deployment.ArangoDeploymentPlanCleanAnnotation]; ok {
2021-04-26 08:30:06 +00:00
if err := d.ApplyPatch(ctx, patch.ItemRemove(patch.NewPath("metadata", "annotations", deployment.ArangoDeploymentPlanCleanAnnotation))); err != nil {
2020-11-27 12:49:28 +00:00
return minInspectionInterval, errors.Wrapf(err, "Unable to create remove annotation patch")
}
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
2020-11-27 12:49:28 +00:00
s.Plan = nil
return true
}, true); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable clean plan")
}
} else if err, updated := d.reconciler.CreatePlan(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Plan creation failed")
} else if updated {
2022-01-15 22:55:08 +00:00
d.deps.Log.Info().Msgf("Plan generated, reconciling")
return minInspectionInterval, nil
}
2022-02-22 15:55:33 +00:00
// Reachable state ensurer
reachableConditionState := status.Conditions.Check(api.ConditionTypeReachable).Exists().IsTrue().Evaluate()
if d.GetMembersState().State().IsReachable() {
2022-02-22 15:55:33 +00:00
if !reachableConditionState {
if err = d.updateConditionWithHash(ctx, api.ConditionTypeReachable, true, "ArangoDB is reachable", "", ""); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update Reachable condition")
}
}
} else {
if reachableConditionState {
if err = d.updateConditionWithHash(ctx, api.ConditionTypeReachable, false, "ArangoDB is not reachable", "", ""); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update Reachable condition")
}
}
}
if d.apiObject.Status.IsPlanEmpty() && status.AppliedVersion != checksum {
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
s.AppliedVersion = checksum
return true
}); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
} else if status.AppliedVersion == checksum {
2022-02-22 15:55:33 +00:00
isUpToDate, reason := d.isUpToDateStatus(status)
2021-08-30 09:07:52 +00:00
if !isUpToDate && status.Conditions.IsTrue(api.ConditionTypeUpToDate) {
2022-01-15 22:55:08 +00:00
if err = d.updateConditionWithHash(ctx, api.ConditionTypeUpToDate, false, reason, "There are pending operations in plan or members are in restart process", checksum); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
}
2021-08-30 09:07:52 +00:00
if isUpToDate && !status.Conditions.IsTrue(api.ConditionTypeUpToDate) {
2022-01-15 22:55:08 +00:00
if err = d.updateConditionWithHash(ctx, api.ConditionTypeUpToDate, true, "Spec is Up To Date", "Spec is Up To Date", checksum); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
}
}
// Execute current step of scale/update plan
retrySoon, err := d.reconciler.ExecutePlan(ctx, d.GetCachedStatus())
if err != nil {
return minInspectionInterval, errors.Wrapf(err, "Plan execution failed")
}
if retrySoon {
nextInterval = minInspectionInterval
}
2018-03-15 15:33:28 +00:00
// Create access packages
2021-04-26 08:30:06 +00:00
if err := d.createAccessPackages(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "AccessPackage creation failed")
}
2019-11-11 13:11:27 +00:00
// Inspect deployment for synced members
if err := d.resources.SyncMembersInCluster(ctx, d.GetMembersState().Health()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Removed member cleanup failed")
}
// At the end of the inspect, we cleanup terminated pods.
if x, err := d.resources.CleanupTerminatedPods(ctx, d.GetCachedStatus()); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod cleanup failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
return
}
2022-02-22 15:55:33 +00:00
func (d *Deployment) isUpToDateStatus(status api.DeploymentStatus) (upToDate bool, reason string) {
if !status.IsPlanEmpty() {
2021-08-30 09:07:52 +00:00
return false, "Plan is not empty"
}
upToDate = true
2022-02-22 15:55:33 +00:00
if !status.Conditions.Check(api.ConditionTypeReachable).Exists().IsTrue().Evaluate() {
upToDate = false
}
status.Members.ForeachServerGroup(func(group api.ServerGroup, list api.MemberStatusList) error {
if !upToDate {
return nil
}
2021-08-30 09:07:52 +00:00
for _, member := range list {
if member.Conditions.IsTrue(api.ConditionTypeRestart) || member.Conditions.IsTrue(api.ConditionTypePendingRestart) {
upToDate = false
reason = "Pending restarts on members"
2022-02-22 15:55:33 +00:00
return nil
}
if member.Conditions.IsTrue(api.ConditionTypePVCResizePending) {
upToDate = false
reason = "PVC is resizing"
return nil
2021-08-30 09:07:52 +00:00
}
}
return nil
})
return
}
// refreshMaintenanceTTL re-applies the agency maintenance mode once the
// maintenance grace period has elapsed since the MaintenanceMode condition was
// last updated, and touches that condition afterwards.
//
// It does nothing when the deployment runs in Single mode, when the
// Maintenance feature is disabled, or when the MaintenanceMode condition is
// missing or not true. The refresh is best effort: failures are logged as
// warnings and otherwise ignored.
func (d *Deployment) refreshMaintenanceTTL(ctx context.Context) {
	if d.apiObject.Spec.Mode.Get() == api.DeploymentModeSingle {
		return
	}

	if !features.Maintenance().Enabled() {
		// Maintenance feature is not enabled
		return
	}

	condition, ok := d.status.last.Conditions.Get(api.ConditionTypeMaintenanceMode)
	if !ok || !condition.IsTrue() {
		return
	}

	// Check GracePeriod
	if condition.LastUpdateTime.Add(d.apiObject.Spec.Timeouts.GetMaintenanceGracePeriod()).Before(time.Now()) {
		if err := d.SetAgencyMaintenanceMode(ctx, true); err != nil {
			d.deps.Log.Warn().Err(err).Msg("Unable to refresh agency maintenance mode")
			return
		}

		// Touch the condition so the grace period is measured from this refresh.
		if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
			return s.Conditions.Touch(api.ConditionTypeMaintenanceMode)
		}); err != nil {
			d.deps.Log.Warn().Err(err).Msg("Unable to touch maintenance mode condition")
			return
		}

		d.deps.Log.Info().Msgf("Refreshed maintenance lock")
	}
}
2021-04-26 08:30:06 +00:00
func (d *Deployment) ensureResources(ctx context.Context, lastInterval util.Interval, cachedStatus inspectorInterface.Inspector) (util.Interval, error) {
// Ensure all resources are created
if d.haveServiceMonitorCRD {
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureServiceMonitor(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Service monitor creation failed")
}
2018-03-26 11:35:00 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePVCs(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PVC creation failed")
2018-03-15 15:33:28 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePods(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePDBs(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PDB creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureAnnotations(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Annotation update failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureLabels(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Labels update failed")
}
return lastInterval, nil
2018-03-15 15:33:28 +00:00
}
// triggerInspection ensures that an inspection is run soon.
// It does so by firing the deployment's inspectTrigger.
// NOTE(review): presumably the inspection loop waits on inspectTrigger - confirm against its consumer.
func (d *Deployment) triggerInspection() {
d.inspectTrigger.Trigger()
}
2019-05-16 08:43:02 +00:00
// triggerCRDInspection ensures that a CRD inspection is run soon.
// It does so by firing the deployment's inspectCRDTrigger, which is separate
// from the inspectTrigger used by triggerInspection.
// NOTE(review): presumably a CRD lookup loop waits on inspectCRDTrigger - confirm against its consumer.
func (d *Deployment) triggerCRDInspection() {
d.inspectCRDTrigger.Trigger()
}
2022-01-15 22:55:08 +00:00
func (d *Deployment) updateConditionWithHash(ctx context.Context, conditionType api.ConditionType, status bool, reason, message, hash string) error {
d.deps.Log.Info().Str("condition", string(conditionType)).Bool("status", status).Str("reason", reason).Str("message", message).Str("hash", hash).Msg("Updated condition")
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
2022-01-15 22:55:08 +00:00
return s.Conditions.UpdateWithHash(conditionType, status, reason, message, hash)
}); err != nil {
return errors.Wrapf(err, "Unable to update condition")
}
return nil
}