1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00
kube-arangodb/pkg/deployment/deployment_inspector.go

398 lines
14 KiB
Go
Raw Normal View History

2018-03-15 15:33:28 +00:00
//
// DISCLAIMER
//
2021-04-26 08:30:06 +00:00
// Copyright 2020-2021 ArangoDB GmbH, Cologne, Germany
2018-03-15 15:33:28 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
2021-04-26 08:30:06 +00:00
// Author Tomasz Mielech
2018-03-15 15:33:28 +00:00
//
package deployment
import (
"context"
"time"
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
2021-01-08 14:35:38 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/errors"
2020-11-27 12:49:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
operatorErrors "github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
2020-03-11 07:57:03 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2019-11-04 07:49:24 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2018-08-31 14:08:21 +00:00
"github.com/arangodb/kube-arangodb/pkg/metrics"
2018-08-25 10:08:44 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2018-03-15 15:33:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-03-15 15:33:28 +00:00
)
2021-04-26 08:30:06 +00:00
// timeoutReconciliationPerNode is the amount of reconciliation time budgeted
// per Kubernetes node; getReconciliationTimeout multiplies it by the number
// of nodes returned by the node list call.
const timeoutReconciliationPerNode = 20 * time.Second
2018-08-31 14:08:21 +00:00
// inspectDeploymentDurationGauges is the gauge vector, labelled by deployment
// name, that inspectDeployment feeds with the duration of a single inspection.
var inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(
	metricsComponent,
	"inspect_deployment_duration",
	"Amount of time taken by a single inspection of a deployment (in sec)",
	metrics.DeploymentName,
)
2021-04-26 08:30:06 +00:00
// getReconciliationTimeout computes the timeout for one reconciliation loop.
// The timeout is timeoutReconciliationPerNode multiplied by the number of
// nodes returned by the Kubernetes node list call, but never less than one
// minute. An error is returned when the nodes cannot be listed.
func (d *Deployment) getReconciliationTimeout() (time.Duration, error) {
	listCtx, cancelList := context.WithTimeout(context.TODO(), k8sutil.GetRequestTimeout())
	defer cancelList()

	nodeList, err := d.GetKubeCli().CoreV1().Nodes().List(listCtx, metav1.ListOptions{})
	if err != nil {
		return 0, errors.Wrapf(err, "Unable to get nodes")
	}

	scaled := timeoutReconciliationPerNode * time.Duration(len(nodeList.Items))
	if scaled <= time.Minute {
		// The minimum timeout for the reconciliation loop.
		return time.Minute, nil
	}

	return scaled, nil
}
2018-03-15 15:33:28 +00:00
// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
// - the deployment has changed
// - any of the underlying resources has changed
// - once in a while
// Returns the delay until this function should be called again.
2018-08-30 14:57:08 +00:00
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval {
log := d.deps.Log
2018-08-31 14:08:21 +00:00
start := time.Now()
2021-04-26 08:30:06 +00:00
timeout, err := d.getReconciliationTimeout()
if err != nil {
log.Error().Err(err).Msg("Unable to get nodes")
return minInspectionInterval // Retry ASAP
}
ctxReconciliation, cancelReconciliation := context.WithTimeout(context.Background(), timeout)
defer cancelReconciliation()
defer func() {
d.deps.Log.Info().Msgf("Inspect loop took %s", time.Since(start))
}()
2018-03-15 15:33:28 +00:00
nextInterval := lastInterval
hasError := false
2021-04-26 08:30:06 +00:00
2018-08-31 14:08:21 +00:00
deploymentName := d.apiObject.GetName()
defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start)
2018-03-15 15:33:28 +00:00
cachedStatus, err := inspector.NewInspector(d.GetKubeCli(), d.GetMonitoringV1Cli(), d.GetArangoCli(), d.GetNamespace())
if err != nil {
log.Error().Err(err).Msg("Unable to get resources")
return minInspectionInterval // Retry ASAP
}
// Check deployment still exists
2021-04-26 08:30:06 +00:00
var updated *api.ArangoDeployment
err = k8sutil.RunWithTimeout(ctxReconciliation, func(ctxChild context.Context) error {
var err error
updated, err = d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(d.apiObject.GetNamespace()).Get(ctxChild, deploymentName, metav1.GetOptions{})
return err
})
if k8sutil.IsNotFound(err) {
// Deployment is gone
log.Info().Msg("Deployment is gone")
d.Delete()
return nextInterval
} else if updated != nil && updated.GetDeletionTimestamp() != nil {
// Deployment is marked for deletion
2021-04-26 08:30:06 +00:00
if err := d.runDeploymentFinalizers(ctxReconciliation, cachedStatus); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("ArangoDeployment finalizer inspection failed", err, d.apiObject))
}
} else {
2020-01-27 06:33:12 +00:00
// Check if maintenance annotation is set
if updated != nil && updated.Annotations != nil {
2020-03-11 07:57:03 +00:00
if v, ok := updated.Annotations[deployment.ArangoDeploymentPodMaintenanceAnnotation]; ok && v == "true" {
2020-01-27 06:33:12 +00:00
// Disable checks if we will enter maintenance mode
log.Info().Str("deployment", deploymentName).Msg("Deployment in maintenance mode")
return nextInterval
}
}
// Is the deployment in failed state, if so, give up.
2018-06-12 09:09:42 +00:00
if d.GetPhase() == api.DeploymentPhaseFailed {
log.Debug().Msg("Deployment is in Failed state.")
return nextInterval
}
2020-11-27 12:49:28 +00:00
d.apiObject = updated
2021-04-26 08:30:06 +00:00
inspectNextInterval, err := d.inspectDeploymentWithError(ctxReconciliation, nextInterval, cachedStatus)
if err != nil {
if !operatorErrors.IsReconcile(err) {
nextInterval = inspectNextInterval
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Reconcilation failed", err, d.apiObject))
} else {
nextInterval = minInspectionInterval
}
}
}
// Update next interval (on errors)
if hasError {
if d.recentInspectionErrors == 0 {
nextInterval = minInspectionInterval
d.recentInspectionErrors++
}
} else {
d.recentInspectionErrors = 0
}
return nextInterval.ReduceTo(maxInspectionInterval)
}
2018-03-20 12:10:52 +00:00
2021-04-26 08:30:06 +00:00
func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterval util.Interval,
cachedStatus inspectorInterface.Inspector) (nextInterval util.Interval, inspectError error) {
t := time.Now()
2021-02-10 08:17:52 +00:00
d.SetCachedStatus(cachedStatus)
defer d.SetCachedStatus(nil)
defer func() {
d.deps.Log.Info().Msgf("Reconciliation loop took %s", time.Since(t))
}()
// Ensure that spec and status checksum are same
spec := d.GetSpec()
status, _ := d.getStatus()
2018-03-15 15:33:28 +00:00
nextInterval = lastInterval
inspectError = nil
checksum, err := spec.Checksum()
if err != nil {
return minInspectionInterval, errors.Wrapf(err, "Calculation of spec failed")
} else {
condition, exists := status.Conditions.Get(api.ConditionTypeUpToDate)
if checksum != status.AppliedVersion && (!exists || condition.IsTrue()) {
2021-04-26 08:30:06 +00:00
if err = d.updateCondition(ctx, api.ConditionTypeUpToDate, false, "Spec Changed", "Spec Object changed. Waiting until plan will be applied"); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil // Retry ASAP
2018-09-07 11:18:39 +00:00
}
}
2018-09-07 11:18:39 +00:00
2020-12-15 11:41:14 +00:00
// Cleanup terminated pods on the beginning of loop
2021-04-26 08:30:06 +00:00
if x, err := d.resources.CleanupTerminatedPods(ctx, cachedStatus); err != nil {
2020-12-15 11:41:14 +00:00
return minInspectionInterval, errors.Wrapf(err, "Pod cleanup failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureArangoMembers(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "ArangoMember creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureServices(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Service creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureSecrets(ctx, d.deps.Log, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Secret creation failed")
}
// Inspect secret hashes
2021-04-26 08:30:06 +00:00
if err := d.resources.ValidateSecretHashes(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Secret hash validation failed")
}
2018-03-15 15:33:28 +00:00
// Check for LicenseKeySecret
if err := d.resources.ValidateLicenseKeySecret(cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "License Key Secret invalid")
}
// Is the deployment in a good state?
if status.Conditions.IsTrue(api.ConditionTypeSecretsChanged) {
2021-01-08 14:35:38 +00:00
return minInspectionInterval, errors.Newf("Secrets changed")
}
// Ensure we have image info
if retrySoon, exists, err := d.ensureImages(ctx, d.apiObject, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Image detection failed")
2020-06-26 06:53:24 +00:00
} else if retrySoon || !exists {
return minInspectionInterval, nil
}
// Inspection of generated resources needed
if x, err := d.resources.InspectPods(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod inspection failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
if x, err := d.resources.InspectPVCs(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PVC inspection failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
// Check members for resilience
2021-04-26 08:30:06 +00:00
if err := d.resilience.CheckMemberFailure(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Member failure detection failed")
}
// Immediate actions
2021-04-26 08:30:06 +00:00
if err := d.reconciler.CheckDeployment(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Reconciler immediate actions failed")
}
2021-04-26 08:30:06 +00:00
if interval, err := d.ensureResources(ctx, nextInterval, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Reconciler resource recreation failed")
} else {
nextInterval = interval
}
// Create scale/update plan
2020-11-27 12:49:28 +00:00
if _, ok := d.apiObject.Annotations[deployment.ArangoDeploymentPlanCleanAnnotation]; ok {
2021-04-26 08:30:06 +00:00
if err := d.ApplyPatch(ctx, patch.ItemRemove(patch.NewPath("metadata", "annotations", deployment.ArangoDeploymentPlanCleanAnnotation))); err != nil {
2020-11-27 12:49:28 +00:00
return minInspectionInterval, errors.Wrapf(err, "Unable to create remove annotation patch")
}
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
2020-11-27 12:49:28 +00:00
s.Plan = nil
return true
}, true); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable clean plan")
}
} else if err, updated := d.reconciler.CreatePlan(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Plan creation failed")
} else if updated {
return minInspectionInterval, nil
}
if d.apiObject.Status.Plan.IsEmpty() && status.AppliedVersion != checksum {
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
s.AppliedVersion = checksum
return true
}); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
} else if status.AppliedVersion == checksum {
if !status.Plan.IsEmpty() && status.Conditions.IsTrue(api.ConditionTypeUpToDate) {
2021-04-26 08:30:06 +00:00
if err = d.updateCondition(ctx, api.ConditionTypeUpToDate, false, "Plan is not empty", "There are pending operations in plan"); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
}
if status.Plan.IsEmpty() && !status.Conditions.IsTrue(api.ConditionTypeUpToDate) {
2021-04-26 08:30:06 +00:00
if err = d.updateCondition(ctx, api.ConditionTypeUpToDate, true, "Spec is Up To Date", "Spec is Up To Date"); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Unable to update UpToDate condition")
}
return minInspectionInterval, nil
}
}
// Execute current step of scale/update plan
retrySoon, err := d.reconciler.ExecutePlan(ctx, cachedStatus)
if err != nil {
return minInspectionInterval, errors.Wrapf(err, "Plan execution failed")
}
if retrySoon {
nextInterval = minInspectionInterval
}
2018-03-15 15:33:28 +00:00
// Create access packages
2021-04-26 08:30:06 +00:00
if err := d.createAccessPackages(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "AccessPackage creation failed")
}
2019-11-11 13:11:27 +00:00
// Inspect deployment for obsolete members
2021-04-26 08:30:06 +00:00
if err := d.resources.CleanupRemovedMembers(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Removed member cleanup failed")
}
// At the end of the inspect, we cleanup terminated pods.
2021-04-26 08:30:06 +00:00
if x, err := d.resources.CleanupTerminatedPods(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod cleanup failed")
} else {
nextInterval = nextInterval.ReduceTo(x)
}
return
}
2021-04-26 08:30:06 +00:00
func (d *Deployment) ensureResources(ctx context.Context, lastInterval util.Interval, cachedStatus inspectorInterface.Inspector) (util.Interval, error) {
// Ensure all resources are created
if d.haveServiceMonitorCRD {
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureServiceMonitor(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Service monitor creation failed")
}
2018-03-26 11:35:00 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePVCs(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PVC creation failed")
2018-03-15 15:33:28 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePods(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Pod creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsurePDBs(ctx); err != nil {
return minInspectionInterval, errors.Wrapf(err, "PDB creation failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureAnnotations(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Annotation update failed")
}
2021-04-26 08:30:06 +00:00
if err := d.resources.EnsureLabels(ctx, cachedStatus); err != nil {
return minInspectionInterval, errors.Wrapf(err, "Labels update failed")
}
return lastInterval, nil
2018-03-15 15:33:28 +00:00
}
// triggerInspection ensures that an inspection is run soon.
// It does so by firing d.inspectTrigger.
// NOTE(review): the consumer of inspectTrigger is not visible in this file —
// confirm that it is the deployment inspection loop.
func (d *Deployment) triggerInspection() {
	d.inspectTrigger.Trigger()
}
2019-05-16 08:43:02 +00:00
// triggerCRDInspection fires d.inspectCRDTrigger, which is a separate trigger
// from the one used by triggerInspection.
// NOTE(review): presumably this requests a CRD inspection to run soon; the
// consumer of inspectCRDTrigger is not visible in this file — confirm.
func (d *Deployment) triggerCRDInspection() {
	d.inspectCRDTrigger.Trigger()
}
2021-04-26 08:30:06 +00:00
func (d *Deployment) updateCondition(ctx context.Context, conditionType api.ConditionType, status bool, reason, message string) error {
d.deps.Log.Info().Str("condition", string(conditionType)).Bool("status", status).Str("reason", reason).Str("message", message).Msg("Updated condition")
2021-04-26 08:30:06 +00:00
if err := d.WithStatusUpdate(ctx, func(s *api.DeploymentStatus) bool {
return s.Conditions.Update(conditionType, status, reason, message)
}); err != nil {
return errors.Wrapf(err, "Unable to update condition")
}
return nil
}