2018-03-15 15:33:28 +00:00
//
// DISCLAIMER
//
2021-04-26 08:30:06 +00:00
// Copyright 2020-2021 ArangoDB GmbH, Cologne, Germany
2018-03-15 15:33:28 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
2021-04-26 08:30:06 +00:00
// Author Tomasz Mielech
2018-03-15 15:33:28 +00:00
//
package deployment
import (
"context"
"time"
2021-03-10 13:30:47 +00:00
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
2021-01-08 14:35:38 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/errors"
2020-11-27 12:49:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
2020-06-08 11:30:32 +00:00
operatorErrors "github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
2020-03-11 07:57:03 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2019-11-04 07:49:24 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2018-08-31 14:08:21 +00:00
"github.com/arangodb/kube-arangodb/pkg/metrics"
2018-08-25 10:08:44 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2018-03-15 15:33:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2018-04-03 15:43:42 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-03-15 15:33:28 +00:00
)
2021-04-26 08:30:06 +00:00
const (
	// timeoutReconciliationPerNode is the amount of reconciliation time granted per
	// Kubernetes node; getReconciliationTimeout multiplies it by the node count.
	timeoutReconciliationPerNode = 20 * time.Second
)
2018-08-31 14:08:21 +00:00
var (
	// inspectDeploymentDurationGauges holds the "inspect_deployment_duration" gauges,
	// labelled by deployment name. inspectDeployment records into it once per run.
	inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(
		metricsComponent,
		"inspect_deployment_duration",
		"Amount of time taken by a single inspection of a deployment (in sec)",
		metrics.DeploymentName,
	)
)
2021-04-26 08:30:06 +00:00
// getReconciliationTimeout computes the time budget for one reconciliation loop.
//
// It lists the Kubernetes nodes (the list call itself is bounded by
// k8sutil.GetRequestTimeout) and grants timeoutReconciliationPerNode per node.
// The result is never less than one minute.
// An error is returned when the node list cannot be fetched.
func (d *Deployment) getReconciliationTimeout() (time.Duration, error) {
	listCtx, cancelList := context.WithTimeout(context.TODO(), k8sutil.GetRequestTimeout())
	defer cancelList()

	nodeList, err := d.GetKubeCli().CoreV1().Nodes().List(listCtx, metav1.ListOptions{})
	if err != nil {
		return 0, errors.Wrapf(err, "Unable to get nodes")
	}

	budget := timeoutReconciliationPerNode * time.Duration(len(nodeList.Items))
	if budget <= time.Minute {
		// The minimum timeout for the reconciliation loop.
		return time.Minute, nil
	}

	return budget, nil
}
2018-03-15 15:33:28 +00:00
// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
// - the deployment has changed
// - any of the underlying resources has changed
// - once in a while
// Returns the delay until this function should be called again.
2018-08-30 14:57:08 +00:00
func ( d * Deployment ) inspectDeployment ( lastInterval util . Interval ) util . Interval {
2018-03-27 10:11:57 +00:00
log := d . deps . Log
2018-08-31 14:08:21 +00:00
start := time . Now ( )
2021-04-26 08:30:06 +00:00
timeout , err := d . getReconciliationTimeout ( )
if err != nil {
log . Error ( ) . Err ( err ) . Msg ( "Unable to get nodes" )
return minInspectionInterval // Retry ASAP
}
ctxReconciliation , cancelReconciliation := context . WithTimeout ( context . Background ( ) , timeout )
defer cancelReconciliation ( )
2020-06-08 11:30:32 +00:00
defer func ( ) {
d . deps . Log . Info ( ) . Msgf ( "Inspect loop took %s" , time . Since ( start ) )
} ( )
2018-03-15 15:33:28 +00:00
nextInterval := lastInterval
hasError := false
2021-04-26 08:30:06 +00:00
2018-08-31 14:08:21 +00:00
deploymentName := d . apiObject . GetName ( )
defer metrics . SetDuration ( inspectDeploymentDurationGauges . WithLabelValues ( deploymentName ) , start )
2018-03-15 15:33:28 +00:00
2021-03-10 13:30:47 +00:00
cachedStatus , err := inspector . NewInspector ( d . GetKubeCli ( ) , d . GetMonitoringV1Cli ( ) , d . GetArangoCli ( ) , d . GetNamespace ( ) )
2020-06-08 11:30:32 +00:00
if err != nil {
log . Error ( ) . Err ( err ) . Msg ( "Unable to get resources" )
return minInspectionInterval // Retry ASAP
}
2018-04-03 15:43:42 +00:00
// Check deployment still exists
2021-04-26 08:30:06 +00:00
var updated * api . ArangoDeployment
err = k8sutil . RunWithTimeout ( ctxReconciliation , func ( ctxChild context . Context ) error {
var err error
updated , err = d . deps . DatabaseCRCli . DatabaseV1 ( ) . ArangoDeployments ( d . apiObject . GetNamespace ( ) ) . Get ( ctxChild , deploymentName , metav1 . GetOptions { } )
return err
} )
2018-06-07 14:22:02 +00:00
if k8sutil . IsNotFound ( err ) {
2018-04-03 15:43:42 +00:00
// Deployment is gone
log . Info ( ) . Msg ( "Deployment is gone" )
d . Delete ( )
return nextInterval
2018-06-07 14:22:02 +00:00
} else if updated != nil && updated . GetDeletionTimestamp ( ) != nil {
// Deployment is marked for deletion
2021-04-26 08:30:06 +00:00
if err := d . runDeploymentFinalizers ( ctxReconciliation , cachedStatus ) ; err != nil {
2018-06-07 14:22:02 +00:00
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "ArangoDeployment finalizer inspection failed" , err , d . apiObject ) )
}
} else {
2020-01-27 06:33:12 +00:00
// Check if maintenance annotation is set
if updated != nil && updated . Annotations != nil {
2020-03-11 07:57:03 +00:00
if v , ok := updated . Annotations [ deployment . ArangoDeploymentPodMaintenanceAnnotation ] ; ok && v == "true" {
2020-01-27 06:33:12 +00:00
// Disable checks if we will enter maintenance mode
log . Info ( ) . Str ( "deployment" , deploymentName ) . Msg ( "Deployment in maintenance mode" )
return nextInterval
}
}
2018-06-07 14:22:02 +00:00
// Is the deployment in failed state, if so, give up.
2018-06-12 09:09:42 +00:00
if d . GetPhase ( ) == api . DeploymentPhaseFailed {
2018-06-07 14:22:02 +00:00
log . Debug ( ) . Msg ( "Deployment is in Failed state." )
return nextInterval
}
2018-03-27 10:11:57 +00:00
2020-11-27 12:49:28 +00:00
d . apiObject = updated
2021-04-26 08:30:06 +00:00
inspectNextInterval , err := d . inspectDeploymentWithError ( ctxReconciliation , nextInterval , cachedStatus )
if err != nil {
2020-06-08 11:30:32 +00:00
if ! operatorErrors . IsReconcile ( err ) {
nextInterval = inspectNextInterval
hasError = true
2018-03-27 10:11:57 +00:00
2020-06-08 11:30:32 +00:00
d . CreateEvent ( k8sutil . NewErrorEvent ( "Reconcilation failed" , err , d . apiObject ) )
} else {
nextInterval = minInspectionInterval
}
2018-06-07 14:22:02 +00:00
}
2020-04-01 13:38:03 +00:00
}
2018-03-27 10:11:57 +00:00
2020-04-01 13:38:03 +00:00
// Update next interval (on errors)
if hasError {
if d . recentInspectionErrors == 0 {
2018-06-07 14:22:02 +00:00
nextInterval = minInspectionInterval
2020-04-01 13:38:03 +00:00
d . recentInspectionErrors ++
2018-06-07 14:22:02 +00:00
}
2020-04-01 13:38:03 +00:00
} else {
d . recentInspectionErrors = 0
}
return nextInterval . ReduceTo ( maxInspectionInterval )
}
2018-03-20 12:10:52 +00:00
2021-04-26 08:30:06 +00:00
func ( d * Deployment ) inspectDeploymentWithError ( ctx context . Context , lastInterval util . Interval ,
cachedStatus inspectorInterface . Inspector ) ( nextInterval util . Interval , inspectError error ) {
2020-06-08 11:30:32 +00:00
t := time . Now ( )
2021-02-10 08:17:52 +00:00
2021-02-19 11:19:40 +00:00
d . SetCachedStatus ( cachedStatus )
defer d . SetCachedStatus ( nil )
2020-06-08 11:30:32 +00:00
defer func ( ) {
d . deps . Log . Info ( ) . Msgf ( "Reconciliation loop took %s" , time . Since ( t ) )
} ( )
2020-04-01 13:38:03 +00:00
// Ensure that spec and status checksum are same
spec := d . GetSpec ( )
status , _ := d . getStatus ( )
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
nextInterval = lastInterval
inspectError = nil
2018-03-29 09:56:57 +00:00
2020-04-01 13:38:03 +00:00
checksum , err := spec . Checksum ( )
if err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Calculation of spec failed" )
} else {
condition , exists := status . Conditions . Get ( api . ConditionTypeUpToDate )
2020-04-16 05:57:48 +00:00
if checksum != status . AppliedVersion && ( ! exists || condition . IsTrue ( ) ) {
2021-04-26 08:30:06 +00:00
if err = d . updateCondition ( ctx , api . ConditionTypeUpToDate , false , "Spec Changed" , "Spec Object changed. Waiting until plan will be applied" ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
2019-03-22 09:48:42 +00:00
2020-04-01 13:38:03 +00:00
return minInspectionInterval , nil // Retry ASAP
2018-09-07 11:18:39 +00:00
}
2020-04-01 13:38:03 +00:00
}
2018-09-07 11:18:39 +00:00
2020-12-15 11:41:14 +00:00
// Cleanup terminated pods on the beginning of loop
2021-04-26 08:30:06 +00:00
if x , err := d . resources . CleanupTerminatedPods ( ctx , cachedStatus ) ; err != nil {
2020-12-15 11:41:14 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod cleanup failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureArangoMembers ( ctx , cachedStatus ) ; err != nil {
2021-03-10 13:30:47 +00:00
return minInspectionInterval , errors . Wrapf ( err , "ArangoMember creation failed" )
2020-06-08 11:30:32 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureServices ( ctx , cachedStatus ) ; err != nil {
2020-06-08 11:30:32 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Service creation failed" )
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureSecrets ( ctx , d . deps . Log , cachedStatus ) ; err != nil {
2021-03-10 13:30:47 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Secret creation failed" )
}
2020-04-01 13:38:03 +00:00
// Inspect secret hashes
2021-04-26 08:30:06 +00:00
if err := d . resources . ValidateSecretHashes ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Secret hash validation failed" )
}
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
// Check for LicenseKeySecret
2020-06-08 11:30:32 +00:00
if err := d . resources . ValidateLicenseKeySecret ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "License Key Secret invalid" )
}
// Is the deployment in a good state?
if status . Conditions . IsTrue ( api . ConditionTypeSecretsChanged ) {
2021-01-08 14:35:38 +00:00
return minInspectionInterval , errors . Newf ( "Secrets changed" )
2020-04-01 13:38:03 +00:00
}
// Ensure we have image info
2021-05-18 12:26:32 +00:00
if retrySoon , exists , err := d . ensureImages ( ctx , d . apiObject , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Image detection failed" )
2020-06-26 06:53:24 +00:00
} else if retrySoon || ! exists {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , nil
}
// Inspection of generated resources needed
2020-06-08 11:30:32 +00:00
if x , err := d . resources . InspectPods ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod inspection failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
2020-06-08 11:30:32 +00:00
if x , err := d . resources . InspectPVCs ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "PVC inspection failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
// Check members for resilience
2021-04-26 08:30:06 +00:00
if err := d . resilience . CheckMemberFailure ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Member failure detection failed" )
}
// Immediate actions
2021-04-26 08:30:06 +00:00
if err := d . reconciler . CheckDeployment ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Reconciler immediate actions failed" )
}
2021-04-26 08:30:06 +00:00
if interval , err := d . ensureResources ( ctx , nextInterval , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Reconciler resource recreation failed" )
} else {
nextInterval = interval
}
// Create scale/update plan
2020-11-27 12:49:28 +00:00
if _ , ok := d . apiObject . Annotations [ deployment . ArangoDeploymentPlanCleanAnnotation ] ; ok {
2021-04-26 08:30:06 +00:00
if err := d . ApplyPatch ( ctx , patch . ItemRemove ( patch . NewPath ( "metadata" , "annotations" , deployment . ArangoDeploymentPlanCleanAnnotation ) ) ) ; err != nil {
2020-11-27 12:49:28 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Unable to create remove annotation patch" )
}
2021-04-26 08:30:06 +00:00
if err := d . WithStatusUpdate ( ctx , func ( s * api . DeploymentStatus ) bool {
2020-11-27 12:49:28 +00:00
s . Plan = nil
return true
} , true ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Unable clean plan" )
}
} else if err , updated := d . reconciler . CreatePlan ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Plan creation failed" )
2020-04-16 05:57:48 +00:00
} else if updated {
return minInspectionInterval , nil
}
if d . apiObject . Status . Plan . IsEmpty ( ) && status . AppliedVersion != checksum {
2021-04-26 08:30:06 +00:00
if err := d . WithStatusUpdate ( ctx , func ( s * api . DeploymentStatus ) bool {
2020-04-16 05:57:48 +00:00
s . AppliedVersion = checksum
return true
} ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
} else if status . AppliedVersion == checksum {
if ! status . Plan . IsEmpty ( ) && status . Conditions . IsTrue ( api . ConditionTypeUpToDate ) {
2021-04-26 08:30:06 +00:00
if err = d . updateCondition ( ctx , api . ConditionTypeUpToDate , false , "Plan is not empty" , "There are pending operations in plan" ) ; err != nil {
2020-04-16 05:57:48 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
}
if status . Plan . IsEmpty ( ) && ! status . Conditions . IsTrue ( api . ConditionTypeUpToDate ) {
2021-04-26 08:30:06 +00:00
if err = d . updateCondition ( ctx , api . ConditionTypeUpToDate , true , "Spec is Up To Date" , "Spec is Up To Date" ) ; err != nil {
2020-04-16 05:57:48 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
}
2020-04-01 13:38:03 +00:00
}
// Execute current step of scale/update plan
2020-06-08 11:30:32 +00:00
retrySoon , err := d . reconciler . ExecutePlan ( ctx , cachedStatus )
2020-04-01 13:38:03 +00:00
if err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Plan execution failed" )
}
if retrySoon {
nextInterval = minInspectionInterval
}
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
// Create access packages
2021-04-26 08:30:06 +00:00
if err := d . createAccessPackages ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "AccessPackage creation failed" )
}
2019-11-11 13:11:27 +00:00
2020-04-01 13:38:03 +00:00
// Inspect deployment for obsolete members
2021-04-26 08:30:06 +00:00
if err := d . resources . CleanupRemovedMembers ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Removed member cleanup failed" )
}
2019-01-14 13:30:39 +00:00
2020-04-01 13:38:03 +00:00
// At the end of the inspect, we cleanup terminated pods.
2021-04-26 08:30:06 +00:00
if x , err := d . resources . CleanupTerminatedPods ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod cleanup failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
2019-09-27 11:04:23 +00:00
2020-04-01 13:38:03 +00:00
return
}
2021-04-26 08:30:06 +00:00
func ( d * Deployment ) ensureResources ( ctx context . Context , lastInterval util . Interval , cachedStatus inspectorInterface . Inspector ) ( util . Interval , error ) {
2020-04-01 13:38:03 +00:00
// Ensure all resources are created
if d . haveServiceMonitorCRD {
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureServiceMonitor ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Service monitor creation failed" )
2018-06-07 14:22:02 +00:00
}
2018-03-26 11:35:00 +00:00
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsurePVCs ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "PVC creation failed" )
2018-03-15 15:33:28 +00:00
}
2020-06-08 11:30:32 +00:00
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsurePods ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod creation failed" )
}
2020-06-08 11:30:32 +00:00
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsurePDBs ( ctx ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "PDB creation failed" )
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureAnnotations ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Annotation update failed" )
}
2021-04-26 08:30:06 +00:00
if err := d . resources . EnsureLabels ( ctx , cachedStatus ) ; err != nil {
2020-06-23 08:02:26 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Labels update failed" )
}
2020-04-01 13:38:03 +00:00
return lastInterval , nil
2018-03-15 15:33:28 +00:00
}
2018-03-23 14:36:10 +00:00
// triggerInspection fires the deployment's inspectTrigger, requesting that an
// inspection of the deployment is run soon.
func (d *Deployment) triggerInspection() {
	d.inspectTrigger.Trigger()
}
2019-05-16 08:43:02 +00:00
// triggerCRDInspection fires the deployment's inspectCRDTrigger, requesting that
// a CRD inspection is run soon.
func (d *Deployment) triggerCRDInspection() {
	d.inspectCRDTrigger.Trigger()
}
2020-04-16 05:57:48 +00:00
2021-04-26 08:30:06 +00:00
func ( d * Deployment ) updateCondition ( ctx context . Context , conditionType api . ConditionType , status bool , reason , message string ) error {
2020-04-16 05:57:48 +00:00
d . deps . Log . Info ( ) . Str ( "condition" , string ( conditionType ) ) . Bool ( "status" , status ) . Str ( "reason" , reason ) . Str ( "message" , message ) . Msg ( "Updated condition" )
2021-04-26 08:30:06 +00:00
if err := d . WithStatusUpdate ( ctx , func ( s * api . DeploymentStatus ) bool {
2020-04-16 05:57:48 +00:00
return s . Conditions . Update ( conditionType , status , reason , message )
} ) ; err != nil {
return errors . Wrapf ( err , "Unable to update condition" )
}
return nil
}