//
// DISCLAIMER
//
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package deployment
import (
"context"
"time"
2020-06-08 11:30:32 +00:00
operatorErrors "github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources/inspector"
2020-04-01 13:38:03 +00:00
"github.com/pkg/errors"
2020-03-11 07:57:03 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2019-11-04 07:49:24 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2018-08-31 14:08:21 +00:00
"github.com/arangodb/kube-arangodb/pkg/metrics"
2018-08-25 10:08:44 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2018-03-15 15:33:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2018-04-03 15:43:42 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-03-15 15:33:28 +00:00
)
2018-08-31 14:08:21 +00:00
// inspectDeploymentDurationGauges is the gauge vector, labelled by deployment
// name, that records how long a single inspection of a deployment took
// (in seconds).
var inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(
	metricsComponent,
	"inspect_deployment_duration",
	"Amount of time taken by a single inspection of a deployment (in sec)",
	metrics.DeploymentName,
)
2018-03-15 15:33:28 +00:00
// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
// - the deployment has changed
// - any of the underlying resources has changed
// - once in a while
// Returns the delay until this function should be called again.
2018-08-30 14:57:08 +00:00
func ( d * Deployment ) inspectDeployment ( lastInterval util . Interval ) util . Interval {
2018-03-27 10:11:57 +00:00
log := d . deps . Log
2018-08-31 14:08:21 +00:00
start := time . Now ( )
2020-06-08 11:30:32 +00:00
defer func ( ) {
d . deps . Log . Info ( ) . Msgf ( "Inspect loop took %s" , time . Since ( start ) )
} ( )
2018-03-15 15:33:28 +00:00
nextInterval := lastInterval
hasError := false
ctx := context . Background ( )
2018-08-31 14:08:21 +00:00
deploymentName := d . apiObject . GetName ( )
defer metrics . SetDuration ( inspectDeploymentDurationGauges . WithLabelValues ( deploymentName ) , start )
2018-03-15 15:33:28 +00:00
2020-07-30 06:35:24 +00:00
cachedStatus , err := inspector . NewInspector ( d . GetKubeCli ( ) , d . GetMonitoringV1Cli ( ) , d . GetNamespace ( ) )
2020-06-08 11:30:32 +00:00
if err != nil {
log . Error ( ) . Err ( err ) . Msg ( "Unable to get resources" )
return minInspectionInterval // Retry ASAP
}
2018-04-03 15:43:42 +00:00
// Check deployment still exists
2019-11-04 07:49:24 +00:00
updated , err := d . deps . DatabaseCRCli . DatabaseV1 ( ) . ArangoDeployments ( d . apiObject . GetNamespace ( ) ) . Get ( deploymentName , metav1 . GetOptions { } )
2018-06-07 14:22:02 +00:00
if k8sutil . IsNotFound ( err ) {
2018-04-03 15:43:42 +00:00
// Deployment is gone
log . Info ( ) . Msg ( "Deployment is gone" )
d . Delete ( )
return nextInterval
2018-06-07 14:22:02 +00:00
} else if updated != nil && updated . GetDeletionTimestamp ( ) != nil {
// Deployment is marked for deletion
2020-06-08 11:30:32 +00:00
if err := d . runDeploymentFinalizers ( ctx , cachedStatus ) ; err != nil {
2018-06-07 14:22:02 +00:00
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "ArangoDeployment finalizer inspection failed" , err , d . apiObject ) )
}
} else {
2020-01-27 06:33:12 +00:00
// Check if maintenance annotation is set
if updated != nil && updated . Annotations != nil {
2020-03-11 07:57:03 +00:00
if v , ok := updated . Annotations [ deployment . ArangoDeploymentPodMaintenanceAnnotation ] ; ok && v == "true" {
2020-01-27 06:33:12 +00:00
// Disable checks if we will enter maintenance mode
log . Info ( ) . Str ( "deployment" , deploymentName ) . Msg ( "Deployment in maintenance mode" )
return nextInterval
}
}
2018-06-07 14:22:02 +00:00
// Is the deployment in failed state, if so, give up.
2018-06-12 09:09:42 +00:00
if d . GetPhase ( ) == api . DeploymentPhaseFailed {
2018-06-07 14:22:02 +00:00
log . Debug ( ) . Msg ( "Deployment is in Failed state." )
return nextInterval
}
2018-03-27 10:11:57 +00:00
2020-06-08 11:30:32 +00:00
if inspectNextInterval , err := d . inspectDeploymentWithError ( ctx , nextInterval , cachedStatus ) ; err != nil {
if ! operatorErrors . IsReconcile ( err ) {
nextInterval = inspectNextInterval
hasError = true
2018-03-27 10:11:57 +00:00
2020-06-08 11:30:32 +00:00
d . CreateEvent ( k8sutil . NewErrorEvent ( "Reconcilation failed" , err , d . apiObject ) )
} else {
nextInterval = minInspectionInterval
}
2018-06-07 14:22:02 +00:00
}
2020-04-01 13:38:03 +00:00
}
2018-03-27 10:11:57 +00:00
2020-04-01 13:38:03 +00:00
// Update next interval (on errors)
if hasError {
if d . recentInspectionErrors == 0 {
2018-06-07 14:22:02 +00:00
nextInterval = minInspectionInterval
2020-04-01 13:38:03 +00:00
d . recentInspectionErrors ++
2018-06-07 14:22:02 +00:00
}
2020-04-01 13:38:03 +00:00
} else {
d . recentInspectionErrors = 0
}
return nextInterval . ReduceTo ( maxInspectionInterval )
}
2018-03-20 12:10:52 +00:00
2020-06-08 11:30:32 +00:00
func ( d * Deployment ) inspectDeploymentWithError ( ctx context . Context , lastInterval util . Interval , cachedStatus inspector . Inspector ) ( nextInterval util . Interval , inspectError error ) {
t := time . Now ( )
defer func ( ) {
d . deps . Log . Info ( ) . Msgf ( "Reconciliation loop took %s" , time . Since ( t ) )
} ( )
2020-04-01 13:38:03 +00:00
// Ensure that spec and status checksum are same
spec := d . GetSpec ( )
status , _ := d . getStatus ( )
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
nextInterval = lastInterval
inspectError = nil
2018-03-29 09:56:57 +00:00
2020-04-01 13:38:03 +00:00
checksum , err := spec . Checksum ( )
if err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Calculation of spec failed" )
} else {
condition , exists := status . Conditions . Get ( api . ConditionTypeUpToDate )
2020-04-16 05:57:48 +00:00
if checksum != status . AppliedVersion && ( ! exists || condition . IsTrue ( ) ) {
if err = d . updateCondition ( api . ConditionTypeUpToDate , false , "Spec Changed" , "Spec Object changed. Waiting until plan will be applied" ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
2019-03-22 09:48:42 +00:00
2020-04-01 13:38:03 +00:00
return minInspectionInterval , nil // Retry ASAP
2018-09-07 11:18:39 +00:00
}
2020-04-01 13:38:03 +00:00
}
2018-09-07 11:18:39 +00:00
2020-06-15 10:39:05 +00:00
if err := d . resources . EnsureSecrets ( d . deps . Log , cachedStatus ) ; err != nil {
2020-06-08 11:30:32 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Secret creation failed" )
}
if err := d . resources . EnsureServices ( cachedStatus ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Service creation failed" )
}
2020-04-01 13:38:03 +00:00
// Inspect secret hashes
2020-06-08 11:30:32 +00:00
if err := d . resources . ValidateSecretHashes ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Secret hash validation failed" )
}
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
// Check for LicenseKeySecret
2020-06-08 11:30:32 +00:00
if err := d . resources . ValidateLicenseKeySecret ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "License Key Secret invalid" )
}
// Is the deployment in a good state?
if status . Conditions . IsTrue ( api . ConditionTypeSecretsChanged ) {
return minInspectionInterval , errors . Errorf ( "Secrets changed" )
}
// Ensure we have image info
2020-06-26 06:53:24 +00:00
if retrySoon , exists , err := d . ensureImages ( d . apiObject ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Image detection failed" )
2020-06-26 06:53:24 +00:00
} else if retrySoon || ! exists {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , nil
}
// Inspection of generated resources needed
2020-06-08 11:30:32 +00:00
if x , err := d . resources . InspectPods ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod inspection failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
2020-06-08 11:30:32 +00:00
if x , err := d . resources . InspectPVCs ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "PVC inspection failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
// Check members for resilience
if err := d . resilience . CheckMemberFailure ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Member failure detection failed" )
}
// Immediate actions
if err := d . reconciler . CheckDeployment ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Reconciler immediate actions failed" )
}
2020-06-08 11:30:32 +00:00
if interval , err := d . ensureResources ( nextInterval , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Reconciler resource recreation failed" )
} else {
nextInterval = interval
}
// Create scale/update plan
2020-06-08 11:30:32 +00:00
if err , updated := d . reconciler . CreatePlan ( ctx , cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Plan creation failed" )
2020-04-16 05:57:48 +00:00
} else if updated {
return minInspectionInterval , nil
}
if d . apiObject . Status . Plan . IsEmpty ( ) && status . AppliedVersion != checksum {
if err := d . WithStatusUpdate ( func ( s * api . DeploymentStatus ) bool {
s . AppliedVersion = checksum
return true
} ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
} else if status . AppliedVersion == checksum {
if ! status . Plan . IsEmpty ( ) && status . Conditions . IsTrue ( api . ConditionTypeUpToDate ) {
if err = d . updateCondition ( api . ConditionTypeUpToDate , false , "Plan is not empty" , "There are pending operations in plan" ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
}
if status . Plan . IsEmpty ( ) && ! status . Conditions . IsTrue ( api . ConditionTypeUpToDate ) {
if err = d . updateCondition ( api . ConditionTypeUpToDate , true , "Spec is Up To Date" , "Spec is Up To Date" ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Unable to update UpToDate condition" )
}
return minInspectionInterval , nil
}
2020-04-01 13:38:03 +00:00
}
// Execute current step of scale/update plan
2020-06-08 11:30:32 +00:00
retrySoon , err := d . reconciler . ExecutePlan ( ctx , cachedStatus )
2020-04-01 13:38:03 +00:00
if err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Plan execution failed" )
}
if retrySoon {
nextInterval = minInspectionInterval
}
2018-03-15 15:33:28 +00:00
2020-04-01 13:38:03 +00:00
// Create access packages
if err := d . createAccessPackages ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "AccessPackage creation failed" )
}
2019-11-11 13:11:27 +00:00
2020-04-01 13:38:03 +00:00
// Inspect deployment for obsolete members
if err := d . resources . CleanupRemovedMembers ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Removed member cleanup failed" )
}
2019-01-14 13:30:39 +00:00
2020-04-01 13:38:03 +00:00
// At the end of the inspect, we cleanup terminated pods.
2020-06-08 11:30:32 +00:00
if x , err := d . resources . CleanupTerminatedPods ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod cleanup failed" )
} else {
nextInterval = nextInterval . ReduceTo ( x )
}
2019-09-27 11:04:23 +00:00
2020-04-01 13:38:03 +00:00
return
}
2020-06-08 11:30:32 +00:00
func ( d * Deployment ) ensureResources ( lastInterval util . Interval , cachedStatus inspector . Inspector ) ( util . Interval , error ) {
2020-04-01 13:38:03 +00:00
// Ensure all resources are created
if d . haveServiceMonitorCRD {
if err := d . resources . EnsureServiceMonitor ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Service monitor creation failed" )
2018-06-07 14:22:02 +00:00
}
2018-03-26 11:35:00 +00:00
}
2020-06-08 11:30:32 +00:00
if err := d . resources . EnsurePVCs ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "PVC creation failed" )
2018-03-15 15:33:28 +00:00
}
2020-06-08 11:30:32 +00:00
if err := d . resources . EnsurePods ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Pod creation failed" )
}
2020-06-08 11:30:32 +00:00
2020-04-01 13:38:03 +00:00
if err := d . resources . EnsurePDBs ( ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "PDB creation failed" )
}
2020-06-08 11:30:32 +00:00
if err := d . resources . EnsureAnnotations ( cachedStatus ) ; err != nil {
2020-04-01 13:38:03 +00:00
return minInspectionInterval , errors . Wrapf ( err , "Annotation update failed" )
}
2020-06-23 08:02:26 +00:00
if err := d . resources . EnsureLabels ( cachedStatus ) ; err != nil {
return minInspectionInterval , errors . Wrapf ( err , "Labels update failed" )
}
2020-04-01 13:38:03 +00:00
return lastInterval , nil
2018-03-15 15:33:28 +00:00
}
2018-03-23 14:36:10 +00:00
// triggerInspection ensures that an inspection is run soon.
// It does so by firing d.inspectTrigger; presumably the deployment's run
// loop waits on that trigger — confirm against the caller.
func ( d * Deployment ) triggerInspection ( ) {
d . inspectTrigger . Trigger ( )
}
2019-05-16 08:43:02 +00:00
// triggerCRDInspection fires d.inspectCRDTrigger, the trigger that is kept
// separately from the regular inspection trigger.
// NOTE(review): presumably this requests a prompt CRD inspection from the
// run loop — confirm against the consumer of inspectCRDTrigger.
func ( d * Deployment ) triggerCRDInspection ( ) {
d . inspectCRDTrigger . Trigger ( )
}
2020-04-16 05:57:48 +00:00
// updateCondition sets the condition of the given type on the deployment
// status through WithStatusUpdate.
//
// The condition type, status, reason and message are logged at info level
// before the update is attempted. A failing status update is returned wrapped
// with "Unable to update condition"; otherwise nil is returned.
func (d *Deployment) updateCondition(conditionType api.ConditionType, status bool, reason, message string) error {
	d.deps.Log.Info().
		Str("condition", string(conditionType)).
		Bool("status", status).
		Str("reason", reason).
		Str("message", message).
		Msg("Updated condition")

	// The callback's boolean result is whatever Conditions.Update reports.
	applyCondition := func(s *api.DeploymentStatus) bool {
		return s.Conditions.Update(conditionType, status, reason, message)
	}

	err := d.WithStatusUpdate(applyCondition)
	if err == nil {
		return nil
	}
	return errors.Wrapf(err, "Unable to update condition")
}