2018-03-15 15:33:28 +00:00
//
// DISCLAIMER
//
2020-03-04 10:25:14 +00:00
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
2018-03-15 15:33:28 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package deployment
import (
"context"
"time"
2020-03-11 07:57:03 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2019-11-04 07:49:24 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2018-08-31 14:08:21 +00:00
"github.com/arangodb/kube-arangodb/pkg/metrics"
2018-08-25 10:08:44 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2018-03-15 15:33:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2018-04-03 15:43:42 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-03-15 15:33:28 +00:00
)
2018-08-31 14:08:21 +00:00
// inspectDeploymentDurationGauges is the gauge vector, labelled by deployment
// name, into which inspectDeployment records how long one inspection took.
var inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(
	metricsComponent,
	"inspect_deployment_duration",
	"Amount of time taken by a single inspection of a deployment (in sec)",
	metrics.DeploymentName,
)
2018-03-15 15:33:28 +00:00
// inspectDeployment inspects the entire deployment, creates
// a plan to update if needed and inspects underlying resources.
// This function should be called when:
// - the deployment has changed
// - any of the underlying resources has changed
// - once in a while
// Returns the delay until this function should be called again.
2018-08-30 14:57:08 +00:00
func ( d * Deployment ) inspectDeployment ( lastInterval util . Interval ) util . Interval {
2018-03-27 10:11:57 +00:00
log := d . deps . Log
2018-08-31 14:08:21 +00:00
start := time . Now ( )
2018-03-15 15:33:28 +00:00
nextInterval := lastInterval
hasError := false
ctx := context . Background ( )
2018-08-31 14:08:21 +00:00
deploymentName := d . apiObject . GetName ( )
defer metrics . SetDuration ( inspectDeploymentDurationGauges . WithLabelValues ( deploymentName ) , start )
2018-03-15 15:33:28 +00:00
2018-04-03 15:43:42 +00:00
// Check deployment still exists
2019-11-04 07:49:24 +00:00
updated , err := d . deps . DatabaseCRCli . DatabaseV1 ( ) . ArangoDeployments ( d . apiObject . GetNamespace ( ) ) . Get ( deploymentName , metav1 . GetOptions { } )
2018-06-07 14:22:02 +00:00
if k8sutil . IsNotFound ( err ) {
2018-04-03 15:43:42 +00:00
// Deployment is gone
log . Info ( ) . Msg ( "Deployment is gone" )
d . Delete ( )
return nextInterval
2018-06-07 14:22:02 +00:00
} else if updated != nil && updated . GetDeletionTimestamp ( ) != nil {
// Deployment is marked for deletion
if err := d . runDeploymentFinalizers ( ctx ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "ArangoDeployment finalizer inspection failed" , err , d . apiObject ) )
}
} else {
2020-01-27 06:33:12 +00:00
// Check if maintenance annotation is set
if updated != nil && updated . Annotations != nil {
2020-03-11 07:57:03 +00:00
if v , ok := updated . Annotations [ deployment . ArangoDeploymentPodMaintenanceAnnotation ] ; ok && v == "true" {
2020-01-27 06:33:12 +00:00
// Disable checks if we will enter maintenance mode
log . Info ( ) . Str ( "deployment" , deploymentName ) . Msg ( "Deployment in maintenance mode" )
return nextInterval
}
}
2018-06-07 14:22:02 +00:00
// Is the deployment in failed state, if so, give up.
2018-06-12 09:09:42 +00:00
if d . GetPhase ( ) == api . DeploymentPhaseFailed {
2018-06-07 14:22:02 +00:00
log . Debug ( ) . Msg ( "Deployment is in Failed state." )
return nextInterval
}
2018-03-27 10:11:57 +00:00
2018-06-07 14:22:02 +00:00
// Inspect secret hashes
if err := d . resources . ValidateSecretHashes ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Secret hash validation failed" , err , d . apiObject ) )
}
2018-03-27 10:11:57 +00:00
2018-12-03 11:06:10 +00:00
// Check for LicenseKeySecret
if err := d . resources . ValidateLicenseKeySecret ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "License Key Secret invalid" , err , d . apiObject ) )
}
2018-06-07 14:22:02 +00:00
// Is the deployment in a good state?
2018-06-12 09:09:42 +00:00
status , _ := d . GetStatus ( )
if status . Conditions . IsTrue ( api . ConditionTypeSecretsChanged ) {
2018-06-07 14:22:02 +00:00
log . Debug ( ) . Msg ( "Condition SecretsChanged is true. Revert secrets before we can continue" )
return nextInterval
}
2018-03-27 10:11:57 +00:00
2018-06-07 14:22:02 +00:00
// Ensure we have image info
if retrySoon , err := d . ensureImages ( d . apiObject ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Image detection failed" , err , d . apiObject ) )
} else if retrySoon {
nextInterval = minInspectionInterval
}
2018-03-20 12:10:52 +00:00
2018-06-07 14:22:02 +00:00
// Inspection of generated resources needed
2018-08-25 10:08:44 +00:00
if x , err := d . resources . InspectPods ( ctx ) ; err != nil {
2018-06-07 14:22:02 +00:00
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Pod inspection failed" , err , d . apiObject ) )
2018-08-25 10:08:44 +00:00
} else {
2018-08-30 14:57:08 +00:00
nextInterval = nextInterval . ReduceTo ( x )
2018-06-07 14:22:02 +00:00
}
2018-08-27 12:47:41 +00:00
if x , err := d . resources . InspectPVCs ( ctx ) ; err != nil {
2018-06-07 14:22:02 +00:00
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "PVC inspection failed" , err , d . apiObject ) )
2018-08-27 12:47:41 +00:00
} else {
2018-08-30 14:57:08 +00:00
nextInterval = nextInterval . ReduceTo ( x )
2018-06-07 14:22:02 +00:00
}
2018-03-15 15:33:28 +00:00
2018-06-07 14:22:02 +00:00
// Check members for resilience
if err := d . resilience . CheckMemberFailure ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Member failure detection failed" , err , d . apiObject ) )
}
2018-03-29 09:56:57 +00:00
2019-03-22 09:48:42 +00:00
// Immediate actions
if err := d . reconciler . CheckDeployment ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Reconciler immediate actions failed" , err , d . apiObject ) )
}
2018-06-07 14:22:02 +00:00
// Create scale/update plan
2018-09-07 11:18:39 +00:00
if err := d . reconciler . CreatePlan ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Plan creation failed" , err , d . apiObject ) )
}
// Execute current step of scale/update plan
retrySoon , err := d . reconciler . ExecutePlan ( ctx )
if err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Plan execution failed" , err , d . apiObject ) )
}
if retrySoon {
nextInterval = minInspectionInterval
2018-06-07 14:22:02 +00:00
}
2018-03-15 15:33:28 +00:00
2018-06-07 14:22:02 +00:00
// Ensure all resources are created
2018-08-31 14:08:21 +00:00
if err := d . resources . EnsureSecrets ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Secret creation failed" , err , d . apiObject ) )
}
if err := d . resources . EnsureServices ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Service creation failed" , err , d . apiObject ) )
}
2019-05-16 08:43:02 +00:00
if d . haveServiceMonitorCRD {
if err := d . resources . EnsureServiceMonitor ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Service monitor creation failed" , err , d . apiObject ) )
}
}
2018-08-31 14:08:21 +00:00
if err := d . resources . EnsurePVCs ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "PVC creation failed" , err , d . apiObject ) )
}
if err := d . resources . EnsurePods ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Pod creation failed" , err , d . apiObject ) )
2018-06-07 14:22:02 +00:00
}
2019-03-07 09:51:25 +00:00
if err := d . resources . EnsurePDBs ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "PDB creation failed" , err , d . apiObject ) )
}
2018-03-15 15:33:28 +00:00
2019-11-11 13:11:27 +00:00
if err := d . resources . EnsureAnnotations ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Annotation update failed" , err , d . apiObject ) )
}
2018-06-07 14:22:02 +00:00
// Create access packages
2018-08-31 14:08:21 +00:00
if err := d . createAccessPackages ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "AccessPackage creation failed" , err , d . apiObject ) )
2018-06-07 14:22:02 +00:00
}
2018-05-31 11:20:49 +00:00
2019-01-14 13:30:39 +00:00
// Ensure deployment bootstrap
if err := d . EnsureBootstrap ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Bootstrap failed" , err , d . apiObject ) )
}
2018-06-07 14:22:02 +00:00
// Inspect deployment for obsolete members
if err := d . resources . CleanupRemovedMembers ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Removed member cleanup failed" , err , d . apiObject ) )
}
2018-06-04 17:00:00 +00:00
2019-09-27 11:04:23 +00:00
if err := d . backup . CheckRestore ( ) ; err != nil {
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Restore operation failed" , err , d . apiObject ) )
}
2018-06-07 14:22:02 +00:00
// At the end of the inspect, we cleanup terminated pods.
2018-08-30 14:57:08 +00:00
if x , err := d . resources . CleanupTerminatedPods ( ) ; err != nil {
2018-06-07 14:22:02 +00:00
hasError = true
d . CreateEvent ( k8sutil . NewErrorEvent ( "Pod cleanup failed" , err , d . apiObject ) )
2018-08-30 14:57:08 +00:00
} else {
nextInterval = nextInterval . ReduceTo ( x )
2018-06-07 14:22:02 +00:00
}
2018-03-26 11:35:00 +00:00
}
2018-03-15 15:33:28 +00:00
// Update next interval (on errors)
if hasError {
if d . recentInspectionErrors == 0 {
nextInterval = minInspectionInterval
d . recentInspectionErrors ++
}
} else {
d . recentInspectionErrors = 0
}
2018-08-30 14:57:08 +00:00
return nextInterval . ReduceTo ( maxInspectionInterval )
2018-03-15 15:33:28 +00:00
}
2018-03-23 14:36:10 +00:00
// triggerInspection fires the deployment's inspectTrigger, which ensures
// that an inspection is run soon.
func ( d * Deployment ) triggerInspection ( ) {
d . inspectTrigger . Trigger ( )
}
2019-05-16 08:43:02 +00:00
// triggerCRDInspection fires the deployment's inspectCRDTrigger, which
// ensures that an inspection is run soon.
// NOTE(review): presumably this requests a re-check of CRD availability
// (cf. haveServiceMonitorCRD) rather than a regular deployment inspection;
// confirm against the consumer of inspectCRDTrigger.
func ( d * Deployment ) triggerCRDInspection ( ) {
d . inspectCRDTrigger . Trigger ( )
}