2020-01-16 12:36:28 +00:00
//
// DISCLAIMER
//
2022-01-10 11:35:49 +00:00
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
2020-01-16 12:36:28 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
2020-06-08 11:30:32 +00:00
"context"
2021-09-06 21:49:24 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
2021-07-15 12:07:33 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
2020-10-28 22:46:01 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
2022-02-16 00:36:45 +00:00
"github.com/arangodb/go-driver"
2020-01-16 12:36:28 +00:00
upgraderules "github.com/arangodb/go-upgrade-rules"
2020-04-08 10:32:24 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2020-01-16 12:36:28 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2021-03-10 13:30:47 +00:00
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
2020-01-16 12:36:28 +00:00
"github.com/rs/zerolog"
2020-02-21 11:59:19 +00:00
core "k8s.io/api/core/v1"
2020-01-16 12:36:28 +00:00
)
2020-07-30 13:28:30 +00:00
var (
// rotationByAnnotationOrder - Change order of execution - Coordinators and Agents should be executed before DBServer to save time
rotationByAnnotationOrder = [ ] api . ServerGroup {
api . ServerGroupAgents ,
2020-08-19 15:12:12 +00:00
api . ServerGroupSingle ,
2020-07-30 13:28:30 +00:00
api . ServerGroupCoordinators ,
api . ServerGroupDBServers ,
api . ServerGroupSyncMasters ,
api . ServerGroupSyncWorkers ,
}
)
2020-06-08 11:30:32 +00:00
2021-08-20 13:02:36 +00:00
// upgradeDecision is the result of an upgrade check.
// It is produced by podNeedsUpgrading, which compares the image a member runs
// with the image requested by the deployment spec.
type upgradeDecision struct {
// FromVersion is the ArangoDB version of the image the member currently uses.
FromVersion driver . Version
// FromLicense is the license (community/enterprise) of the image the member currently uses.
FromLicense upgraderules . License
// ToVersion is the ArangoDB version of the image requested by the spec.
ToVersion driver . Version
// ToLicense is the license (community/enterprise) of the image requested by the spec.
ToLicense upgraderules . License
UpgradeNeeded bool // If set, the image version has changed
UpgradeAllowed bool // If set, it is an allowed version change
AutoUpgradeNeeded bool // If set, the database must be started with `--database.auto-upgrade` once
// Hold is set when the image requested by the spec is not known yet, so no
// decision can be taken and the upgrade has to be postponed.
Hold bool
}
2020-07-30 13:28:30 +00:00
// createRotateOrUpgradePlan goes over all pods to check if an upgrade or rotate is needed.
2020-06-08 11:30:32 +00:00
func createRotateOrUpgradePlan ( ctx context . Context ,
log zerolog . Logger , apiObject k8sutil . APIObject ,
spec api . DeploymentSpec , status api . DeploymentStatus ,
2021-03-10 13:30:47 +00:00
cachedStatus inspectorInterface . Inspector , context PlanBuilderContext ) api . Plan {
2020-06-08 11:30:32 +00:00
var plan api . Plan
2021-08-26 07:59:16 +00:00
newPlan , idle := createRotateOrUpgradePlanInternal ( log , apiObject , spec , status , cachedStatus , context )
2020-06-08 11:30:32 +00:00
if idle {
plan = append ( plan ,
api . NewAction ( api . ActionTypeIdle , api . ServerGroupUnknown , "" ) )
} else {
plan = append ( plan , newPlan ... )
}
return plan
}
2022-01-12 12:44:33 +00:00
func createMarkToRemovePlan ( ctx context . Context ,
log zerolog . Logger , apiObject k8sutil . APIObject ,
spec api . DeploymentSpec , status api . DeploymentStatus ,
cachedStatus inspectorInterface . Inspector , context PlanBuilderContext ) api . Plan {
var plan api . Plan
2020-07-30 13:28:30 +00:00
status . Members . ForeachServerInGroups ( func ( group api . ServerGroup , members api . MemberStatusList ) error {
for _ , m := range members {
if m . Phase != api . MemberPhaseCreated || m . PodName == "" {
// Only rotate when phase is created
continue
}
pod , found := cachedStatus . Pod ( m . PodName )
if ! found {
continue
}
2020-03-11 07:57:03 +00:00
if pod . Annotations != nil {
2021-10-04 13:43:47 +00:00
if _ , ok := pod . Annotations [ deployment . ArangoDeploymentPodReplaceAnnotation ] ; ok && ( group == api . ServerGroupDBServers || group == api . ServerGroupAgents || group == api . ServerGroupCoordinators ) {
2021-09-20 15:49:29 +00:00
if ! m . Conditions . IsTrue ( api . ConditionTypeMarkedToRemove ) {
2022-01-12 12:44:33 +00:00
plan = append ( plan , api . NewAction ( api . ActionTypeMarkToRemoveMember , group , m . ID , "Replace flag present" ) )
2021-09-20 15:49:29 +00:00
continue
}
2020-10-29 12:52:13 +00:00
}
2020-03-11 07:57:03 +00:00
}
2020-01-16 12:36:28 +00:00
}
2020-07-30 13:28:30 +00:00
2020-01-16 12:36:28 +00:00
return nil
2020-07-30 13:28:30 +00:00
} , rotationByAnnotationOrder ... )
2020-01-16 12:36:28 +00:00
2022-01-12 12:44:33 +00:00
return plan
}
func createRotateOrUpgradePlanInternal ( log zerolog . Logger , apiObject k8sutil . APIObject , spec api . DeploymentSpec , status api . DeploymentStatus , cachedStatus inspectorInterface . Inspector , context PlanBuilderContext ) ( api . Plan , bool ) {
2022-01-14 09:58:49 +00:00
decision := createRotateOrUpgradeDecision ( log , spec , status , context )
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if decision . IsUpgrade ( ) {
2022-02-16 00:36:45 +00:00
for _ , m := range status . Members . AsList ( ) {
// Pre-check
d := decision [ m . Member . ID ]
if ! d . upgrade {
continue
}
// We have member to upgrade
if d . upgradeDecision . Hold {
// Holding upgrade
continue
}
if ! d . upgradeDecision . UpgradeAllowed {
context . CreateEvent ( k8sutil . NewUpgradeNotAllowedEvent ( apiObject , d . upgradeDecision . FromVersion , d . upgradeDecision . ToVersion , d . upgradeDecision . FromLicense , d . upgradeDecision . ToLicense ) )
return nil , false
}
}
2022-01-12 12:44:33 +00:00
// Upgrade phase
2022-01-14 09:58:49 +00:00
// During upgrade always get first member which needs to be upgraded
for _ , m := range status . Members . AsList ( ) {
d := decision [ m . Member . ID ]
if ! d . upgrade {
continue
}
2021-07-15 12:07:33 +00:00
2022-01-14 09:58:49 +00:00
// We have member to upgrade
if d . upgradeDecision . Hold {
// Holding upgrade
return nil , false
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if ! d . upgradeDecision . UpgradeNeeded {
// In upgrade scenario but upgrade is not needed
return nil , false
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if ! d . upgradeDecision . UpgradeAllowed {
context . CreateEvent ( k8sutil . NewUpgradeNotAllowedEvent ( apiObject , d . upgradeDecision . FromVersion , d . upgradeDecision . ToVersion , d . upgradeDecision . FromLicense , d . upgradeDecision . ToLicense ) )
return nil , false
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if d . updateAllowed {
// We are fine, group is alive so we can proceed
return createUpgradeMemberPlan ( log , m . Member , m . Group , "Version upgrade" , spec , status , ! d . upgradeDecision . AutoUpgradeNeeded ) , false
} else if d . unsafeUpdateAllowed {
log . Info ( ) . Str ( "member" , m . Member . ID ) . Msg ( "Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready, but unsafe upgrade is allowed" )
return createUpgradeMemberPlan ( log , m . Member , m . Group , "Version upgrade" , spec , status , ! d . upgradeDecision . AutoUpgradeNeeded ) , false
} else {
log . Info ( ) . Str ( "member" , m . Member . ID ) . Msg ( "Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready." )
return nil , true
}
2022-01-12 12:44:33 +00:00
}
2020-01-16 12:36:28 +00:00
2022-01-14 09:58:49 +00:00
log . Warn ( ) . Msg ( "Pod upgrade plan has been made, but it has been dropped due to missing flag" )
return nil , false
} else if decision . IsUpdate ( ) {
// Update phase
for _ , m := range status . Members . AsList ( ) {
d := decision [ m . Member . ID ]
if ! d . update {
continue
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if ! d . updateAllowed {
// Update is not allowed due to constraint
if ! d . unsafeUpdateAllowed {
log . Info ( ) . Str ( "member" , m . Member . ID ) . Msg ( "Pod needs restart but cluster is not ready. Either some shards are not in sync or some member is not ready." )
continue
}
log . Info ( ) . Str ( "member" , m . Member . ID ) . Msg ( "Pod needs restart but cluster is not ready. Either some shards are not in sync or some member is not ready, but unsafe upgrade is allowed" )
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if m . Member . Conditions . IsTrue ( api . ConditionTypeRestart ) {
2022-02-16 00:36:45 +00:00
return createRotateMemberPlan ( log , m . Member , m . Group , spec , "Restart flag present" ) , false
2022-01-14 09:58:49 +00:00
}
arangoMember , ok := cachedStatus . ArangoMember ( m . Member . ArangoMemberName ( apiObject . GetName ( ) , m . Group ) )
if ! ok {
continue
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
p , ok := cachedStatus . Pod ( m . Member . PodName )
if ! ok {
p = nil
}
2022-01-12 12:44:33 +00:00
2022-01-14 09:58:49 +00:00
if mode , p , reason , err := rotation . IsRotationRequired ( log , cachedStatus , spec , m . Member , m . Group , p , arangoMember . Spec . Template , arangoMember . Status . Template ) ; err != nil {
log . Err ( err ) . Str ( "member" , m . Member . ID ) . Msgf ( "Error while generating update plan" )
continue
} else if mode != rotation . InPlaceRotation {
return api . Plan { api . NewAction ( api . ActionTypeSetMemberCondition , m . Group , m . Member . ID , "Cleaning update" ) .
AddParam ( api . ConditionTypePendingUpdate . String ( ) , "" ) .
AddParam ( api . ConditionTypeUpdating . String ( ) , "T" ) } , false
} else {
p = p . After (
api . NewAction ( api . ActionTypeWaitForMemberUp , m . Group , m . Member . ID ) ,
api . NewAction ( api . ActionTypeWaitForMemberInSync , m . Group , m . Member . ID ) )
p = p . Wrap ( api . NewAction ( api . ActionTypeSetMemberCondition , m . Group , m . Member . ID , reason ) .
AddParam ( api . ConditionTypePendingUpdate . String ( ) , "" ) . AddParam ( api . ConditionTypeUpdating . String ( ) , "T" ) ,
api . NewAction ( api . ActionTypeSetMemberCondition , m . Group , m . Member . ID , reason ) .
AddParam ( api . ConditionTypeUpdating . String ( ) , "" ) )
return p , false
2022-01-12 12:44:33 +00:00
}
}
2022-01-14 09:58:49 +00:00
return nil , true
2022-01-12 12:44:33 +00:00
}
2022-01-14 09:58:49 +00:00
return nil , false
2022-01-12 12:44:33 +00:00
}
2020-01-16 12:36:28 +00:00
// podNeedsUpgrading decides if an upgrade of the pod is needed (to comply with
// the given spec) and if that is allowed.
2020-11-23 13:19:50 +00:00
func podNeedsUpgrading ( log zerolog . Logger , status api . MemberStatus , spec api . DeploymentSpec , images api . ImageInfoList ) upgradeDecision {
currentImage , found := currentImageInfo ( spec , images )
if ! found {
// Hold rotation tasks - we do not know image
return upgradeDecision { Hold : true }
}
memberImage , found := memberImageInfo ( spec , status , images )
if ! found {
// Member info not found
return upgradeDecision { UpgradeNeeded : false }
}
if currentImage . Image == memberImage . Image {
// No change
return upgradeDecision { UpgradeNeeded : false }
}
// Image changed, check if change is allowed
specVersion := currentImage . ArangoDBVersion
memberVersion := memberImage . ArangoDBVersion
asLicense := func ( info api . ImageInfo ) upgraderules . License {
if info . Enterprise {
return upgraderules . LicenseEnterprise
2020-01-16 12:36:28 +00:00
}
2020-11-23 13:19:50 +00:00
return upgraderules . LicenseCommunity
}
specLicense := asLicense ( currentImage )
memberLicense := asLicense ( memberImage )
if err := upgraderules . CheckUpgradeRulesWithLicense ( memberVersion , specVersion , memberLicense , specLicense ) ; err != nil {
// E.g. 3.x -> 4.x, we cannot allow automatically
return upgradeDecision {
FromVersion : memberVersion ,
FromLicense : memberLicense ,
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : false ,
2020-01-16 12:36:28 +00:00
}
2020-11-23 13:19:50 +00:00
}
if specVersion . Major ( ) != memberVersion . Major ( ) || specVersion . Minor ( ) != memberVersion . Minor ( ) {
// Is allowed, with `--database.auto-upgrade`
log . Info ( ) . Str ( "spec-version" , string ( specVersion ) ) . Str ( "pod-version" , string ( memberVersion ) ) .
Int ( "spec-version.major" , specVersion . Major ( ) ) . Int ( "spec-version.minor" , specVersion . Minor ( ) ) .
Int ( "pod-version.major" , memberVersion . Major ( ) ) . Int ( "pod-version.minor" , memberVersion . Minor ( ) ) .
Msg ( "Deciding to do a upgrade with --auto-upgrade" )
2020-01-16 12:36:28 +00:00
return upgradeDecision {
2020-11-23 13:19:50 +00:00
FromVersion : memberVersion ,
FromLicense : memberLicense ,
2020-01-16 12:36:28 +00:00
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : true ,
2020-11-23 13:19:50 +00:00
AutoUpgradeNeeded : true ,
2020-01-16 12:36:28 +00:00
}
}
2020-11-23 13:19:50 +00:00
// Patch version change, rotate only
return upgradeDecision {
FromVersion : memberVersion ,
FromLicense : memberLicense ,
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : true ,
2020-12-15 11:41:14 +00:00
AutoUpgradeNeeded : true ,
2020-11-23 13:19:50 +00:00
}
}
// currentImageInfo looks up the image info of the image requested by the spec. The image is
// searched by image name first and by image ID second. The boolean result reports whether an
// entry has been found.
func currentImageInfo(spec api.DeploymentSpec, images api.ImageInfoList) (api.ImageInfo, bool) {
	requested := spec.GetImage()

	info, found := images.GetByImage(requested)
	if !found {
		info, found = images.GetByImageID(requested)
	}

	if !found {
		return api.ImageInfo{}, false
	}
	return info, true
}
func memberImageInfo ( spec api . DeploymentSpec , status api . MemberStatus , images api . ImageInfoList ) ( api . ImageInfo , bool ) {
if status . Image != nil {
return * status . Image , true
}
if i , ok := images . GetByImage ( spec . GetImage ( ) ) ; ok {
return i , true
}
if i , ok := images . GetByImageID ( spec . GetImage ( ) ) ; ok {
return i , true
}
return api . ImageInfo { } , false
2020-01-16 12:36:28 +00:00
}
2021-08-26 07:59:16 +00:00
func getPodDetails ( ctx context . Context , log zerolog . Logger , apiObject k8sutil . APIObject , spec api . DeploymentSpec ,
2020-03-17 08:31:52 +00:00
group api . ServerGroup , status api . DeploymentStatus , m api . MemberStatus ,
2021-08-26 07:59:16 +00:00
cachedStatus inspectorInterface . Inspector , planCtx PlanBuilderContext ) ( string , * core . Pod , * api . ArangoMember , bool ) {
imageInfo , imageFound := planCtx . SelectImageForMember ( spec , status , m )
2020-03-17 08:31:52 +00:00
if ! imageFound {
// Image is not found, so rotation is not needed
2021-08-26 07:59:16 +00:00
return "" , nil , nil , false
2020-03-17 08:31:52 +00:00
}
2021-08-26 07:59:16 +00:00
member , ok := cachedStatus . ArangoMember ( m . ArangoMemberName ( apiObject . GetName ( ) , group ) )
if ! ok {
return "" , nil , nil , false
2020-09-17 13:05:28 +00:00
}
2020-10-28 22:46:01 +00:00
groupSpec := spec . GetServerGroupSpec ( group )
2021-04-26 08:30:06 +00:00
renderedPod , err := planCtx . RenderPodForMember ( ctx , cachedStatus , spec , status , m . ID , imageInfo )
2020-03-17 08:31:52 +00:00
if err != nil {
log . Err ( err ) . Msg ( "Error while rendering pod" )
2021-08-26 07:59:16 +00:00
return "" , nil , nil , false
2020-03-17 08:31:52 +00:00
}
2020-10-28 22:46:01 +00:00
checksum , err := resources . ChecksumArangoPod ( groupSpec , renderedPod )
2020-03-17 08:31:52 +00:00
if err != nil {
log . Err ( err ) . Msg ( "Error while getting pod checksum" )
2021-08-26 07:59:16 +00:00
return "" , nil , nil , false
}
return checksum , renderedPod , member , true
}
// arangoMemberPodTemplateNeedsUpdate reports whether the pod template stored in the ArangoMember
// differs from the pod rendered according to the given deployment spec. Both are compared by
// pod spec checksum.
// When true is returned, the reason is returned as well. When the pod details can not be
// obtained, no update is reported.
func arangoMemberPodTemplateNeedsUpdate(ctx context.Context, log zerolog.Logger, apiObject k8sutil.APIObject, spec api.DeploymentSpec,
	group api.ServerGroup, status api.DeploymentStatus, m api.MemberStatus,
	cachedStatus inspectorInterface.Inspector, planCtx PlanBuilderContext) (string, bool) {
	checksum, _, member, valid := getPodDetails(ctx, log, apiObject, spec, group, status, m, cachedStatus, planCtx)
	if !valid || member.Spec.Template.EqualPodSpecChecksum(checksum) {
		return "", false
	}

	return "Pod Spec changed", true
}
2020-01-16 12:36:28 +00:00
// clusterReadyForUpgrade returns true if the cluster is ready for the next update, that is:
// - all shards are in sync
// - all members are ready and fine
2022-01-14 09:58:49 +00:00
func groupReadyForRestart ( context PlanBuilderContext , status api . DeploymentStatus , member api . MemberStatus , group api . ServerGroup ) bool {
if group == api . ServerGroupSingle {
2022-01-12 12:44:33 +00:00
return true
}
if ! status . Conditions . IsTrue ( api . ConditionTypeBootstrapCompleted ) {
// Restart is allowed always when bootstrap is not yet completed
return true
}
2022-01-13 12:37:36 +00:00
// If current member did not become ready even once. Kill it
if ! member . Conditions . IsTrue ( api . ConditionTypeStarted ) {
return true
}
// If current core containers are dead kill it.
if ! member . Conditions . IsTrue ( api . ConditionTypeServing ) {
2022-01-12 12:44:33 +00:00
return true
}
switch group {
case api . ServerGroupDBServers :
// TODO: Improve shard placement discovery and keep WriteConcern
2022-01-13 12:37:36 +00:00
return context . GetShardSyncStatus ( ) && status . Members . MembersOfGroup ( group ) . AllMembersServing ( )
2022-01-12 12:44:33 +00:00
default :
// In case of agents we can kill only one agent at same time
2022-01-13 12:37:36 +00:00
return status . Members . MembersOfGroup ( group ) . AllMembersServing ( )
2022-01-12 12:44:33 +00:00
}
2020-01-16 12:36:28 +00:00
}
// createUpgradeMemberPlan creates a plan to upgrade (stop-recreateWithAutoUpgrade-stop-start) an existing
// member.
func createUpgradeMemberPlan ( log zerolog . Logger , member api . MemberStatus ,
2020-11-23 13:19:50 +00:00
group api . ServerGroup , reason string , spec api . DeploymentSpec , status api . DeploymentStatus , rotateStatefull bool ) api . Plan {
2020-01-16 12:36:28 +00:00
upgradeAction := api . ActionTypeUpgradeMember
if rotateStatefull || group . IsStateless ( ) {
upgradeAction = api . ActionTypeRotateMember
}
log . Debug ( ) .
Str ( "id" , member . ID ) .
Str ( "role" , group . AsRole ( ) ) .
Str ( "reason" , reason ) .
Str ( "action" , string ( upgradeAction ) ) .
Msg ( "Creating upgrade plan" )
2022-02-16 00:36:45 +00:00
plan := createRotateMemberPlanWithAction ( member , group , upgradeAction , spec , reason )
if member . Image == nil || member . Image . Image != spec . GetImage ( ) {
plan = plan . Before ( api . NewAction ( api . ActionTypeSetMemberCurrentImage , group , member . ID , reason ) . SetImage ( spec . GetImage ( ) ) )
2021-01-19 14:39:23 +00:00
}
2020-11-23 13:19:50 +00:00
if status . CurrentImage == nil || status . CurrentImage . Image != spec . GetImage ( ) {
2022-02-16 00:36:45 +00:00
plan = plan . Before ( api . NewAction ( api . ActionTypeSetCurrentImage , group , "" , reason ) . SetImage ( spec . GetImage ( ) ) )
2020-09-17 13:05:28 +00:00
}
2021-07-15 12:07:33 +00:00
return withSecureWrap ( member , group , spec , plan ... )
}
func withSecureWrap ( member api . MemberStatus ,
group api . ServerGroup , spec api . DeploymentSpec , plan ... api . Action ) api . Plan {
image := member . Image
if image == nil {
return plan
2020-01-16 12:36:28 +00:00
}
2021-07-15 12:07:33 +00:00
if skipResignLeadership ( spec . GetMode ( ) , image . ArangoDBVersion ) {
// In this case we skip resign leadership but we enable maintenance
return withMaintenanceStart ( plan ... )
} else {
return withResignLeadership ( group , member , "ResignLeadership" , plan ... )
}
}
func skipResignLeadership ( mode api . DeploymentMode , v driver . Version ) bool {
return mode == api . DeploymentModeCluster && features . Maintenance ( ) . Enabled ( ) && ( ( v . CompareTo ( "3.6.0" ) >= 0 && v . CompareTo ( "3.6.14" ) <= 0 ) ||
( v . CompareTo ( "3.7.0" ) >= 0 && v . CompareTo ( "3.7.12" ) <= 0 ) )
2020-01-16 12:36:28 +00:00
}