2020-01-16 12:36:28 +00:00
//
// DISCLAIMER
//
2020-03-04 10:25:14 +00:00
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
2020-01-16 12:36:28 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Tomasz Mielech <tomasz@arangodb.com>
//
package reconcile
import (
2020-06-08 11:30:32 +00:00
"context"
2021-07-15 12:07:33 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
2021-05-18 12:26:32 +00:00
json "github.com/json-iterator/go"
"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
2020-10-28 22:46:01 +00:00
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
2020-01-16 12:36:28 +00:00
"github.com/arangodb/go-driver"
upgraderules "github.com/arangodb/go-upgrade-rules"
2020-04-08 10:32:24 +00:00
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
2020-01-16 12:36:28 +00:00
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2020-05-18 13:27:53 +00:00
"github.com/arangodb/kube-arangodb/pkg/util"
2020-01-16 12:36:28 +00:00
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
2021-03-10 13:30:47 +00:00
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
2020-01-16 12:36:28 +00:00
"github.com/rs/zerolog"
2020-02-21 11:59:19 +00:00
core "k8s.io/api/core/v1"
2020-01-16 12:36:28 +00:00
)
2020-07-30 13:28:30 +00:00
var (
// rotationByAnnotationOrder - Change order of execution - Coordinators and Agents should be executed before DBServer to save time
rotationByAnnotationOrder = [ ] api . ServerGroup {
api . ServerGroupAgents ,
2020-08-19 15:12:12 +00:00
api . ServerGroupSingle ,
2020-07-30 13:28:30 +00:00
api . ServerGroupCoordinators ,
api . ServerGroupDBServers ,
api . ServerGroupSyncMasters ,
api . ServerGroupSyncWorkers ,
}
)
2020-06-08 11:30:32 +00:00
2021-08-20 13:02:36 +00:00
// upgradeDecision describes the outcome of comparing the image a member is
// running with the image requested by the deployment spec.
type upgradeDecision struct {
	// FromVersion is the ArangoDB version of the member's current image.
	FromVersion driver.Version
	// FromLicense is the license type of the member's current image.
	FromLicense upgraderules.License
	// ToVersion is the ArangoDB version of the image requested by the spec.
	ToVersion driver.Version
	// ToLicense is the license type of the image requested by the spec.
	ToLicense upgraderules.License
	// UpgradeNeeded is set when the image version has changed.
	UpgradeNeeded bool
	// UpgradeAllowed is set when the version change is an allowed one.
	UpgradeAllowed bool
	// AutoUpgradeNeeded is set when the database must be started with
	// `--database.auto-upgrade` once.
	AutoUpgradeNeeded bool
	// Hold is set when no decision could be made because the image of the
	// spec is not known yet; callers are expected to postpone their work.
	Hold bool
}
2020-07-30 13:28:30 +00:00
// createRotateOrUpgradePlan goes over all pods to check if an upgrade or rotate is needed.
2020-06-08 11:30:32 +00:00
func createRotateOrUpgradePlan ( ctx context . Context ,
log zerolog . Logger , apiObject k8sutil . APIObject ,
spec api . DeploymentSpec , status api . DeploymentStatus ,
2021-03-10 13:30:47 +00:00
cachedStatus inspectorInterface . Inspector , context PlanBuilderContext ) api . Plan {
2020-06-08 11:30:32 +00:00
var plan api . Plan
2021-04-26 08:30:06 +00:00
newPlan , idle := createRotateOrUpgradePlanInternal ( ctx , log , apiObject , spec , status , cachedStatus , context )
2020-06-08 11:30:32 +00:00
if idle {
plan = append ( plan ,
api . NewAction ( api . ActionTypeIdle , api . ServerGroupUnknown , "" ) )
} else {
plan = append ( plan , newPlan ... )
}
return plan
}
2021-04-26 08:30:06 +00:00
func createRotateOrUpgradePlanInternal ( ctx context . Context , log zerolog . Logger , apiObject k8sutil . APIObject , spec api . DeploymentSpec ,
2021-03-10 13:30:47 +00:00
status api . DeploymentStatus , cachedStatus inspectorInterface . Inspector , context PlanBuilderContext ) ( api . Plan , bool ) {
2020-01-16 12:36:28 +00:00
var newPlan api . Plan
var upgradeNotAllowed bool
var fromVersion , toVersion driver . Version
var fromLicense , toLicense upgraderules . License
status . Members . ForeachServerGroup ( func ( group api . ServerGroup , members api . MemberStatusList ) error {
for _ , m := range members {
if m . Phase != api . MemberPhaseCreated || m . PodName == "" {
// Only rotate when phase is created
continue
}
2020-06-08 11:30:32 +00:00
pod , found := cachedStatus . Pod ( m . PodName )
2020-01-16 12:36:28 +00:00
if ! found {
continue
}
// Got pod, compare it with what it should be
2020-11-23 13:19:50 +00:00
decision := podNeedsUpgrading ( log , m , spec , status . Images )
if decision . Hold {
return nil
}
2020-01-16 12:36:28 +00:00
if decision . UpgradeNeeded && ! decision . UpgradeAllowed {
// Oops, upgrade is not allowed
upgradeNotAllowed = true
fromVersion = decision . FromVersion
fromLicense = decision . FromLicense
toVersion = decision . ToVersion
toLicense = decision . ToLicense
return nil
}
2020-01-20 07:39:51 +00:00
if ! newPlan . IsEmpty ( ) {
2020-01-16 12:36:28 +00:00
// Only rotate/upgrade 1 pod at a time
continue
}
if decision . UpgradeNeeded {
// Yes, upgrade is needed (and allowed)
2020-11-23 13:19:50 +00:00
newPlan = createUpgradeMemberPlan ( log , m , group , "Version upgrade" , spec , status ,
2020-01-16 12:36:28 +00:00
! decision . AutoUpgradeNeeded )
} else {
2020-04-08 10:32:24 +00:00
// Use new level of rotate logic
2021-05-18 12:26:32 +00:00
rotNeeded , reason := podNeedsRotation ( ctx , log , apiObject , pod , spec , group , status , m , cachedStatus , context )
2020-04-08 10:32:24 +00:00
if rotNeeded {
newPlan = createRotateMemberPlan ( log , m , group , reason )
2020-01-16 12:36:28 +00:00
}
}
2020-03-11 07:57:03 +00:00
if ! newPlan . IsEmpty ( ) {
// Only rotate/upgrade 1 pod at a time
continue
}
2020-07-30 13:28:30 +00:00
}
return nil
} )
status . Members . ForeachServerInGroups ( func ( group api . ServerGroup , members api . MemberStatusList ) error {
for _ , m := range members {
if m . Phase != api . MemberPhaseCreated || m . PodName == "" {
// Only rotate when phase is created
continue
}
if ! newPlan . IsEmpty ( ) {
// Only rotate/upgrade 1 pod at a time
continue
}
pod , found := cachedStatus . Pod ( m . PodName )
if ! found {
continue
}
2020-03-11 07:57:03 +00:00
if pod . Annotations != nil {
2020-10-29 12:52:13 +00:00
if _ , ok := pod . Annotations [ deployment . ArangoDeploymentPodReplaceAnnotation ] ; ok && group == api . ServerGroupDBServers {
newPlan = api . Plan { api . NewAction ( api . ActionTypeMarkToRemoveMember , group , m . ID , "Replace flag present" ) }
continue
}
2020-03-11 07:57:03 +00:00
if _ , ok := pod . Annotations [ deployment . ArangoDeploymentPodRotateAnnotation ] ; ok {
newPlan = createRotateMemberPlan ( log , m , group , "Rotation flag present" )
2020-10-29 12:52:13 +00:00
continue
2020-03-11 07:57:03 +00:00
}
}
2020-01-16 12:36:28 +00:00
}
2020-07-30 13:28:30 +00:00
2020-01-16 12:36:28 +00:00
return nil
2020-07-30 13:28:30 +00:00
} , rotationByAnnotationOrder ... )
2020-01-16 12:36:28 +00:00
if upgradeNotAllowed {
context . CreateEvent ( k8sutil . NewUpgradeNotAllowedEvent ( apiObject , fromVersion , toVersion , fromLicense , toLicense ) )
2020-01-20 07:39:51 +00:00
} else if ! newPlan . IsEmpty ( ) {
2020-01-16 12:36:28 +00:00
if clusterReadyForUpgrade ( context ) {
// Use the new plan
2020-04-16 05:57:48 +00:00
return newPlan , false
2020-01-16 12:36:28 +00:00
} else {
2020-05-18 13:27:53 +00:00
if util . BoolOrDefault ( spec . AllowUnsafeUpgrade , false ) {
log . Info ( ) . Msg ( "Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready, but unsafe upgrade is allowed" )
// Use the new plan
return newPlan , false
} else {
log . Info ( ) . Msg ( "Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready." )
return nil , true
}
2020-01-16 12:36:28 +00:00
}
}
2021-07-15 12:07:33 +00:00
2020-04-16 05:57:48 +00:00
return nil , false
2020-01-16 12:36:28 +00:00
}
// podNeedsUpgrading decides if an upgrade of the pod is needed (to comply with
// the given spec) and if that is allowed.
2020-11-23 13:19:50 +00:00
func podNeedsUpgrading ( log zerolog . Logger , status api . MemberStatus , spec api . DeploymentSpec , images api . ImageInfoList ) upgradeDecision {
currentImage , found := currentImageInfo ( spec , images )
if ! found {
// Hold rotation tasks - we do not know image
return upgradeDecision { Hold : true }
}
memberImage , found := memberImageInfo ( spec , status , images )
if ! found {
// Member info not found
return upgradeDecision { UpgradeNeeded : false }
}
if currentImage . Image == memberImage . Image {
// No change
return upgradeDecision { UpgradeNeeded : false }
}
// Image changed, check if change is allowed
specVersion := currentImage . ArangoDBVersion
memberVersion := memberImage . ArangoDBVersion
asLicense := func ( info api . ImageInfo ) upgraderules . License {
if info . Enterprise {
return upgraderules . LicenseEnterprise
2020-01-16 12:36:28 +00:00
}
2020-11-23 13:19:50 +00:00
return upgraderules . LicenseCommunity
}
specLicense := asLicense ( currentImage )
memberLicense := asLicense ( memberImage )
if err := upgraderules . CheckUpgradeRulesWithLicense ( memberVersion , specVersion , memberLicense , specLicense ) ; err != nil {
// E.g. 3.x -> 4.x, we cannot allow automatically
return upgradeDecision {
FromVersion : memberVersion ,
FromLicense : memberLicense ,
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : false ,
2020-01-16 12:36:28 +00:00
}
2020-11-23 13:19:50 +00:00
}
if specVersion . Major ( ) != memberVersion . Major ( ) || specVersion . Minor ( ) != memberVersion . Minor ( ) {
// Is allowed, with `--database.auto-upgrade`
log . Info ( ) . Str ( "spec-version" , string ( specVersion ) ) . Str ( "pod-version" , string ( memberVersion ) ) .
Int ( "spec-version.major" , specVersion . Major ( ) ) . Int ( "spec-version.minor" , specVersion . Minor ( ) ) .
Int ( "pod-version.major" , memberVersion . Major ( ) ) . Int ( "pod-version.minor" , memberVersion . Minor ( ) ) .
Msg ( "Deciding to do a upgrade with --auto-upgrade" )
2020-01-16 12:36:28 +00:00
return upgradeDecision {
2020-11-23 13:19:50 +00:00
FromVersion : memberVersion ,
FromLicense : memberLicense ,
2020-01-16 12:36:28 +00:00
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : true ,
2020-11-23 13:19:50 +00:00
AutoUpgradeNeeded : true ,
2020-01-16 12:36:28 +00:00
}
}
2020-11-23 13:19:50 +00:00
// Patch version change, rotate only
return upgradeDecision {
FromVersion : memberVersion ,
FromLicense : memberLicense ,
ToVersion : specVersion ,
ToLicense : specLicense ,
UpgradeNeeded : true ,
UpgradeAllowed : true ,
2020-12-15 11:41:14 +00:00
AutoUpgradeNeeded : true ,
2020-11-23 13:19:50 +00:00
}
}
// currentImageInfo looks up the image info for the image requested by the
// spec, matching first by image name and then by image ID. The boolean is
// false when neither lookup finds an entry.
func currentImageInfo(spec api.DeploymentSpec, images api.ImageInfoList) (api.ImageInfo, bool) {
	image := spec.GetImage()

	info, ok := images.GetByImage(image)
	if !ok {
		info, ok = images.GetByImageID(image)
	}
	if !ok {
		return api.ImageInfo{}, false
	}

	return info, true
}
func memberImageInfo ( spec api . DeploymentSpec , status api . MemberStatus , images api . ImageInfoList ) ( api . ImageInfo , bool ) {
if status . Image != nil {
return * status . Image , true
}
if i , ok := images . GetByImage ( spec . GetImage ( ) ) ; ok {
return i , true
}
if i , ok := images . GetByImageID ( spec . GetImage ( ) ) ; ok {
return i , true
}
return api . ImageInfo { } , false
2020-01-16 12:36:28 +00:00
}
2020-04-08 10:32:24 +00:00
// podNeedsRotation returns true when the specification of the
2020-03-17 08:31:52 +00:00
// given pod differs from what it should be according to the
// given deployment spec.
// When true is returned, a reason for the rotation is already returned.
2021-05-18 12:26:32 +00:00
func podNeedsRotation ( ctx context . Context , log zerolog . Logger , apiObject k8sutil . APIObject , p * core . Pod , spec api . DeploymentSpec ,
2020-03-17 08:31:52 +00:00
group api . ServerGroup , status api . DeploymentStatus , m api . MemberStatus ,
2021-04-26 08:30:06 +00:00
cachedStatus inspectorInterface . Inspector , planCtx PlanBuilderContext ) ( bool , string ) {
2021-05-18 12:26:32 +00:00
2020-03-17 08:31:52 +00:00
if m . PodUID != p . UID {
return true , "Pod UID does not match, this pod is not managed by Operator. Recreating"
}
if m . PodSpecVersion == "" {
return true , "Pod Spec Version is nil - recreating pod"
}
2021-04-26 08:30:06 +00:00
imageInfo , imageFound := planCtx . SelectImage ( spec , status )
2020-03-17 08:31:52 +00:00
if ! imageFound {
// Image is not found, so rotation is not needed
return false , ""
}
2020-09-17 13:05:28 +00:00
if m . Image != nil {
imageInfo = * m . Image
}
2020-10-28 22:46:01 +00:00
groupSpec := spec . GetServerGroupSpec ( group )
2021-04-26 08:30:06 +00:00
renderedPod , err := planCtx . RenderPodForMember ( ctx , cachedStatus , spec , status , m . ID , imageInfo )
2020-03-17 08:31:52 +00:00
if err != nil {
log . Err ( err ) . Msg ( "Error while rendering pod" )
return false , ""
}
2020-10-28 22:46:01 +00:00
checksum , err := resources . ChecksumArangoPod ( groupSpec , renderedPod )
2020-03-17 08:31:52 +00:00
if err != nil {
log . Err ( err ) . Msg ( "Error while getting pod checksum" )
return false , ""
}
if m . PodSpecVersion != checksum {
2021-05-18 12:26:32 +00:00
if _ , err := json . Marshal ( renderedPod ) ; err == nil {
log . Info ( ) . Str ( "id" , m . ID ) . Str ( "Before" , m . PodSpecVersion ) . Str ( "After" , checksum ) . Msgf ( "XXXXXXXXXXX Pod needs rotation - checksum does not match" )
}
2020-03-17 08:31:52 +00:00
return true , "Pod needs rotation - checksum does not match"
}
2021-05-18 12:26:32 +00:00
endpoint , err := pod . GenerateMemberEndpoint ( cachedStatus , apiObject , spec , group , m )
if err != nil {
log . Err ( err ) . Msg ( "Error while getting pod endpoint" )
return false , ""
}
if e := m . Endpoint ; e == nil {
if spec . CommunicationMethod == nil {
// TODO: Remove in 1.2.0 release to allow rotation
return false , "Pod endpoint is not set and CommunicationMethod is not set, do not recreate"
}
return true , "Communication method has been set - ensure endpoint"
} else {
if * e != endpoint {
return true , "Pod endpoint changed"
}
}
2020-03-17 08:31:52 +00:00
return false , ""
}
2020-01-16 12:36:28 +00:00
// clusterReadyForUpgrade reports whether the cluster can take the next update:
//   - the shard sync status reported by the context is true, and
//   - the Ready condition of the deployment status is true.
func clusterReadyForUpgrade(context PlanBuilderContext) bool {
	// The second value returned by GetStatus is not needed here.
	status, _ := context.GetStatus()

	if !context.GetShardSyncStatus() {
		return false
	}

	return status.Conditions.IsTrue(api.ConditionTypeReady)
}
// createUpgradeMemberPlan creates a plan to upgrade (stop-recreateWithAutoUpgrade-stop-start) an existing
// member.
func createUpgradeMemberPlan ( log zerolog . Logger , member api . MemberStatus ,
2020-11-23 13:19:50 +00:00
group api . ServerGroup , reason string , spec api . DeploymentSpec , status api . DeploymentStatus , rotateStatefull bool ) api . Plan {
2020-01-16 12:36:28 +00:00
upgradeAction := api . ActionTypeUpgradeMember
if rotateStatefull || group . IsStateless ( ) {
upgradeAction = api . ActionTypeRotateMember
}
log . Debug ( ) .
Str ( "id" , member . ID ) .
Str ( "role" , group . AsRole ( ) ) .
Str ( "reason" , reason ) .
Str ( "action" , string ( upgradeAction ) ) .
Msg ( "Creating upgrade plan" )
2021-01-19 14:39:23 +00:00
var plan = api . Plan {
api . NewAction ( api . ActionTypeCleanTLSKeyfileCertificate , group , member . ID , "Remove server keyfile and enforce renewal/recreation" ) ,
}
2020-11-23 13:19:50 +00:00
if status . CurrentImage == nil || status . CurrentImage . Image != spec . GetImage ( ) {
2021-07-15 12:07:33 +00:00
plan = plan . After ( api . NewAction ( api . ActionTypeSetCurrentImage , group , "" , reason ) . SetImage ( spec . GetImage ( ) ) )
2020-09-17 13:05:28 +00:00
}
2020-11-23 13:19:50 +00:00
if member . Image == nil || member . Image . Image != spec . GetImage ( ) {
2021-07-15 12:07:33 +00:00
plan = plan . After ( api . NewAction ( api . ActionTypeSetMemberCurrentImage , group , member . ID , reason ) . SetImage ( spec . GetImage ( ) ) )
}
plan = plan . After ( api . NewAction ( upgradeAction , group , member . ID , reason ) ,
api . NewAction ( api . ActionTypeWaitForMemberUp , group , member . ID ) )
return withSecureWrap ( member , group , spec , plan ... )
}
func withSecureWrap ( member api . MemberStatus ,
group api . ServerGroup , spec api . DeploymentSpec , plan ... api . Action ) api . Plan {
image := member . Image
if image == nil {
return plan
2020-01-16 12:36:28 +00:00
}
2021-07-15 12:07:33 +00:00
if skipResignLeadership ( spec . GetMode ( ) , image . ArangoDBVersion ) {
// In this case we skip resign leadership but we enable maintenance
return withMaintenanceStart ( plan ... )
} else {
return withResignLeadership ( group , member , "ResignLeadership" , plan ... )
}
}
func skipResignLeadership ( mode api . DeploymentMode , v driver . Version ) bool {
return mode == api . DeploymentModeCluster && features . Maintenance ( ) . Enabled ( ) && ( ( v . CompareTo ( "3.6.0" ) >= 0 && v . CompareTo ( "3.6.14" ) <= 0 ) ||
( v . CompareTo ( "3.7.0" ) >= 0 && v . CompareTo ( "3.7.12" ) <= 0 ) )
2020-01-16 12:36:28 +00:00
}