1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] EnforcedResignLeadership action (#1439)

This commit is contained in:
Adam Janikowski 2023-10-13 15:54:05 +02:00 committed by GitHub
parent 5693d82f40
commit 411efad90a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 368 additions and 58 deletions

View file

@ -10,6 +10,7 @@
- (Bugfix) Fix ArangoBackup Create Backoff & ArangoBackupPolicy propagation
- (Maintenance) Add IndexMethod Documentation
- (Bugfix) Fix VersionCheck args propagation
- (Feature) EnforcedResignLeadership action
## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
- (Maintenance) Bump golang.org/x/net to v0.13.0

View file

@ -58,32 +58,33 @@ covers individual newer features separately.
#### Operator Features
<!-- START(featuresCommunityTable) -->
| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:--------------------------------------------------------------------------|
| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified |
| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A |
| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode |
| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A |
| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A |
| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. |
| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec |
| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A |
| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A |
| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A |
| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A |
| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A |
| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A |
| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required |
| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------|
| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer |
| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified |
| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A |
| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode |
| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A |
| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A |
| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. |
| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec |
| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A |
| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A |
| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.graceful-shutdown | N/A |
| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A |
| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A |
| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A |
| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required |
| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
<!-- END(featuresCommunityTable) -->

View file

@ -29,6 +29,7 @@
| EncryptionKeyRefresh | no | 10m0s | no | Enterprise Only | Refresh the encryption keys on member |
| EncryptionKeyRemove | no | 10m0s | no | Enterprise Only | Remove the encryption key to the pool |
| EncryptionKeyStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of encryption propagation |
| EnforceResignLeadership | no | 45m0s | yes | Community & Enterprise | Run the ResignLeadership job on DBServer and checks data compatibility after |
| Idle | no | 10m0s | no | Community & Enterprise | Define idle operation in case if preconditions are not meet |
| JWTAdd | no | 10m0s | no | Enterprise Only | Adds new JWT to the pool |
| JWTClean | no | 10m0s | no | Enterprise Only | Remove JWT key from the pool |
@ -122,6 +123,7 @@ spec:
EncryptionKeyRefresh: 10m0s
EncryptionKeyRemove: 10m0s
EncryptionKeyStatusUpdate: 10m0s
EnforceResignLeadership: 45m0s
Idle: 10m0s
JWTAdd: 10m0s
JWTClean: 10m0s

View file

@ -29,6 +29,10 @@ actions:
description: Run the ResignLeadership job on DBServer
timeout: 30m
optional: true
EnforceResignLeadership:
description: Run the ResignLeadership job on DBServer and checks data compatibility after
timeout: 45m
optional: true
KillMemberPod:
description: Execute Delete on Pod (put pod in Terminating state)
scopes:

View file

@ -220,3 +220,10 @@ features:
releases:
- operatorVersion: 1.2.33
state: Production
- name: Enforced ResignLeadership
enabled: true
remarks: Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer
flag: --deployment.feature.enforced-resign-leadership
releases:
- operatorVersion: 1.2.34
state: Production

View file

@ -98,6 +98,9 @@ const (
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
@ -353,6 +356,9 @@ const (
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
ActionTypeIdle ActionType = "Idle"
@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
return ActionEncryptionKeyRemoveDefaultTimeout
case ActionTypeEncryptionKeyStatusUpdate:
return ActionEncryptionKeyStatusUpdateDefaultTimeout
case ActionTypeEnforceResignLeadership:
return ActionEnforceResignLeadershipDefaultTimeout
case ActionTypeIdle:
return ActionIdleDefaultTimeout
case ActionTypeJWTAdd:
@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
return ActionPriorityNormal
case ActionTypeEncryptionKeyStatusUpdate:
return ActionPriorityNormal
case ActionTypeEnforceResignLeadership:
return ActionPriorityNormal
case ActionTypeIdle:
return ActionPriorityNormal
case ActionTypeJWTAdd:
@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
return false
case ActionTypeEncryptionKeyStatusUpdate:
return false
case ActionTypeEnforceResignLeadership:
return true
case ActionTypeIdle:
return false
case ActionTypeJWTAdd:

View file

@ -98,6 +98,9 @@ const (
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
@ -353,6 +356,9 @@ const (
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
ActionTypeIdle ActionType = "Idle"
@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
return ActionEncryptionKeyRemoveDefaultTimeout
case ActionTypeEncryptionKeyStatusUpdate:
return ActionEncryptionKeyStatusUpdateDefaultTimeout
case ActionTypeEnforceResignLeadership:
return ActionEnforceResignLeadershipDefaultTimeout
case ActionTypeIdle:
return ActionIdleDefaultTimeout
case ActionTypeJWTAdd:
@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
return ActionPriorityNormal
case ActionTypeEncryptionKeyStatusUpdate:
return ActionPriorityNormal
case ActionTypeEnforceResignLeadership:
return ActionPriorityNormal
case ActionTypeIdle:
return ActionPriorityNormal
case ActionTypeJWTAdd:
@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
return false
case ActionTypeEncryptionKeyStatusUpdate:
return false
case ActionTypeEnforceResignLeadership:
return true
case ActionTypeIdle:
return false
case ActionTypeJWTAdd:

View file

@ -0,0 +1,38 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package features
// enforcedResignLeadership describes the "enforced-resign-leadership" feature.
// It does not require the Enterprise edition and is enabled by default.
var enforcedResignLeadership = &feature{
	name:               "enforced-resign-leadership",
	description:        "Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer",
	version:            "3.7.0",
	enterpriseRequired: false,
	enabledByDefault:   true,
}

// EnforcedResignLeadership returns the feature descriptor for the enforced
// ResignLeadership behaviour.
func EnforcedResignLeadership() Feature {
	return enforcedResignLeadership
}

// init registers the feature when the package is loaded.
func init() {
	registerFeature(enforcedResignLeadership)
}

View file

@ -93,6 +93,9 @@ var (
_ Action = &actionEncryptionKeyStatusUpdate{}
_ actionFactory = newEncryptionKeyStatusUpdateAction
_ Action = &actionEnforceResignLeadership{}
_ actionFactory = newEnforceResignLeadershipAction
_ Action = &actionIdle{}
_ actionFactory = newIdleAction
@ -599,6 +602,20 @@ func init() {
registerAction(action, function)
}
// EnforceResignLeadership
{
// Get Action type
action := api.ActionTypeEnforceResignLeadership
// Get Action defition
function := newEnforceResignLeadershipAction
// Wrap action main function
// Register action
registerAction(action, function)
}
// Idle
{
// Get Action type

View file

@ -276,6 +276,16 @@ func Test_Actions(t *testing.T) {
})
})
t.Run("EnforceResignLeadership", func(t *testing.T) {
ActionsExistence(t, api.ActionTypeEnforceResignLeadership)
t.Run("Internal", func(t *testing.T) {
require.False(t, api.ActionTypeEnforceResignLeadership.Internal())
})
t.Run("Optional", func(t *testing.T) {
require.True(t, api.ActionTypeEnforceResignLeadership.Optional())
})
})
t.Run("Idle", func(t *testing.T) {
ActionsExistence(t, api.ActionTypeIdle)
t.Run("Internal", func(t *testing.T) {

View file

@ -0,0 +1,169 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
"context"
"github.com/arangodb/go-driver"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
)
const (
// resignLeadershipJobID is the plan-local key under which the
// EnforceResignLeadership action stores the ID of the ResignLeadership job
// it has started. An empty value means that no job is being tracked.
resignLeadershipJobID api.PlanLocalKey = "resignLeadershipJobID"
)
// newEnforceResignLeadershipAction builds the Action that executes the given
// planned EnforceResignLeadership action within the supplied action context.
func newEnforceResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
	return &actionEnforceResignLeadership{
		actionImpl: newActionImplDefRef(action, actionCtx),
	}
}
// actionEnforceResignLeadership implements the EnforceResignLeadership action:
// it runs the ResignLeadership job for a DBServer and afterwards verifies that
// the server is no longer a leader.
type actionEnforceResignLeadership struct {
// actionImpl is the embedded base implementation; it is initialised by
// newActionImplDefRef in newEnforceResignLeadershipAction.
actionImpl
}
// Start decides whether the ResignLeadership process has to run at all.
//
// It returns true (nothing more to do) when the deployment is not in cluster
// mode, when the action targets a group other than DBServers, or when
// Supervision maintenance is enabled. In every other case, including a not yet
// available agency cache, it returns false so that CheckProgress takes over.
// The returned error is always nil.
func (a *actionEnforceResignLeadership) Start(ctx context.Context) (bool, error) {
	if a.actionCtx.GetSpec().Mode.Get() != api.DeploymentModeCluster {
		a.log.Debug("Resign only allowed in cluster mode")
		return true, nil
	}

	// Only DBServers are handled; every other group is finished right away.
	if a.action.Group != api.ServerGroupDBServers {
		return true, nil
	}

	agencyState, agencyOK := a.actionCtx.GetAgencyCache()
	if !agencyOK {
		a.log.Warn("AgencyCache is not ready")
		return false, nil
	}

	if agencyState.Supervision.Maintenance.Exists() {
		// We are done, action cannot be handled on maintenance mode
		a.log.Warn("Maintenance is enabled, skipping action")
		return true, nil
	}

	return false, nil
}
// CheckProgress drives the ResignLeadership job for the DBServer behind this action.
//
// On each call it either inspects the job whose ID is stored under
// resignLeadershipJobID or, when no job ID is stored, submits a new
// ResignServer request and stores the returned job ID.
//
// The first result is true once the action has nothing more to do: the member
// is unknown, the group is not DBServers, Supervision maintenance is enabled,
// the server was rebooted, or the job finished and the server is no longer
// listed among the plan leaders. A failed job, or a finished job that left the
// server as a leader, clears the stored job ID so that a later call submits a
// fresh job. The second result is always false. A non-nil error is returned
// when updating the member status or talking to the cluster fails.
func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool, bool, error) {
	group := a.action.Group
	m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
	if !ok {
		a.log.Error("No such member")
		return true, false, nil
	}

	if group != api.ServerGroupDBServers {
		// Only DBServers can use ResignLeadership job
		return true, false, nil
	}

	agencyState, agencyOK := a.actionCtx.GetAgencyCache()
	if !agencyOK {
		a.log.Error("Unable to get maintenance mode")
		return false, false, nil
	} else if agencyState.Supervision.Maintenance.Exists() {
		a.log.Warn("Maintenance is enabled, skipping action")
		// We are done, action cannot be handled on maintenance mode
		m.CleanoutJobID = ""
		if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
			return false, false, errors.WithStack(err)
		}
		return true, false, nil
	} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
		return true, false, nil
	}

	// Lets start resign job if required
	if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
		// The job is tracked by the ID stored in the action locals, so that is
		// the ID which has to be used for the status lookup.
		_, jobStatus := agencyState.Target.GetJob(state.JobID(j))
		switch jobStatus {
		case state.JobPhaseFailed:
			// Reset the job ID kept on the member status as well.
			m.CleanoutJobID = ""
			if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
				return false, false, errors.WithStack(err)
			}
			a.log.Error("Resign server job failed")
			// Remove key, so the next check starts a new job instead of
			// looking at the failed one again.
			a.actionCtx.Add(resignLeadershipJobID, "", true)
			return false, false, nil
		case state.JobPhaseFinished:
			// Reset the job ID kept on the member status as well.
			m.CleanoutJobID = ""
			if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
				return false, false, errors.WithStack(err)
			}
		default:
			// Job has not reached a final phase yet, check again later.
			return false, false, nil
		}

		// Remove key
		a.actionCtx.Add(resignLeadershipJobID, "", true)

		// Job is Finished, check if we are not a leader anymore
		if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
			// We are still a leader!
			a.log.Warn("DBServers is still a leader for shards")
			return false, false, nil
		}

		return true, false, nil
	}

	// Job not in progress, start it
	client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
	if err != nil {
		a.log.Err(err).Error("Unable to get client")
		return false, false, errors.WithStack(err)
	}

	ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
	defer cancel()
	cluster, err := client.Cluster(ctxChild)
	if err != nil {
		a.log.Err(err).Error("Unable to get cluster client")
		return false, false, errors.WithStack(err)
	}

	var jobID string
	ctxChild, cancel = globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
	defer cancel()
	// jobID is filled in by the driver with the ID of the created job.
	jobCtx := driver.WithJobIDResponse(ctxChild, &jobID)
	a.log.Debug("Temporary shutdown, resign leadership")
	if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
		a.log.Err(err).Debug("Failed to resign server")
		return false, false, errors.WithStack(err)
	}

	// Remember the job, the following checks wait for it to finish.
	a.actionCtx.Add(resignLeadershipJobID, jobID, true)

	return false, false, nil
}

View file

@ -22,7 +22,6 @@ package reconcile
import (
"context"
"strconv"
"github.com/arangodb/go-driver"
@ -132,7 +131,7 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
return false, false, errors.WithStack(err)
}
return true, false, nil
} else if a.isServerRebooted(agencyState, driver.ServerID(m.ID)) {
} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
return true, false, nil
}
@ -157,31 +156,3 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
}
return false, false, nil
}
// isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
func (a *actionResignLeadership) isServerRebooted(agencyState state.State, serverID driver.ServerID) bool {
rebootID, ok := agencyState.GetRebootID(serverID)
if !ok {
return false
}
v, ok := a.action.Params[actionResignLeadershipRebootID.String()]
if !ok {
a.log.Warn("missing reboot ID in action's locals")
return false
}
r, err := strconv.Atoi(v)
if err != nil {
a.log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
return false
}
if rebootID <= r {
// Server has not been restarted.
return false
}
a.log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
return true
}

View file

@ -0,0 +1,66 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
"strconv"
"github.com/arangodb/go-driver"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
"github.com/arangodb/kube-arangodb/pkg/logging"
)
// getResignLeadershipActionType selects the action type used to resign
// leadership: the enforced variant when the EnforcedResignLeadership feature
// is enabled, the regular ResignLeadership action otherwise.
func getResignLeadershipActionType() api.ActionType {
	actionType := api.ActionTypeResignLeadership
	if features.EnforcedResignLeadership().Enabled() {
		actionType = api.ActionTypeEnforceResignLeadership
	}

	return actionType
}
// isServerRebooted reports whether the given server was rebooted while its
// leadership was being resigned.
//
// The reboot ID currently known to the agency state is compared with the one
// recorded in the action parameters. The result is false when either value is
// missing, when the recorded value is not a number, or when the current reboot
// ID is not greater than the recorded one.
func isServerRebooted(log logging.Logger, action api.Action, agencyState state.State, serverID driver.ServerID) bool {
	current, found := agencyState.GetRebootID(serverID)
	if !found {
		return false
	}

	raw, found := action.Params[actionResignLeadershipRebootID.String()]
	if !found {
		return false
	}

	recorded, err := strconv.Atoi(raw)
	if err != nil {
		log.Err(err).Warn("reboot ID '%s' supposed to be a number", raw)
		return false
	}

	if current > recorded {
		log.Warn("resign leadership aborted because rebootID has changed from %d to %d", recorded, current)
		return true
	}

	// Server has not been restarted.
	return false
}

View file

@ -64,7 +64,7 @@ func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason
return plan
}
action := actions.NewAction(api.ActionTypeResignLeadership, group, member, reason)
action := actions.NewAction(getResignLeadershipActionType(), group, member, reason)
if rebootID != nil {
action = actionResignLeadershipRebootID.Register(action, "%d", *rebootID)
}

View file

@ -175,7 +175,7 @@ func (r *Reconciler) pvcResizePlan(group api.ServerGroup, member api.MemberStatu
}
case api.PVCResizeModeRotate:
return withWaitForMember(api.Plan{
actions.NewAction(api.ActionTypeResignLeadership, group, member),
actions.NewAction(getResignLeadershipActionType(), group, member),
actions.NewAction(api.ActionTypeKillMemberPod, group, member),
actions.NewAction(api.ActionTypeRotateStartMember, group, member),
actions.NewAction(api.ActionTypePVCResize, group, member),