mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
[Feature] EnforcedResignLeadership action (#1439)
This commit is contained in:
parent
5693d82f40
commit
411efad90a
15 changed files with 368 additions and 58 deletions
|
@ -10,6 +10,7 @@
|
|||
- (Bugfix) Fix ArangoBackup Create Backoff & ArangoBackupPolicy propagation
|
||||
- (Maintenance) Add IndexMethod Documentation
|
||||
- (Bugfix) Fix VersionCheck args propagation
|
||||
- (Feature) EnforcedResignLeadership action
|
||||
|
||||
## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
|
||||
- (Maintenance) Bump golang.org/x/net to v0.13.0
|
||||
|
|
53
README.md
53
README.md
|
@ -58,32 +58,33 @@ covers individual newer features separately.
|
|||
#### Operator Features
|
||||
|
||||
<!-- START(featuresCommunityTable) -->
|
||||
| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
|
||||
|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:--------------------------------------------------------------------------|
|
||||
| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified |
|
||||
| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A |
|
||||
| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode |
|
||||
| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A |
|
||||
| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A |
|
||||
| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. |
|
||||
| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec |
|
||||
| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A |
|
||||
| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A |
|
||||
| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A |
|
||||
| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A |
|
||||
| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
|
||||
| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A |
|
||||
| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A |
|
||||
| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
|
||||
| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
|
||||
| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
|
||||
| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
|
||||
| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required |
|
||||
| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks |
|
||||
|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------|
|
||||
| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer |
|
||||
| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified |
|
||||
| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A |
|
||||
| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode |
|
||||
| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A |
|
||||
| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A |
|
||||
| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. |
|
||||
| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec |
|
||||
| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A |
|
||||
| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A |
|
||||
| Graceful Restart                                                                     | 1.2.5            | 1.0.7      | >= 3.8.0         | Community, Enterprise | Production   | True    | --deployment.feature.graceful-shutdown                | N/A                                                                                |
|
||||
| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A |
|
||||
| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
|
||||
| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A |
|
||||
| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A |
|
||||
| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A |
|
||||
| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A |
|
||||
| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled |
|
||||
| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A |
|
||||
| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required |
|
||||
| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A |
|
||||
|
||||
<!-- END(featuresCommunityTable) -->
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
| EncryptionKeyRefresh | no | 10m0s | no | Enterprise Only | Refresh the encryption keys on member |
|
||||
| EncryptionKeyRemove | no | 10m0s | no | Enterprise Only | Remove the encryption key to the pool |
|
||||
| EncryptionKeyStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of encryption propagation |
|
||||
| EnforceResignLeadership | no | 45m0s | yes | Community & Enterprise | Run the ResignLeadership job on DBServer and checks data compatibility after |
|
||||
| Idle | no | 10m0s | no | Community & Enterprise | Define idle operation in case if preconditions are not meet |
|
||||
| JWTAdd | no | 10m0s | no | Enterprise Only | Adds new JWT to the pool |
|
||||
| JWTClean | no | 10m0s | no | Enterprise Only | Remove JWT key from the pool |
|
||||
|
@ -122,6 +123,7 @@ spec:
|
|||
EncryptionKeyRefresh: 10m0s
|
||||
EncryptionKeyRemove: 10m0s
|
||||
EncryptionKeyStatusUpdate: 10m0s
|
||||
EnforceResignLeadership: 45m0s
|
||||
Idle: 10m0s
|
||||
JWTAdd: 10m0s
|
||||
JWTClean: 10m0s
|
||||
|
|
|
@ -29,6 +29,10 @@ actions:
|
|||
description: Run the ResignLeadership job on DBServer
|
||||
timeout: 30m
|
||||
optional: true
|
||||
EnforceResignLeadership:
|
||||
description: Run the ResignLeadership job on DBServer and checks data compatibility after
|
||||
timeout: 45m
|
||||
optional: true
|
||||
KillMemberPod:
|
||||
description: Execute Delete on Pod (put pod in Terminating state)
|
||||
scopes:
|
||||
|
|
|
@ -220,3 +220,10 @@ features:
|
|||
releases:
|
||||
- operatorVersion: 1.2.33
|
||||
state: Production
|
||||
- name: Enforced ResignLeadership
|
||||
enabled: true
|
||||
remarks: Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer
|
||||
flag: --deployment.feature.enforced-resign-leadership
|
||||
releases:
|
||||
- operatorVersion: 1.2.34
|
||||
state: Production
|
||||
|
|
|
@ -98,6 +98,9 @@ const (
|
|||
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
|
||||
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout
|
||||
|
||||
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
|
||||
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
|
||||
|
||||
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
|
||||
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
|
||||
|
||||
|
@ -353,6 +356,9 @@ const (
|
|||
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
|
||||
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"
|
||||
|
||||
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
|
||||
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
|
||||
|
||||
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
|
||||
ActionTypeIdle ActionType = "Idle"
|
||||
|
||||
|
@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
|
|||
return ActionEncryptionKeyRemoveDefaultTimeout
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return ActionEncryptionKeyStatusUpdateDefaultTimeout
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return ActionEnforceResignLeadershipDefaultTimeout
|
||||
case ActionTypeIdle:
|
||||
return ActionIdleDefaultTimeout
|
||||
case ActionTypeJWTAdd:
|
||||
|
@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
|
|||
return ActionPriorityNormal
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeIdle:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeJWTAdd:
|
||||
|
@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
|
|||
return false
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return false
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return true
|
||||
case ActionTypeIdle:
|
||||
return false
|
||||
case ActionTypeJWTAdd:
|
||||
|
|
|
@ -98,6 +98,9 @@ const (
|
|||
// ActionEncryptionKeyStatusUpdateDefaultTimeout define default timeout for action ActionEncryptionKeyStatusUpdate
|
||||
ActionEncryptionKeyStatusUpdateDefaultTimeout time.Duration = ActionsDefaultTimeout
|
||||
|
||||
// ActionEnforceResignLeadershipDefaultTimeout define default timeout for action ActionEnforceResignLeadership
|
||||
ActionEnforceResignLeadershipDefaultTimeout time.Duration = 2700 * time.Second // 45m0s
|
||||
|
||||
// ActionIdleDefaultTimeout define default timeout for action ActionIdle
|
||||
ActionIdleDefaultTimeout time.Duration = ActionsDefaultTimeout
|
||||
|
||||
|
@ -353,6 +356,9 @@ const (
|
|||
// ActionTypeEncryptionKeyStatusUpdate in scopes Normal. Update status of encryption propagation
|
||||
ActionTypeEncryptionKeyStatusUpdate ActionType = "EncryptionKeyStatusUpdate"
|
||||
|
||||
// ActionTypeEnforceResignLeadership in scopes Normal. Run the ResignLeadership job on DBServer and checks data compatibility after
|
||||
ActionTypeEnforceResignLeadership ActionType = "EnforceResignLeadership"
|
||||
|
||||
// ActionTypeIdle in scopes Normal. Define idle operation in case if preconditions are not meet
|
||||
ActionTypeIdle ActionType = "Idle"
|
||||
|
||||
|
@ -587,6 +593,8 @@ func (a ActionType) DefaultTimeout() time.Duration {
|
|||
return ActionEncryptionKeyRemoveDefaultTimeout
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return ActionEncryptionKeyStatusUpdateDefaultTimeout
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return ActionEnforceResignLeadershipDefaultTimeout
|
||||
case ActionTypeIdle:
|
||||
return ActionIdleDefaultTimeout
|
||||
case ActionTypeJWTAdd:
|
||||
|
@ -761,6 +769,8 @@ func (a ActionType) Priority() ActionPriority {
|
|||
return ActionPriorityNormal
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeIdle:
|
||||
return ActionPriorityNormal
|
||||
case ActionTypeJWTAdd:
|
||||
|
@ -947,6 +957,8 @@ func (a ActionType) Optional() bool {
|
|||
return false
|
||||
case ActionTypeEncryptionKeyStatusUpdate:
|
||||
return false
|
||||
case ActionTypeEnforceResignLeadership:
|
||||
return true
|
||||
case ActionTypeIdle:
|
||||
return false
|
||||
case ActionTypeJWTAdd:
|
||||
|
|
38
pkg/deployment/features/resign_leadership.go
Normal file
38
pkg/deployment/features/resign_leadership.go
Normal file
|
@ -0,0 +1,38 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package features
|
||||
|
||||
func init() {
|
||||
registerFeature(enforcedResignLeadership)
|
||||
}
|
||||
|
||||
var enforcedResignLeadership = &feature{
|
||||
name: "enforced-resign-leadership",
|
||||
description: "Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer",
|
||||
version: "3.7.0",
|
||||
enterpriseRequired: false,
|
||||
enabledByDefault: true,
|
||||
}
|
||||
|
||||
// EnforcedResignLeadership returns enforced ResignLeadership.
|
||||
func EnforcedResignLeadership() Feature {
|
||||
return enforcedResignLeadership
|
||||
}
|
|
@ -93,6 +93,9 @@ var (
|
|||
_ Action = &actionEncryptionKeyStatusUpdate{}
|
||||
_ actionFactory = newEncryptionKeyStatusUpdateAction
|
||||
|
||||
_ Action = &actionEnforceResignLeadership{}
|
||||
_ actionFactory = newEnforceResignLeadershipAction
|
||||
|
||||
_ Action = &actionIdle{}
|
||||
_ actionFactory = newIdleAction
|
||||
|
||||
|
@ -599,6 +602,20 @@ func init() {
|
|||
registerAction(action, function)
|
||||
}
|
||||
|
||||
// EnforceResignLeadership
|
||||
{
|
||||
// Get Action type
|
||||
action := api.ActionTypeEnforceResignLeadership
|
||||
|
||||
// Get Action defition
|
||||
function := newEnforceResignLeadershipAction
|
||||
|
||||
// Wrap action main function
|
||||
|
||||
// Register action
|
||||
registerAction(action, function)
|
||||
}
|
||||
|
||||
// Idle
|
||||
{
|
||||
// Get Action type
|
||||
|
|
|
@ -276,6 +276,16 @@ func Test_Actions(t *testing.T) {
|
|||
})
|
||||
})
|
||||
|
||||
t.Run("EnforceResignLeadership", func(t *testing.T) {
|
||||
ActionsExistence(t, api.ActionTypeEnforceResignLeadership)
|
||||
t.Run("Internal", func(t *testing.T) {
|
||||
require.False(t, api.ActionTypeEnforceResignLeadership.Internal())
|
||||
})
|
||||
t.Run("Optional", func(t *testing.T) {
|
||||
require.True(t, api.ActionTypeEnforceResignLeadership.Optional())
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("Idle", func(t *testing.T) {
|
||||
ActionsExistence(t, api.ActionTypeIdle)
|
||||
t.Run("Internal", func(t *testing.T) {
|
||||
|
|
169
pkg/deployment/reconcile/action_enforce_resign_leadership.go
Normal file
169
pkg/deployment/reconcile/action_enforce_resign_leadership.go
Normal file
|
@ -0,0 +1,169 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package reconcile
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/arangodb/go-driver"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/errors"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/globals"
|
||||
)
|
||||
|
||||
const (
|
||||
resignLeadershipJobID api.PlanLocalKey = "resignLeadershipJobID"
|
||||
)
|
||||
|
||||
// newEnforceResignLeadershipAction creates a new Action that implements the given
|
||||
// planned ResignLeadership action.
|
||||
func newEnforceResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
|
||||
a := &actionEnforceResignLeadership{}
|
||||
|
||||
a.actionImpl = newActionImplDefRef(action, actionCtx)
|
||||
|
||||
return a
|
||||
}
|
||||
|
||||
// actionEnforceResignLeadership implements an ResignLeadershipAction.
|
||||
type actionEnforceResignLeadership struct {
|
||||
actionImpl
|
||||
}
|
||||
|
||||
// Start performs the start of the ReasignLeadership process on DBServer.
|
||||
func (a *actionEnforceResignLeadership) Start(ctx context.Context) (bool, error) {
|
||||
group := a.action.Group
|
||||
|
||||
if a.actionCtx.GetSpec().Mode.Get() != api.DeploymentModeCluster {
|
||||
a.log.Debug("Resign only allowed in cluster mode")
|
||||
return true, nil
|
||||
}
|
||||
|
||||
switch group {
|
||||
case api.ServerGroupDBServers:
|
||||
if agencyState, agencyOK := a.actionCtx.GetAgencyCache(); !agencyOK {
|
||||
a.log.Warn("AgencyCache is not ready")
|
||||
return false, nil
|
||||
} else if agencyState.Supervision.Maintenance.Exists() {
|
||||
// We are done, action cannot be handled on maintenance mode
|
||||
a.log.Warn("Maintenance is enabled, skipping action")
|
||||
return true, nil
|
||||
}
|
||||
|
||||
return false, nil
|
||||
default:
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
|
||||
// CheckProgress checks if the Job is completed, if not then start it. Repeat in case of error or if still a leader
|
||||
func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool, bool, error) {
|
||||
group := a.action.Group
|
||||
m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
|
||||
if !ok {
|
||||
a.log.Error("No such member")
|
||||
return true, false, nil
|
||||
}
|
||||
|
||||
if group != api.ServerGroupDBServers {
|
||||
// Only DBServers can use ResignLeadership job
|
||||
return true, false, nil
|
||||
}
|
||||
|
||||
agencyState, agencyOK := a.actionCtx.GetAgencyCache()
|
||||
if !agencyOK {
|
||||
a.log.Error("Unable to get maintenance mode")
|
||||
return false, false, nil
|
||||
} else if agencyState.Supervision.Maintenance.Exists() {
|
||||
a.log.Warn("Maintenance is enabled, skipping action")
|
||||
// We are done, action cannot be handled on maintenance mode
|
||||
m.CleanoutJobID = ""
|
||||
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
return true, false, nil
|
||||
} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
|
||||
return true, false, nil
|
||||
}
|
||||
|
||||
// Lets start resign job if required
|
||||
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
|
||||
_, jobStatus := agencyState.Target.GetJob(state.JobID(m.CleanoutJobID))
|
||||
switch jobStatus {
|
||||
case state.JobPhaseFailed:
|
||||
m.CleanoutJobID = ""
|
||||
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
a.log.Error("Resign server job failed")
|
||||
return false, false, nil
|
||||
case state.JobPhaseFinished:
|
||||
m.CleanoutJobID = ""
|
||||
if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
default:
|
||||
return false, false, nil
|
||||
}
|
||||
|
||||
// Remove key
|
||||
a.actionCtx.Add(resignLeadershipJobID, "", true)
|
||||
|
||||
// Job is Finished, check if we are not a leader anymore
|
||||
if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
|
||||
// We are still a leader!
|
||||
a.log.Warn("DBServers is still a leader for shards")
|
||||
return false, false, nil
|
||||
}
|
||||
return true, false, nil
|
||||
}
|
||||
|
||||
// Job not in progress, start it
|
||||
client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
|
||||
if err != nil {
|
||||
a.log.Err(err).Error("Unable to get client")
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
|
||||
ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
|
||||
defer cancel()
|
||||
cluster, err := client.Cluster(ctxChild)
|
||||
if err != nil {
|
||||
a.log.Err(err).Error("Unable to get cluster client")
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
|
||||
var jobID string
|
||||
ctxChild, cancel = globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
|
||||
defer cancel()
|
||||
jobCtx := driver.WithJobIDResponse(ctxChild, &jobID)
|
||||
a.log.Debug("Temporary shutdown, resign leadership")
|
||||
if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
|
||||
a.log.Err(err).Debug("Failed to resign server")
|
||||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
|
||||
a.actionCtx.Add(resignLeadershipJobID, jobID, true)
|
||||
|
||||
return false, false, nil
|
||||
}
|
|
@ -22,7 +22,6 @@ package reconcile
|
|||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
|
||||
"github.com/arangodb/go-driver"
|
||||
|
||||
|
@ -132,7 +131,7 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
|
|||
return false, false, errors.WithStack(err)
|
||||
}
|
||||
return true, false, nil
|
||||
} else if a.isServerRebooted(agencyState, driver.ServerID(m.ID)) {
|
||||
} else if isServerRebooted(a.log, a.action, agencyState, driver.ServerID(m.ID)) {
|
||||
return true, false, nil
|
||||
}
|
||||
|
||||
|
@ -157,31 +156,3 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
|
|||
}
|
||||
return false, false, nil
|
||||
}
|
||||
|
||||
// isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
|
||||
func (a *actionResignLeadership) isServerRebooted(agencyState state.State, serverID driver.ServerID) bool {
|
||||
rebootID, ok := agencyState.GetRebootID(serverID)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
v, ok := a.action.Params[actionResignLeadershipRebootID.String()]
|
||||
if !ok {
|
||||
a.log.Warn("missing reboot ID in action's locals")
|
||||
return false
|
||||
}
|
||||
|
||||
r, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
a.log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
|
||||
return false
|
||||
}
|
||||
|
||||
if rebootID <= r {
|
||||
// Server has not been restarted.
|
||||
return false
|
||||
}
|
||||
|
||||
a.log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
|
||||
return true
|
||||
}
|
||||
|
|
66
pkg/deployment/reconcile/action_resign_leadership_utils.go
Normal file
66
pkg/deployment/reconcile/action_resign_leadership_utils.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package reconcile
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
|
||||
"github.com/arangodb/go-driver"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/agency/state"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
|
||||
"github.com/arangodb/kube-arangodb/pkg/logging"
|
||||
)
|
||||
|
||||
func getResignLeadershipActionType() api.ActionType {
|
||||
if features.EnforcedResignLeadership().Enabled() {
|
||||
return api.ActionTypeEnforceResignLeadership
|
||||
}
|
||||
return api.ActionTypeResignLeadership
|
||||
}
|
||||
|
||||
// isServerRebooted returns true when a given server ID was rebooted during resignation of leadership.
|
||||
func isServerRebooted(log logging.Logger, action api.Action, agencyState state.State, serverID driver.ServerID) bool {
|
||||
rebootID, ok := agencyState.GetRebootID(serverID)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
v, ok := action.Params[actionResignLeadershipRebootID.String()]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
r, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
|
||||
return false
|
||||
}
|
||||
|
||||
if rebootID <= r {
|
||||
// Server has not been restarted.
|
||||
return false
|
||||
}
|
||||
|
||||
log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
|
||||
return true
|
||||
}
|
|
@ -64,7 +64,7 @@ func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason
|
|||
return plan
|
||||
}
|
||||
|
||||
action := actions.NewAction(api.ActionTypeResignLeadership, group, member, reason)
|
||||
action := actions.NewAction(getResignLeadershipActionType(), group, member, reason)
|
||||
if rebootID != nil {
|
||||
action = actionResignLeadershipRebootID.Register(action, "%d", *rebootID)
|
||||
}
|
||||
|
|
|
@ -175,7 +175,7 @@ func (r *Reconciler) pvcResizePlan(group api.ServerGroup, member api.MemberStatu
|
|||
}
|
||||
case api.PVCResizeModeRotate:
|
||||
return withWaitForMember(api.Plan{
|
||||
actions.NewAction(api.ActionTypeResignLeadership, group, member),
|
||||
actions.NewAction(getResignLeadershipActionType(), group, member),
|
||||
actions.NewAction(api.ActionTypeKillMemberPod, group, member),
|
||||
actions.NewAction(api.ActionTypeRotateStartMember, group, member),
|
||||
actions.NewAction(api.ActionTypePVCResize, group, member),
|
||||
|
|
Loading…
Reference in a new issue