mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
[Feature] Move member recovery to high plan (#1026)
This commit is contained in:
parent
07d6e01545
commit
e09d35e258
6 changed files with 143 additions and 89 deletions
|
@ -6,6 +6,7 @@
|
|||
- (Feature) Move PVC resize action to high-priority plan
|
||||
- (Feature) Remove forgotten ArangoDB jobs during restart
|
||||
- (Feature) Add support for managed services
|
||||
- (Feature) Recreate member in the high plan
|
||||
|
||||
## [1.2.14](https://github.com/arangodb/kube-arangodb/tree/1.2.14) (2022-07-14)
|
||||
- (Feature) Add ArangoSync TLS based rotation
|
||||
|
|
|
@ -57,6 +57,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
|
|||
ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense).
|
||||
ApplyIfEmpty(r.createTopologyMemberConditionPlan).
|
||||
ApplyIfEmpty(r.createRebalancerCheckPlan).
|
||||
ApplyIfEmpty(r.createMemberFailedRestoreHighPlan).
|
||||
ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)).
|
||||
Apply(r.createBackupInProgressConditionPlan). // Discover backups always
|
||||
Apply(r.createMaintenanceConditionPlan). // Discover maintenance always
|
||||
|
|
128
pkg/deployment/reconcile/plan_builder_member_recovery.go
Normal file
128
pkg/deployment/reconcile/plan_builder_member_recovery.go
Normal file
|
@ -0,0 +1,128 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package reconcile
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
|
||||
)
|
||||
|
||||
// createMemberFailedRestoreNormalPlan returns only actions which are not recreate member.
|
||||
func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject,
|
||||
spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
|
||||
condition := func(a api.Action) bool {
|
||||
return a.Type != api.ActionTypeRecreateMember
|
||||
}
|
||||
|
||||
return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition)
|
||||
}
|
||||
|
||||
// createMemberFailedRestoreHighPlan returns only recreate member actions.
|
||||
func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject,
|
||||
spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
|
||||
condition := func(a api.Action) bool {
|
||||
return a.Type == api.ActionTypeRecreateMember
|
||||
}
|
||||
|
||||
return r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context).Filter(condition)
|
||||
}
|
||||
|
||||
func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec,
|
||||
status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
|
||||
var plan api.Plan
|
||||
|
||||
// Fetch agency plan.
|
||||
agencyState, agencyOK := context.GetAgencyCache()
|
||||
|
||||
// Check for members in failed state.
|
||||
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
|
||||
failed := 0
|
||||
for _, m := range members {
|
||||
if m.Phase == api.MemberPhaseFailed {
|
||||
failed++
|
||||
}
|
||||
}
|
||||
for _, m := range members {
|
||||
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())
|
||||
|
||||
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
|
||||
if !agencyOK {
|
||||
// If agency is down DBServers should not be touched.
|
||||
memberLog.Info("Agency state is not present")
|
||||
continue
|
||||
}
|
||||
|
||||
if c := spec.DBServers.GetCount(); c <= len(members)-failed {
|
||||
// There are more or equal alive members than current count. A member should not be recreated.
|
||||
continue
|
||||
}
|
||||
|
||||
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
|
||||
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated.
|
||||
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
|
||||
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
|
||||
continue
|
||||
}
|
||||
// From here on, DBServer can be recreated.
|
||||
}
|
||||
|
||||
switch group {
|
||||
case api.ServerGroupAgents:
|
||||
// For agents just recreate member do not rotate ID, do not remove PVC or service.
|
||||
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
|
||||
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
case api.ServerGroupSingle:
|
||||
// Do not remove data for single.
|
||||
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
|
||||
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
default:
|
||||
if spec.GetAllowMemberRecreation(group) {
|
||||
memberLog.Info("Creating member replacement plan because member has failed")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRemoveMember, group, m),
|
||||
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
|
||||
actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)),
|
||||
)
|
||||
} else {
|
||||
memberLog.Info("Restoring old member. Recreation is disabled for group")
|
||||
plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if len(plan) == 0 && !agencyOK {
|
||||
r.log.Warn("unable to build further plan without access to agency")
|
||||
plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle))
|
||||
}
|
||||
|
||||
return plan
|
||||
}
|
|
@ -24,8 +24,6 @@ import (
|
|||
"context"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
|
||||
)
|
||||
|
||||
|
@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
|
|||
// Check for scale up
|
||||
ApplyIfEmpty(r.createScaleUPMemberPlan).
|
||||
// Check for failed members
|
||||
ApplyIfEmpty(r.createMemberFailedRestorePlan).
|
||||
ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan).
|
||||
// Check for scale up/down
|
||||
ApplyIfEmpty(r.createScaleMemberPlan).
|
||||
// Update status
|
||||
|
@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
|
|||
return q.Plan(), q.BackOff(), true
|
||||
}
|
||||
|
||||
// createMemberFailedRestorePlan builds a plan for members in the Failed phase.
// Once the plan holds any action no further member is considered.
// Agents and single servers get an ActionTypeRecreateMember action; other groups get
// RemoveMember + AddMember when spec.GetAllowMemberRecreation(group) is true and
// ActionTypeRecreateMember otherwise. In cluster mode a failed DBServer is skipped when the
// agency cache is not available or when the non-failed members already cover
// spec.DBServers.GetCount(), and gets ActionTypeRecreateMember while the agency plan still
// references it. If nothing was planned and the agency cache is not available, a single
// cluster-wide ActionTypeIdle action is returned.
func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject,
|
||||
spec api.DeploymentSpec, status api.DeploymentStatus,
|
||||
context PlanBuilderContext) api.Plan {
|
||||
var plan api.Plan
|
||||
|
||||
// Fetch the cached agency state; agencyOK reports whether it could be obtained.
|
||||
agencyState, agencyOK := context.GetAgencyCache()
|
||||
|
||||
// Check for members in failed state.
|
||||
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
|
||||
failed := 0
|
||||
for _, m := range members {
|
||||
if m.Phase == api.MemberPhaseFailed {
|
||||
failed++
|
||||
}
|
||||
}
|
||||
for _, m := range members {
|
||||
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())
|
||||
|
||||
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
|
||||
// Do a pre-check for DBServers. If the agency is down, DBServers should not be touched.
|
||||
if !agencyOK {
|
||||
memberLog.Info("Agency state is not present")
|
||||
continue
|
||||
}
|
||||
|
||||
if c := spec.DBServers.GetCount(); c <= len(members)-failed {
|
||||
// We have more or equal alive members than current count, we should not recreate this member.
|
||||
continue
|
||||
}
|
||||
|
||||
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
|
||||
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated.
|
||||
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
continue
|
||||
}
|
||||
|
||||
// Everything is fine, proceed to the group-specific handling below.
|
||||
}
|
||||
|
||||
switch group {
|
||||
case api.ServerGroupAgents:
|
||||
// For agents just recreate member do not rotate ID, do not remove PVC or service.
|
||||
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
case api.ServerGroupSingle:
|
||||
// Do not remove data for singles.
|
||||
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
default:
|
||||
if spec.GetAllowMemberRecreation(group) {
|
||||
memberLog.Info("Creating member replacement plan because member has failed")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRemoveMember, group, m),
|
||||
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
|
||||
)
|
||||
} else {
|
||||
memberLog.Info("Restoring old member. Recreation is disabled for group")
|
||||
plan = append(plan,
|
||||
actions.NewAction(api.ActionTypeRecreateMember, group, m))
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Ensure that we were able to get agency info; otherwise fall back to an idle action.
|
||||
if len(plan) == 0 && !agencyOK {
|
||||
r.log.Warn("unable to build further plan without access to agency")
|
||||
plan = append(plan,
|
||||
actions.NewClusterAction(api.ActionTypeIdle))
|
||||
}
|
||||
|
||||
return plan
|
||||
}
|
||||
|
||||
func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject,
|
||||
spec api.DeploymentSpec, status api.DeploymentStatus,
|
||||
context PlanBuilderContext) api.Plan {
|
||||
|
|
|
@ -1017,8 +1017,14 @@ func TestCreatePlan(t *testing.T) {
|
|||
}
|
||||
ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed
|
||||
ad.Status.Members.Agents[0].ID = "id"
|
||||
for i := range ad.Status.Members.Coordinators {
|
||||
ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated
|
||||
}
|
||||
for i := range ad.Status.Members.DBServers {
|
||||
ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated
|
||||
}
|
||||
},
|
||||
ExpectedPlan: []api.Action{
|
||||
ExpectedHighPlan: []api.Action{
|
||||
actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")),
|
||||
},
|
||||
ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss",
|
||||
|
@ -1038,6 +1044,8 @@ func TestCreatePlan(t *testing.T) {
|
|||
ExpectedPlan: []api.Action{
|
||||
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")),
|
||||
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")),
|
||||
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators,
|
||||
withPredefinedMember(api.MemberIDPreviousAction)),
|
||||
},
|
||||
ExpectedLog: "Creating member replacement plan because member has failed",
|
||||
},
|
||||
|
@ -1056,6 +1064,8 @@ func TestCreatePlan(t *testing.T) {
|
|||
ExpectedPlan: []api.Action{
|
||||
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")),
|
||||
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")),
|
||||
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers,
|
||||
withPredefinedMember(api.MemberIDPreviousAction)),
|
||||
},
|
||||
ExpectedLog: "Creating member replacement plan because member has failed",
|
||||
},
|
||||
|
|
|
@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error {
|
|||
}
|
||||
|
||||
if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil {
|
||||
r.log.Err(err).Error("Failed to delete pod")
|
||||
r.log.Err(err).Error("Failed to delete secret")
|
||||
}
|
||||
m.Phase = api.MemberPhaseNone
|
||||
|
||||
|
|
Loading…
Reference in a new issue