1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] Move member recovery to high plan (#1026)

This commit is contained in:
Tomasz Mielech 2022-07-16 17:17:39 +02:00 committed by GitHub
parent 07d6e01545
commit e09d35e258
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 143 additions and 89 deletions

View file

@ -6,6 +6,7 @@
- (Feature) Move PVC resize action to high-priority plan
- (Feature) Remove forgotten ArangoDB jobs during restart
- (Feature) Add support for managed services
- (Feature) Recreate failed members in the high-priority plan
## [1.2.14](https://github.com/arangodb/kube-arangodb/tree/1.2.14) (2022-07-14)
- (Feature) Add ArangoSync TLS based rotation

View file

@ -57,6 +57,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
ApplyIfEmptyWithBackOff(LicenseCheck, 30*time.Second, r.updateClusterLicense).
ApplyIfEmpty(r.createTopologyMemberConditionPlan).
ApplyIfEmpty(r.createRebalancerCheckPlan).
ApplyIfEmpty(r.createMemberFailedRestoreHighPlan).
ApplyWithBackOff(BackOffCheck, time.Minute, r.emptyPlanBuilder)).
Apply(r.createBackupInProgressConditionPlan). // Discover backups always
Apply(r.createMaintenanceConditionPlan). // Discover maintenance always

View file

@ -0,0 +1,128 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
"context"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)
// createMemberFailedRestoreNormalPlan builds the failed-member recovery plan and keeps only
// the actions whose type is not RecreateMember.
func (r *Reconciler) createMemberFailedRestoreNormalPlan(ctx context.Context, apiObject k8sutil.APIObject,
	spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	fullPlan := r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context)

	// Everything except member recreation stays in the normal plan.
	return fullPlan.Filter(func(action api.Action) bool {
		return action.Type != api.ActionTypeRecreateMember
	})
}
// createMemberFailedRestoreHighPlan builds the failed-member recovery plan and keeps only
// the actions whose type is RecreateMember.
func (r *Reconciler) createMemberFailedRestoreHighPlan(ctx context.Context, apiObject k8sutil.APIObject,
	spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	// Member recreation is the only part of the recovery plan kept here.
	isRecreate := func(action api.Action) bool {
		return action.Type == api.ActionTypeRecreateMember
	}

	fullPlan := r.createMemberFailedRestoreInternal(ctx, apiObject, spec, status, context)

	return fullPlan.Filter(isRecreate)
}
// createMemberFailedRestoreInternal builds the unfiltered recovery plan for a member in the
// Failed phase. At most one failed member is handled per call: as soon as the plan holds an
// action, every remaining member (in this and in later groups) is left untouched.
//
// Agents and single servers are always recreated in place. Members of other groups are
// replaced (remove, add, wait until up) when the spec allows recreation for the group and
// recreated in place otherwise. DBServers of a cluster deployment are checked against the
// agency cache first.
//
// When nothing was planned and the agency cache is not available, the plan consists of a
// single Idle cluster action.
func (r *Reconciler) createMemberFailedRestoreInternal(_ context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec,
	status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
	var plan api.Plan

	// The agency cache is needed to decide how failed DBServers are treated.
	agencyState, agencyOK := context.GetAgencyCache()

	status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
		// Count the failed members of the group before deciding anything about a single one.
		failedCount := 0
		for i := range members {
			if members[i].Phase == api.MemberPhaseFailed {
				failedCount++
			}
		}

		for _, member := range members {
			if len(plan) > 0 {
				// A plan already exists, so no further member is inspected.
				break
			}
			if member.Phase != api.MemberPhaseFailed {
				continue
			}

			logger := r.log.Str("id", member.ID).Str("role", group.AsRole())

			if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
				// Pre-checks for cluster DBServers; each matching case moves on to the next member.
				switch {
				case !agencyOK:
					// Without agency state DBServers must not be touched.
					logger.Info("Agency state is not present")
					continue
				case spec.DBServers.GetCount() <= len(members)-failedCount:
					// Enough non-failed members exist to satisfy the requested count.
					continue
				case agencyState.Plan.Collections.IsDBServerPresent(agency.Server(member.ID)):
					// The DBServer is still present in the agency plan: recreate instead of removing it.
					logger.Info("Recreating DBServer - it cannot be removed gracefully")
					plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, member))
					continue
				}
				// None of the pre-checks matched; fall through to the generic handling below.
			}

			if group == api.ServerGroupAgents {
				// Agents keep their ID, PVC and service; only the member itself is recreated.
				logger.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
				plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, member))
				continue
			}

			if group == api.ServerGroupSingle {
				// The data of a single server is never removed.
				logger.Info("Restoring old member. Rotation for single servers is not safe")
				plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, member))
				continue
			}

			if !spec.GetAllowMemberRecreation(group) {
				logger.Info("Restoring old member. Recreation is disabled for group")
				plan = append(plan, actions.NewAction(api.ActionTypeRecreateMember, group, member))
				continue
			}

			// Replacement: remove the failed member, add a new one and wait until it is up.
			logger.Info("Creating member replacement plan because member has failed")
			plan = append(plan,
				actions.NewAction(api.ActionTypeRemoveMember, group, member),
				actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
				actions.NewAction(api.ActionTypeWaitForMemberUp, group, withPredefinedMember(api.MemberIDPreviousAction)),
			)
		}

		return nil
	})

	if !agencyOK && len(plan) == 0 {
		r.log.Warn("unable to build further plan without access to agency")
		plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle))
	}

	return plan
}

View file

@ -24,8 +24,6 @@ import (
"context"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)
@ -50,7 +48,7 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
// Check for scale up
ApplyIfEmpty(r.createScaleUPMemberPlan).
// Check for failed members
ApplyIfEmpty(r.createMemberFailedRestorePlan).
ApplyIfEmpty(r.createMemberFailedRestoreNormalPlan).
// Check for scale up/down
ApplyIfEmpty(r.createScaleMemberPlan).
// Update status
@ -86,90 +84,6 @@ func (r *Reconciler) createNormalPlan(ctx context.Context, apiObject k8sutil.API
return q.Plan(), q.BackOff(), true
}
// createMemberFailedRestorePlan builds a recovery plan for a member in the Failed phase.
// Once the plan holds an action, all remaining members are skipped, so at most one failed
// member is handled per call. Agents and single servers are recreated in place; members of
// other groups are removed and re-added when spec.GetAllowMemberRecreation(group) is true,
// and recreated in place otherwise. If nothing was planned and the agency cache is not
// available, the plan consists of a single Idle cluster action.
func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObject k8sutil.APIObject,
spec api.DeploymentSpec, status api.DeploymentStatus,
context PlanBuilderContext) api.Plan {
var plan api.Plan
// Fetch the agency state; agencyOK reports whether it is available.
agencyState, agencyOK := context.GetAgencyCache()
// Check for members in failed state.
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
// Count the failed members of this group; used below to compute the number of non-failed members.
failed := 0
for _, m := range members {
if m.Phase == api.MemberPhaseFailed {
failed++
}
}
for _, m := range members {
// Skip members that have not failed, and skip everything once a plan has been created.
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
continue
}
memberLog := r.log.Str("id", m.ID).Str("role", group.AsRole())
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
// Pre-check for DBServers: if the agency is down, DBServers should not be touched.
if !agencyOK {
memberLog.Info("Agency state is not present")
continue
}
if c := spec.DBServers.GetCount(); c <= len(members)-failed {
// There are at least as many non-failed members as the requested count, so this member is not recreated.
continue
}
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
// DBServer still exists in the agency plan. It will not be removed, but needs to be recreated.
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
continue
}
// All pre-checks passed; continue with the generic handling below.
}
switch group {
case api.ServerGroupAgents:
// For agents, only recreate the member: do not rotate the ID, do not remove the PVC or service.
memberLog.Info("Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
case api.ServerGroupSingle:
// Do not remove data for single servers.
memberLog.Info("Restoring old member. Rotation for single servers is not safe")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
default:
if spec.GetAllowMemberRecreation(group) {
// Replace the member: remove the failed one and add a new one.
memberLog.Info("Creating member replacement plan because member has failed")
plan = append(plan,
actions.NewAction(api.ActionTypeRemoveMember, group, m),
actions.NewAction(api.ActionTypeAddMember, group, withPredefinedMember("")),
)
} else {
memberLog.Info("Restoring old member. Recreation is disabled for group")
plan = append(plan,
actions.NewAction(api.ActionTypeRecreateMember, group, m))
}
}
}
return nil
})
// If nothing was planned and the agency state is unavailable, return an Idle action instead of an empty plan.
if len(plan) == 0 && !agencyOK {
r.log.Warn("unable to build further plan without access to agency")
plan = append(plan,
actions.NewClusterAction(api.ActionTypeIdle))
}
return plan
}
func (r *Reconciler) createRemoveCleanedDBServersPlan(ctx context.Context, apiObject k8sutil.APIObject,
spec api.DeploymentSpec, status api.DeploymentStatus,
context PlanBuilderContext) api.Plan {

View file

@ -1017,8 +1017,14 @@ func TestCreatePlan(t *testing.T) {
}
ad.Status.Members.Agents[0].Phase = api.MemberPhaseFailed
ad.Status.Members.Agents[0].ID = "id"
for i := range ad.Status.Members.Coordinators {
ad.Status.Members.Coordinators[i].Phase = api.MemberPhaseCreated
}
for i := range ad.Status.Members.DBServers {
ad.Status.Members.DBServers[i].Phase = api.MemberPhaseCreated
}
},
ExpectedPlan: []api.Action{
ExpectedHighPlan: []api.Action{
actions.NewAction(api.ActionTypeRecreateMember, api.ServerGroupAgents, withPredefinedMember("id")),
},
ExpectedLog: "Restoring old member. For agency members recreation of PVC is not supported - to prevent DataLoss",
@ -1038,6 +1044,8 @@ func TestCreatePlan(t *testing.T) {
ExpectedPlan: []api.Action{
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupCoordinators, withPredefinedMember("id")),
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupCoordinators, withPredefinedMember("")),
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupCoordinators,
withPredefinedMember(api.MemberIDPreviousAction)),
},
ExpectedLog: "Creating member replacement plan because member has failed",
},
@ -1056,6 +1064,8 @@ func TestCreatePlan(t *testing.T) {
ExpectedPlan: []api.Action{
actions.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, withPredefinedMember("id")),
actions.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, withPredefinedMember("")),
actions.NewAction(api.ActionTypeWaitForMemberUp, api.ServerGroupDBServers,
withPredefinedMember(api.MemberIDPreviousAction)),
},
ExpectedLog: "Creating member replacement plan because member has failed",
},

View file

@ -76,7 +76,7 @@ func (r *Reconciler) CheckDeployment(ctx context.Context) error {
}
if err := cache.Client().Kubernetes().CoreV1().Secrets(cache.Namespace()).Delete(ctx, m.PodName, meta.DeleteOptions{}); err != nil {
r.log.Err(err).Error("Failed to delete pod")
r.log.Err(err).Error("Failed to delete secret")
}
m.Phase = api.MemberPhaseNone