From db986e2b0f7292222a6a76e2e23f1a76d3f0bf39 Mon Sep 17 00:00:00 2001 From: Adam Janikowski <12255597+ajanikow@users.noreply.github.com> Date: Tue, 21 Jun 2022 16:08:52 +0200 Subject: [PATCH] [Feature] Change DBServer Cleanup Logic (#1025) --- CHANGELOG.md | 1 + pkg/deployment/agency/current_collections.go | 31 +++++++++++ pkg/deployment/agency/plan_collections.go | 2 +- .../reconcile/plan_builder_normal.go | 2 +- pkg/deployment/resilience/member_failure.go | 54 +++++++++---------- 5 files changed, 60 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fa4b3c08..0471a2c63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ - (Feature) Add `ArangoDeploymentReplication` CRD auto-installer - (Bugfix) Allow missing `token` key in License secret - (Feature) Unify agency access +- (Feature) Change DBServer Cleanup Logic ## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07) - (Bugfix) Fix arangosync members state inspection diff --git a/pkg/deployment/agency/current_collections.go b/pkg/deployment/agency/current_collections.go index 7b5af0205..354322acb 100644 --- a/pkg/deployment/agency/current_collections.go +++ b/pkg/deployment/agency/current_collections.go @@ -22,10 +22,41 @@ package agency type StateCurrentCollections map[string]StateCurrentDBCollections +func (a StateCurrentCollections) IsDBServerPresent(name Server) bool { + for _, v := range a { + if v.IsDBServerPresent(name) { + return true + } + } + + return false +} + type StateCurrentDBCollections map[string]StateCurrentDBCollection +func (a StateCurrentDBCollections) IsDBServerPresent(name Server) bool { + for _, v := range a { + if v.IsDBServerPresent(name) { + return true + } + } + + return false +} + type StateCurrentDBCollection map[string]StateCurrentDBShard +func (a StateCurrentDBCollection) IsDBServerPresent(name Server) bool { + + for _, v := range a { + if v.Servers.Contains(name) { + return true + } + } + + return false +} + type StateCurrentDBShard struct { Servers Servers `json:"servers,omitempty"` } diff --git a/pkg/deployment/agency/plan_collections.go b/pkg/deployment/agency/plan_collections.go index c136ea60b..a044e29c3 100644 --- a/pkg/deployment/agency/plan_collections.go +++ b/pkg/deployment/agency/plan_collections.go @@ -22,7 +22,7 @@ package agency type StatePlanCollections map[string]StatePlanDBCollections -func (a StatePlanCollections) IsDBServerInDatabases(name Server) bool { +func (a StatePlanCollections) IsDBServerPresent(name Server) bool { for _, collections := range a { if collections.IsDBServerInCollections(name) { return true diff --git a/pkg/deployment/reconcile/plan_builder_normal.go b/pkg/deployment/reconcile/plan_builder_normal.go index e85595db6..9b2d83bcc 100644 --- a/pkg/deployment/reconcile/plan_builder_normal.go +++ b/pkg/deployment/reconcile/plan_builder_normal.go @@ -121,7 +121,7 @@ func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObjec continue } - if agencyState.Plan.Collections.IsDBServerInDatabases(agency.Server(m.ID)) { + if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { // DBServer still exists in agency plan! Will not be removed, but needs to be recreated memberLog.Info("Recreating DBServer - it cannot be removed gracefully") plan = append(plan, diff --git a/pkg/deployment/resilience/member_failure.go b/pkg/deployment/resilience/member_failure.go index c42776370..80b610dfe 100644 --- a/pkg/deployment/resilience/member_failure.go +++ b/pkg/deployment/resilience/member_failure.go @@ -24,12 +24,10 @@ import ( "context" "time" - "github.com/arangodb/kube-arangodb/pkg/util/globals" - "github.com/arangodb/kube-arangodb/pkg/util/errors" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" - "github.com/arangodb/kube-arangodb/pkg/util/arangod" + "github.com/arangodb/kube-arangodb/pkg/deployment/agency" ) const ( @@ -74,10 +72,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error { if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) { // Member has terminated too often in recent history. - failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m) - if err != nil { - log.Err(err).Warn("Failed to check is member failure is acceptable") - } else if failureAcceptable { + failureAcceptable, reason := r.isMemberFailureAcceptable(group, m) + if failureAcceptable { log.Info("Member is not ready for long time, marking is failed") m.Phase = api.MemberPhaseFailed status.Members.Update(m, group) @@ -93,10 +89,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error { count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod)) if count >= recentTerminationThreshold { // Member has terminated too often in recent history. - failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m) - if err != nil { - log.Err(err).Warn("Failed to check is member failure is acceptable") - } else if failureAcceptable { + failureAcceptable, reason := r.isMemberFailureAcceptable(group, m) + if failureAcceptable { log.Info("Member has terminated too often in recent history, marking is failed") m.Phase = api.MemberPhaseFailed status.Members.Update(m, group) @@ -123,42 +117,46 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error { // isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member // to failed, which means that it will be replaced. -// Return: failureAcceptable, notAcceptableReason, error -func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.ServerGroup, m api.MemberStatus) (bool, string, error) { +// Return: failureAcceptable, notAcceptableReason +func (r *Resilience) isMemberFailureAcceptable(group api.ServerGroup, m api.MemberStatus) (bool, string) { switch group { case api.ServerGroupAgents: agencyHealth, ok := r.context.GetAgencyHealth() if !ok { - return false, "AgencyHealth is not present", nil + return false, "AgencyHealth is not present" } if err := agencyHealth.Healthy(); err != nil { - return false, err.Error(), nil + return false, err.Error() } - return true, "", nil + return true, "" case api.ServerGroupDBServers: - ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx) - defer cancel() - client, err := r.context.GetDatabaseClient(ctxChild) - if err != nil { - return false, "", errors.WithStack(err) + agencyState, ok := r.context.GetAgencyCache() + if !ok { + return false, "AgencyHealth is not present" } - if err := arangod.IsDBServerEmpty(ctx, m.ID, client); err != nil { - return false, err.Error(), nil + + if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) { + return false, "DBServer still in Plan" } - return true, "", nil + + if agencyState.Current.Collections.IsDBServerPresent(agency.Server(m.ID)) { + return false, "DBServer still in Current" + } + + return true, "" case api.ServerGroupCoordinators: // Coordinators can be replaced at will - return true, "", nil + return true, "" case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers: // Sync masters & workers can be replaced at will - return true, "", nil + return true, "" case api.ServerGroupSingle: - return false, "ServerGroupSingle can not marked as a failed", nil + return false, "ServerGroupSingle can not marked as a failed" default: // TODO - return false, "TODO", nil + return false, "TODO" } }