1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] Change DBServer Cleanup Logic (#1025)

This commit is contained in:
Adam Janikowski 2022-06-21 16:08:52 +02:00 committed by GitHub
parent a9d7849169
commit db986e2b0f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 60 additions and 30 deletions

View file

@ -21,6 +21,7 @@
- (Feature) Add `ArangoDeploymentReplication` CRD auto-installer
- (Bugfix) Allow missing `token` key in License secret
- (Feature) Unify agency access
- (Feature) Change DBServer Cleanup Logic
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
- (Bugfix) Fix arangosync members state inspection

View file

@ -22,10 +22,41 @@ package agency
// StateCurrentCollections maps a string key (presumably the database name -
// confirm against the agency schema) to the StateCurrentDBCollections stored
// under that key.
type StateCurrentCollections map[string]StateCurrentDBCollections

// IsDBServerPresent reports whether at least one entry of the map reports the
// given server as present. A nil or empty map yields false.
func (a StateCurrentCollections) IsDBServerPresent(name Server) bool {
	present := false
	for _, dbCollections := range a {
		// Stop at the first entry that knows the server; the rest cannot change the answer.
		if present = dbCollections.IsDBServerPresent(name); present {
			break
		}
	}
	return present
}
// StateCurrentDBCollections maps a string key (presumably the collection
// identifier - confirm against the agency schema) to a StateCurrentDBCollection.
type StateCurrentDBCollections map[string]StateCurrentDBCollection

// IsDBServerPresent reports whether any StateCurrentDBCollection in the map
// reports the given server as present. A nil or empty map yields false.
func (a StateCurrentDBCollections) IsDBServerPresent(name Server) bool {
	for key := range a {
		if !a[key].IsDBServerPresent(name) {
			continue
		}
		return true
	}
	return false
}
// StateCurrentDBCollection maps a string key (presumably the shard name -
// confirm against the agency schema) to the StateCurrentDBShard stored under it.
type StateCurrentDBCollection map[string]StateCurrentDBShard

// IsDBServerPresent reports whether the Servers list of any shard in the map
// contains the given server. A nil or empty map yields false.
func (a StateCurrentDBCollection) IsDBServerPresent(name Server) bool {
	for _, shard := range a {
		servers := shard.Servers
		if servers.Contains(name) {
			return true
		}
	}

	// No shard lists the server.
	return false
}
// StateCurrentDBShard is the per-shard value stored in a StateCurrentDBCollection.
type StateCurrentDBShard struct {
// Servers is the list of servers recorded for this shard. It is (de)serialized
// under the JSON key "servers" and omitted from the output when empty.
Servers Servers `json:"servers,omitempty"`
}

View file

@ -22,7 +22,7 @@ package agency
type StatePlanCollections map[string]StatePlanDBCollections
func (a StatePlanCollections) IsDBServerInDatabases(name Server) bool {
func (a StatePlanCollections) IsDBServerPresent(name Server) bool {
for _, collections := range a {
if collections.IsDBServerInCollections(name) {
return true

View file

@ -121,7 +121,7 @@ func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObjec
continue
}
if agencyState.Plan.Collections.IsDBServerInDatabases(agency.Server(m.ID)) {
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
plan = append(plan,

View file

@ -24,12 +24,10 @@ import (
"context"
"time"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/util/arangod"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
)
const (
@ -74,10 +72,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
// Member has terminated too often in recent history.
failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m)
if err != nil {
log.Err(err).Warn("Failed to check is member failure is acceptable")
} else if failureAcceptable {
failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
if failureAcceptable {
log.Info("Member is not ready for long time, marking is failed")
m.Phase = api.MemberPhaseFailed
status.Members.Update(m, group)
@ -93,10 +89,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
if count >= recentTerminationThreshold {
// Member has terminated too often in recent history.
failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m)
if err != nil {
log.Err(err).Warn("Failed to check is member failure is acceptable")
} else if failureAcceptable {
failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
if failureAcceptable {
log.Info("Member has terminated too often in recent history, marking is failed")
m.Phase = api.MemberPhaseFailed
status.Members.Update(m, group)
@ -123,42 +117,46 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
// isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member
// to failed, which means that it will be replaced.
// Return: failureAcceptable, notAcceptableReason, error
func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.ServerGroup, m api.MemberStatus) (bool, string, error) {
// Return: failureAcceptable, notAcceptableReason
func (r *Resilience) isMemberFailureAcceptable(group api.ServerGroup, m api.MemberStatus) (bool, string) {
// NOTE(review): this span is a rendered diff; the removed lines (three-value
// returns, ctx/client based check) and the added lines (two-value returns,
// agency cache based check) appear interleaved without +/- markers.
switch group {
case api.ServerGroupAgents:
// Agents: acceptable only when agency health information is available and
// its Healthy() check returns no error; otherwise the error text is the reason.
agencyHealth, ok := r.context.GetAgencyHealth()
if !ok {
return false, "AgencyHealth is not present", nil
return false, "AgencyHealth is not present"
}
if err := agencyHealth.Healthy(); err != nil {
return false, err.Error(), nil
return false, err.Error()
}
return true, "", nil
return true, ""
case api.ServerGroupDBServers:
// DBServers, removed variant: obtained a database client under a timeout
// context and treated a non-nil result of arangod.IsDBServerEmpty as "not acceptable".
// DBServers, added variant: reads the agency cache and refuses the failure
// while the member ID is still present in Plan or Current collections.
ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
defer cancel()
client, err := r.context.GetDatabaseClient(ctxChild)
if err != nil {
return false, "", errors.WithStack(err)
agencyState, ok := r.context.GetAgencyCache()
if !ok {
// NOTE(review): the reason below says "AgencyHealth" although the lookup that
// failed is GetAgencyCache - looks like a copy-paste from the Agents branch;
// confirm the intended wording.
return false, "AgencyHealth is not present"
}
if err := arangod.IsDBServerEmpty(ctx, m.ID, client); err != nil {
return false, err.Error(), nil
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
return false, "DBServer still in Plan"
}
return true, "", nil
if agencyState.Current.Collections.IsDBServerPresent(agency.Server(m.ID)) {
return false, "DBServer still in Current"
}
return true, ""
case api.ServerGroupCoordinators:
// Coordinators can be replaced at will
return true, "", nil
return true, ""
case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
// Sync masters & workers can be replaced at will
return true, "", nil
return true, ""
case api.ServerGroupSingle:
// Single servers are never reported as acceptable to fail.
return false, "ServerGroupSingle can not marked as a failed", nil
return false, "ServerGroupSingle can not marked as a failed"
default:
// TODO
// Any other group is not acceptable to fail, with the placeholder reason "TODO".
return false, "TODO", nil
return false, "TODO"
}
}