[Bugfix] Allow shards with RF1 in EnforcedResignLeadership action (#1441)

2024-12-14 11:57:37 +00:00 · 2023-10-15 17:29:40 +02:00 · 2023-10-15 17:29:40 +02:00 · ebd0dfdd5d
commit ebd0dfdd5d
parent 83c5c83589
3 changed files with 37 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,6 +13,7 @@
 - (Feature) EnforcedResignLeadership action
 - (Maintenance) Make scale_down_candidate annotation obsolete
 - (Bugfix) Fix ResignJob ID propagation
+- (Bugfix) Allow shards with RF1 in EnforcedResignLeadership action
 
 ## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
 - (Maintenance) Bump golang.org/x/net to v0.13.0
--- a/pkg/deployment/agency/state/state.go
+++ b/pkg/deployment/agency/state/state.go
@ -238,6 +238,30 @@ func (s State) PlanLeaderServers() Servers {
 	return r
 }

+// PlanLeaderServersWithFailOver returns the deduplicated servers that are the first (leader) entry of at least one planned shard listing more than one server, i.e. shards whose leadership can fail over.
+func (s State) PlanLeaderServersWithFailOver() Servers {
+	q := map[Server]bool{} // used as a set so each server is reported once
+
+	for _, db := range s.Plan.Collections {
+		for _, col := range db {
+			for _, shards := range col.Shards {
+				if len(shards) <= 1 { // no other server listed to take over; also guards the shards[0] access below
+					continue
+				}
+				q[shards[0]] = true // the first server in the list is taken as the shard leader
+			}
+		}
+	}
+
+	r := make([]Server, 0, len(q))
+
+	for k := range q { // map iteration, so the order of the result is not deterministic
+		r = append(r, k)
+	}
+
+	return r
+}
+
 type CollectionShardDetails []CollectionShardDetail

 type CollectionShardDetail struct {
--- a/pkg/deployment/reconcile/action_enforce_resign_leadership.go
+++ b/pkg/deployment/reconcile/action_enforce_resign_leadership.go
@ -103,18 +103,18 @@ func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool
 	}

 	// Lets start resign job if required
-	if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
+	if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" && j != "N/A" {
 		_, jobStatus := agencyState.Target.GetJob(state.JobID(j))
 		switch jobStatus {
 		case state.JobPhaseFailed:
 			a.log.Error("Resign server job failed")
 			// Remove key
-			a.actionCtx.Add(resignLeadershipJobID, "", true)
+			a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
 			return false, false, nil
 		case state.JobPhaseFinished:
 			a.log.Info("Job finished")
 			// Remove key
-			a.actionCtx.Add(resignLeadershipJobID, "", true)
+			a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
 		case state.JobPhaseUnknown:
 			a.log.Str("status", string(jobStatus)).Error("Resign server job unknown status")
 			return false, false, nil
@ -122,11 +122,18 @@ func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool
 			return false, false, nil
 		}

+		a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
+
 		// Job is Finished, check if we are not a leader anymore
 		if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
 			// We are still a leader!
-			a.log.Warn("DBServers is still a leader for shards")
-			return false, false, nil
+			if agencyState.PlanLeaderServersWithFailOver().Contains(state.Server(m.ID)) {
+				// We need to retry
+				a.log.Warn("DBServer is still a leader for shards")
+				return false, false, nil
+			}
+			// Nothing to do as RF is set to 1
+			a.log.Warn("DBServer is still a leader for shards, but ReplicationFactor is set to 1")
 		}
 		return true, false, nil
 	}