Mirror of https://github.com/arangodb/kube-arangodb.git

[Feature] Startup Coordinator probe (#1059)

Tomasz Mielech, 2022-07-25 09:28:58 +02:00 (committed by GitHub)
parent 1a44b19b7c
commit a8002e1e2b
5 changed files with 51 additions and 41 deletions


@@ -5,6 +5,7 @@
- (Feature) (EE) Ordered Member IDs
- (Refactor) Deprecate ForeachServerGroup, ForeachServerInGroups and ForServerGroup functions and refactor code accordingly
- (Bugfix) Memory leaks due to incorrect time.After function usage
+- (Feature) Add startup probe for coordinators
## [1.2.15](https://github.com/arangodb/kube-arangodb/tree/1.2.15) (2022-07-20)
- (Bugfix) Ensure pod names not too long
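For background on the time.After bugfix listed in this changelog: calling time.After inside a loop allocates a new timer on every iteration, and each timer stays reachable until its full duration elapses, even when another select case fires first. A minimal sketch of the pattern and the usual fix; this is illustrative only, not code from this repository:

package example

import "time"

// consume drains ch with an idle timeout. Reusing a single timer avoids
// the leak: a bare time.After(d) inside the loop would allocate a fresh
// timer on every iteration, none of which is released before d elapses.
func consume(ch <-chan int, handle func(int)) {
	t := time.NewTimer(time.Minute)
	defer t.Stop()
	for {
		select {
		case v := <-ch:
			handle(v)
			if !t.Stop() {
				<-t.C // drain a timer that already fired
			}
			t.Reset(time.Minute)
		case <-t.C:
			return // idle for a full minute
		}
	}
}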


@@ -147,6 +147,16 @@ func createTestReadinessProbe(mode string, secure bool, authorization string) *c
return p
}
+func createTestStartupProbe(mode string, secure bool, authorization string, failureThreshold int32) *core.Probe {
+	p := getProbeCreator(mode)(secure, authorization, "/_api/version", shared.ArangoPort).Create()
+	p.InitialDelaySeconds = 1
+	p.PeriodSeconds = 5
+	p.FailureThreshold = failureThreshold
+	return p
+}
type probeCreator func(secure bool, authorization, endpoint string, port int) resources.Probe
const (
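For reference, the probe this test helper builds should materialize as a Kubernetes core.Probe roughly like the sketch below. getProbeCreator is not shown in this diff, so the handler details (an HTTP GET on /_api/version against the standard ArangoDB port 8529) are an assumption:

package example

import (
	core "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// buildStartupProbe sketches the expected result of
// createTestStartupProbe(httpProbe, false, "", 720): an HTTP GET every
// 5 seconds, tolerating up to 720 failures before the kubelet restarts
// the container.
func buildStartupProbe() *core.Probe {
	return &core.Probe{
		ProbeHandler: core.ProbeHandler{
			HTTPGet: &core.HTTPGetAction{
				Path: "/_api/version",
				Port: intstr.FromInt(8529), // shared.ArangoPort
			},
		},
		InitialDelaySeconds: 1,
		PeriodSeconds:       5,
		FailureThreshold:    720,
	}
}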
@@ -576,6 +586,21 @@ func (testCase *testCaseStruct) createTestPodData(deployment *Deployment, group
groupSpec := testCase.ArangoDeployment.Spec.GetServerGroupSpec(group)
testCase.ExpectedPod.Spec.Tolerations = deployment.resources.CreatePodTolerations(group, groupSpec)
+	if group == api.ServerGroupCoordinators || group == api.ServerGroupDBServers {
+		// Set default startup probes.
+		isSecure := deployment.GetSpec().IsSecure()
+		var auth string
+		var retries int32 = 720 // one hour divided by the 5-second period.
+		if group == api.ServerGroupDBServers {
+			retries = 4320 // six hours divided by the 5-second period.
+		}
+		if deployment.GetSpec().IsAuthenticated() {
+			auth, _ = createTestToken(deployment, testCase, []string{"/_api/version"})
+		}
+		testCase.ExpectedPod.Spec.Containers[0].StartupProbe = createTestStartupProbe(httpProbe, isSecure, auth, retries)
+	}
// Add image info
if member, group, ok := deployment.apiObject.Status.Members.ElementByID(memberStatus.ID); ok {
member.Image = deployment.apiObject.Status.CurrentImage
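The retry counts above translate directly into the maximum wall-clock time the kubelet waits for startup: FailureThreshold × PeriodSeconds. A quick check of the values used in this test, via an illustrative helper that is not part of the commit:

package example

import "time"

// maxStartupWait computes how long the kubelet keeps probing before
// giving up: 720 × 5 s = 3600 s = 1 hour for coordinators, and
// 4320 × 5 s = 21600 s = 6 hours for DB servers.
func maxStartupWait(failureThreshold, periodSeconds int32) time.Duration {
	return time.Duration(failureThreshold) * time.Duration(periodSeconds) * time.Second
}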


@@ -62,12 +62,12 @@ var probeMap = map[api.ServerGroup]probes{
readiness: newProbe(true, false),
},
api.ServerGroupDBServers: {
-	startup: newProbe(true, false),
+	startup: newProbe(true, true),
liveness: newProbe(true, true),
readiness: newProbe(true, false),
},
api.ServerGroupCoordinators: {
-	startup: newProbe(true, false),
+	startup: newProbe(true, true),
liveness: newProbe(true, false),
readiness: newProbe(true, true),
},
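Read together with the rest of the commit, the second argument to newProbe plausibly means "enabled by default" (the first meaning "can be enabled at all"); flipping it to true is what switches startup probes on for DB servers and coordinators without an explicit spec entry. A minimal sketch of that pattern, with all names assumed rather than taken from this diff:

package example

// probe models a per-group probe policy; the field and parameter names
// here are assumptions for illustration only.
type probe struct {
	canBeEnabled     bool
	enabledByDefault bool
}

func newProbe(canBeEnabled, enabledByDefault bool) probe {
	return probe{canBeEnabled: canBeEnabled, enabledByDefault: enabledByDefault}
}

// isEnabled resolves the effective setting: an explicit spec value wins,
// otherwise the group default applies.
func (p probe) isEnabled(specValue *bool) bool {
	if !p.canBeEnabled {
		return false
	}
	if specValue != nil {
		return *specValue
	}
	return p.enabledByDefault
}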


@@ -31,7 +31,6 @@ import (
const (
recentTerminationsSinceGracePeriod = time.Minute * 10
notReadySinceGracePeriod = time.Minute * 5
recentTerminationThreshold = 5
)
@@ -68,23 +67,6 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
continue
}
-		// Check not ready for a long time
-		if !m.Phase.IsFailed() {
-			if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
-				// Member has been not ready for a long time.
-				failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
-				if failureAcceptable {
-					log.Info("Member is not ready for a long time, marking it as failed")
-					m.Phase = api.MemberPhaseFailed
-					status.Members.Update(m, group)
-					updateStatusNeeded = true
-				} else {
-					log.Warn("Member is not ready for a long time, but it is not safe to mark it as failed because: %s", reason)
-				}
-			}
-		}
// Check recent terminations
if !m.Phase.IsFailed() {
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
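With the long-not-ready auto-fail path removed (presumably because the new startup probes now cover slow boots), member failure detection here rests on the recent-terminations counter. Its grace-period cutoff works as in this simplified sketch, assuming RecentTerminationsSince counts termination timestamps newer than the given instant:

package example

import "time"

// countRecentTerminations illustrates the cutoff pattern used above:
// only terminations within the last 10 minutes
// (recentTerminationsSinceGracePeriod) are counted.
func countRecentTerminations(terminations []time.Time) int {
	cutoff := time.Now().Add(-10 * time.Minute)
	n := 0
	for _, ts := range terminations {
		if ts.After(cutoff) {
			n++
		}
	}
	return n
}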


@@ -24,6 +24,7 @@ import (
"fmt"
"os"
"path/filepath"
"time"
core "k8s.io/api/core/v1"
@@ -280,19 +281,12 @@ func (r *Resources) probeBuilderStartupCoreOperator(spec api.DeploymentSpec, gro
return nil, err
}
-	var retries int32
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for WAL replay
-	default:
-		retries = 60
-	}
+	retries, periodSeconds := getProbeRetries(group)
return &probes.CMDProbeConfig{
Command: args,
FailureThreshold: retries,
-		PeriodSeconds: 5,
+		PeriodSeconds: periodSeconds,
InitialDelaySeconds: 1,
}, nil
}
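The operator-command variant presumably renders as an exec-style probe rather than an HTTP one; the CMDProbeConfig-to-core.Probe conversion is not part of this diff, so the mapping below is a sketch:

package example

import core "k8s.io/api/core/v1"

// execStartupProbe sketches the Kubernetes probe a CMDProbeConfig with
// the values above would plausibly produce.
func execStartupProbe(args []string, retries, periodSeconds int32) *core.Probe {
	return &core.Probe{
		ProbeHandler: core.ProbeHandler{
			Exec: &core.ExecAction{Command: args},
		},
		InitialDelaySeconds: 1,
		PeriodSeconds:       periodSeconds,
		FailureThreshold:    retries,
	}
}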
@@ -316,16 +310,8 @@ func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api.
}, nil
}
-func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
-	var retries int32
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for WAL replay
-	default:
-		retries = 60
-	}
+func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
+	retries, periodSeconds := getProbeRetries(group)
authorization := ""
if spec.IsAuthenticated() {
@@ -343,7 +329,7 @@ func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.S
Secure: spec.IsSecure(),
Authorization: authorization,
FailureThreshold: retries,
-		PeriodSeconds: 5,
+		PeriodSeconds: periodSeconds,
InitialDelaySeconds: 1,
}, nil
}
@@ -482,7 +468,7 @@ func (r *Resources) probeBuilderLivenessSync(spec api.DeploymentSpec, group api.
}, nil
}
-func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
+func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
authorization := ""
port := shared.ArangoSyncMasterPort
if group == api.ServerGroupSyncWorkers {
@@ -516,3 +502,19 @@ func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.S
Port: port,
}, nil
}
+// getProbeRetries returns how many probe attempts should be performed and the
+// period in seconds between attempts.
+func getProbeRetries(group api.ServerGroup) (int32, int32) {
+	// Set default values.
+	period, howLong := 5*time.Second, 300*time.Second
+	if group == api.ServerGroupDBServers {
+		// Wait up to 6 hours for WAL replay.
+		howLong = 6 * time.Hour
+	} else if group == api.ServerGroupCoordinators {
+		// Coordinators must wait for agents, which can take longer to spin up.
+		howLong = time.Hour
+	}
+	return int32(howLong / period), int32(period / time.Second)
+}
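A table-driven test pinning down what getProbeRetries returns per group would look like the following; this test is hypothetical, not part of the commit, and assumed to live in the same package as getProbeRetries:

import "testing"

func TestGetProbeRetries(t *testing.T) {
	cases := []struct {
		group           api.ServerGroup
		retries, period int32
	}{
		{api.ServerGroupDBServers, 4320, 5},   // 6 h / 5 s
		{api.ServerGroupCoordinators, 720, 5}, // 1 h / 5 s
		{api.ServerGroupAgents, 60, 5},        // default: 300 s / 5 s
	}
	for _, c := range cases {
		retries, period := getProbeRetries(c.group)
		if retries != c.retries || period != c.period {
			t.Errorf("%v: got (%d, %d), want (%d, %d)",
				c.group, retries, period, c.retries, c.period)
		}
	}
}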