diff --git a/CHANGELOG.md b/CHANGELOG.md
index b70625968..eb3b2970f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 - (Feature) (EE) Ordered Member IDs
 - (Refactor) Deprecate ForeachServerGroup, ForeachServerInGroups and ForServerGroup functions and refactor code accordingly
 - (Bugfix) Memory leaks due to incorrect time.After function usage
+- (Feature) Add startup probe for coordinators
 
 ## [1.2.15](https://github.com/arangodb/kube-arangodb/tree/1.2.15) (2022-07-20)
 - (Bugfix) Ensure pod names not too long
diff --git a/pkg/deployment/deployment_suite_test.go b/pkg/deployment/deployment_suite_test.go
index 498d540ac..7d69aaf67 100644
--- a/pkg/deployment/deployment_suite_test.go
+++ b/pkg/deployment/deployment_suite_test.go
@@ -147,6 +147,16 @@ func createTestReadinessProbe(mode string, secure bool, authorization string) *c
 	return p
 }
 
+func createTestStartupProbe(mode string, secure bool, authorization string, failureThreshold int32) *core.Probe {
+	p := getProbeCreator(mode)(secure, authorization, "/_api/version", shared.ArangoPort).Create()
+
+	p.InitialDelaySeconds = 1
+	p.PeriodSeconds = 5
+	p.FailureThreshold = failureThreshold
+
+	return p
+}
+
 type probeCreator func(secure bool, authorization, endpoint string, port int) resources.Probe
 
 const (
@@ -576,6 +586,21 @@ func (testCase *testCaseStruct) createTestPodData(deployment *Deployment, group
 	groupSpec := testCase.ArangoDeployment.Spec.GetServerGroupSpec(group)
 	testCase.ExpectedPod.Spec.Tolerations = deployment.resources.CreatePodTolerations(group, groupSpec)
 
+	if group == api.ServerGroupCoordinators || group == api.ServerGroupDBServers {
+		// Set default startup probes.
+		isSecure := deployment.GetSpec().IsSecure()
+		var auth string
+		var retries int32 = 720 // one hour divided by the 5s period.
+		if group == api.ServerGroupDBServers {
+			retries = 4320 // 6 hours divided by the 5s period.
+		}
+		if deployment.GetSpec().IsAuthenticated() {
+			auth, _ = createTestToken(deployment, testCase, []string{"/_api/version"})
+		}
+
+		testCase.ExpectedPod.Spec.Containers[0].StartupProbe = createTestStartupProbe(httpProbe, isSecure, auth, retries)
+	}
+
 	// Add image info
 	if member, group, ok := deployment.apiObject.Status.Members.ElementByID(memberStatus.ID); ok {
 		member.Image = deployment.apiObject.Status.CurrentImage
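Spelled out as a plain Kubernetes object, the coordinator probe these tests expect looks roughly like the sketch below. This is a sketch, not operator output: it assumes the core/v1 ProbeHandler field name (k8s.io/api >= 0.23) and that shared.ArangoPort is the default ArangoDB port 8529; the secure and authenticated variants additionally set the HTTPS scheme and an Authorization header.

	package main

	import (
		"fmt"

		core "k8s.io/api/core/v1"
		"k8s.io/apimachinery/pkg/util/intstr"
	)

	// expectedCoordinatorStartupProbe mirrors what the tests assert: poll
	// /_api/version every 5s, tolerating up to 720 failures (one hour)
	// before the kubelet restarts the container.
	func expectedCoordinatorStartupProbe() *core.Probe {
		return &core.Probe{
			ProbeHandler: core.ProbeHandler{
				HTTPGet: &core.HTTPGetAction{
					Path: "/_api/version",
					Port: intstr.FromInt(8529), // shared.ArangoPort (assumed value)
				},
			},
			InitialDelaySeconds: 1,
			PeriodSeconds:       5,
			FailureThreshold:    720, // 4320 for DBServers
		}
	}

	func main() {
		fmt.Printf("%+v\n", expectedCoordinatorStartupProbe())
	}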
diff --git a/pkg/deployment/pod/probes.go b/pkg/deployment/pod/probes.go
index f993eb2a7..d114644e7 100644
--- a/pkg/deployment/pod/probes.go
+++ b/pkg/deployment/pod/probes.go
@@ -62,12 +62,12 @@ var probeMap = map[api.ServerGroup]probes{
 		readiness: newProbe(true, false),
 	},
 	api.ServerGroupDBServers: {
-		startup:   newProbe(true, false),
+		startup:   newProbe(true, true),
 		liveness:  newProbe(true, true),
 		readiness: newProbe(true, false),
 	},
 	api.ServerGroupCoordinators: {
-		startup:   newProbe(true, false),
+		startup:   newProbe(true, true),
 		liveness:  newProbe(true, false),
 		readiness: newProbe(true, true),
 	},
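The two newProbe booleans are positional here; a plausible reading, which is an assumption on my part and not confirmed by this diff, is (canBeEnabled, enabledByDefault), so flipping the second flag turns the startup probe on by default for DBServers and Coordinators while leaving room for a per-group override. A minimal sketch of that gating under this assumption (all names hypothetical):

	package main

	import "fmt"

	// probeFlags is a hypothetical stand-in for the unnamed booleans passed
	// to newProbe in probeMap; the field names are assumptions, not repo code.
	type probeFlags struct {
		canBeEnabled     bool // the group may have this probe at all
		enabledByDefault bool // the probe is created without explicit opt-in
	}

	// startupEnabled applies the assumed precedence: a hard "cannot enable"
	// wins, then an explicit user setting, then the group default.
	func startupEnabled(flags probeFlags, userSetting *bool) bool {
		if !flags.canBeEnabled {
			return false
		}
		if userSetting != nil {
			return *userSetting
		}
		return flags.enabledByDefault
	}

	func main() {
		coordinators := probeFlags{canBeEnabled: true, enabledByDefault: true} // after this PR
		fmt.Println(startupEnabled(coordinators, nil)) // true: probe added by default
		off := false
		fmt.Println(startupEnabled(coordinators, &off)) // false: an explicit spec override still wins
	}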
diff --git a/pkg/deployment/resilience/member_failure.go b/pkg/deployment/resilience/member_failure.go
index 14ea6c3f9..3f67c651f 100644
--- a/pkg/deployment/resilience/member_failure.go
+++ b/pkg/deployment/resilience/member_failure.go
@@ -31,7 +31,6 @@ import (
 
 const (
 	recentTerminationsSinceGracePeriod = time.Minute * 10
-	notReadySinceGracePeriod           = time.Minute * 5
 	recentTerminationThreshold         = 5
 )
 
@@ -68,23 +67,6 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
 			continue
 		}
 
-		// Check not ready for a long time
-		if !m.Phase.IsFailed() {
-			if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
-				// Member has terminated too often in recent history.
-
-				failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
-				if failureAcceptable {
-					log.Info("Member is not ready for long time, marking is failed")
-					m.Phase = api.MemberPhaseFailed
-					status.Members.Update(m, group)
-					updateStatusNeeded = true
-				} else {
-					log.Warn("Member is not ready for long time, but it is not safe to mark it a failed because: %s", reason)
-				}
-			}
-		}
-
 		// Check recent terminations
 		if !m.Phase.IsFailed() {
 			count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
diff --git a/pkg/deployment/resources/pod_creator_probes.go b/pkg/deployment/resources/pod_creator_probes.go
index a53c87a6b..a4c8ea615 100644
--- a/pkg/deployment/resources/pod_creator_probes.go
+++ b/pkg/deployment/resources/pod_creator_probes.go
@@ -24,6 +24,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"time"
 
 	core "k8s.io/api/core/v1"
 
@@ -280,19 +281,12 @@ func (r *Resources) probeBuilderStartupCoreOperator(spec api.DeploymentSpec, gro
 		return nil, err
 	}
 
-	var retries int32
-
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay
-	default:
-		retries = 60
-	}
+	retries, periodSeconds := getProbeRetries(group)
 
 	return &probes.CMDProbeConfig{
 		Command:             args,
 		FailureThreshold:    retries,
-		PeriodSeconds:       5,
+		PeriodSeconds:       periodSeconds,
 		InitialDelaySeconds: 1,
 	}, nil
 }
@@ -316,16 +310,8 @@ func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api.
 	}, nil
 }
 
-func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
-
-	var retries int32
-
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay
-	default:
-		retries = 60
-	}
+func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
+	retries, periodSeconds := getProbeRetries(group)
 
 	authorization := ""
 	if spec.IsAuthenticated() {
@@ -343,7 +329,7 @@ func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.S
 		Secure:              spec.IsSecure(),
 		Authorization:       authorization,
 		FailureThreshold:    retries,
-		PeriodSeconds:       5,
+		PeriodSeconds:       periodSeconds,
 		InitialDelaySeconds: 1,
 	}, nil
 }
@@ -482,7 +468,7 @@ func (r *Resources) probeBuilderLivenessSync(spec api.DeploymentSpec, group api.
 	}, nil
 }
 
-func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
+func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
 	authorization := ""
 	port := shared.ArangoSyncMasterPort
 	if group == api.ServerGroupSyncWorkers {
@@ -516,3 +502,19 @@ func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.S
 		Port:     port,
 	}, nil
 }
+
+// getProbeRetries returns how many attempts should be performed and the period in seconds between attempts.
+func getProbeRetries(group api.ServerGroup) (int32, int32) {
+	// Set default values.
+	period, howLong := 5*time.Second, 300*time.Second
+
+	if group == api.ServerGroupDBServers {
+		// Wait up to 6 hours for WAL replay.
+		howLong = 6 * time.Hour
+	} else if group == api.ServerGroupCoordinators {
+		// Coordinators wait for the agents, which can take more time to spin up.
+		howLong = time.Hour
+	}
+
+	return int32(howLong / period), int32(period / time.Second)
+}
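As a quick sanity check of the thresholds the tests assert, the getProbeRetries arithmetic can be restated standalone (a sketch; only the Duration math below comes from the diff):

	package main

	import (
		"fmt"
		"time"
	)

	// retries restates the getProbeRetries arithmetic: the total wait window
	// divided by the probe period gives the FailureThreshold.
	func retries(howLong, period time.Duration) (failureThreshold, periodSeconds int32) {
		return int32(howLong / period), int32(period / time.Second)
	}

	func main() {
		ft, ps := retries(time.Hour, 5*time.Second) // coordinators
		fmt.Println(ft, ps) // 720 5 -- matches the test expectation
		ft, ps = retries(6*time.Hour, 5*time.Second) // dbservers
		fmt.Println(ft, ps) // 4320 5 -- matches the test expectation
		ft, ps = retries(300*time.Second, 5*time.Second) // all other groups
		fmt.Println(ft, ps) // 60 5 -- same as the old hard-coded default
	}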