Mirror of https://github.com/arangodb/kube-arangodb.git

[Feature] Startup Coordinator probe (#1059)

Tomasz Mielech, 2022-07-25 09:28:58 +02:00 (committed by GitHub)
parent 1a44b19b7c
commit a8002e1e2b
5 changed files with 51 additions and 41 deletions


@@ -5,6 +5,7 @@
- (Feature) (EE) Ordered Member IDs
- (Refactor) Deprecate ForeachServerGroup, ForeachServerInGroups and ForServerGroup functions and refactor code accordingly
- (Bugfix) Memory leaks due to incorrect time.After function usage
+- (Feature) Add startup probe for coordinators
## [1.2.15](https://github.com/arangodb/kube-arangodb/tree/1.2.15) (2022-07-20)
- (Bugfix) Ensure pod names not too long
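For background on the time.After bugfix listed in this changelog: calling time.After inside a loop allocates a new timer on every iteration, and each timer stays reachable until its full duration elapses, even when another select case fires first. A minimal sketch of the pattern and the usual fix; this is illustrative only, not code from this repository:

package example

import "time"

// consume drains ch with an idle timeout. Reusing a single timer avoids
// the leak: a bare time.After(d) inside the loop would allocate a fresh
// timer on every iteration, none of which is released before d elapses.
func consume(ch <-chan int, handle func(int)) {
	t := time.NewTimer(time.Minute)
	defer t.Stop()
	for {
		select {
		case v := <-ch:
			handle(v)
			if !t.Stop() {
				<-t.C // drain a timer that already fired
			}
			t.Reset(time.Minute)
		case <-t.C:
			return // idle for a full minute
		}
	}
}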


@@ -147,6 +147,16 @@ func createTestReadinessProbe(mode string, secure bool, authorization string) *c
return p
}
+func createTestStartupProbe(mode string, secure bool, authorization string, failureThreshold int32) *core.Probe {
+	p := getProbeCreator(mode)(secure, authorization, "/_api/version", shared.ArangoPort).Create()
+	p.InitialDelaySeconds = 1
+	p.PeriodSeconds = 5
+	p.FailureThreshold = failureThreshold
+	return p
+}
type probeCreator func(secure bool, authorization, endpoint string, port int) resources.Probe
const (
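For reference, the probe this test helper builds should materialize as a Kubernetes core.Probe roughly like the sketch below. getProbeCreator is not shown in this diff, so the handler details (an HTTP GET on /_api/version against the standard ArangoDB port 8529) are an assumption:

package example

import (
	core "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// buildStartupProbe sketches the expected result of
// createTestStartupProbe(httpProbe, false, "", 720): an HTTP GET every
// 5 seconds, tolerating up to 720 failures before the kubelet restarts
// the container.
func buildStartupProbe() *core.Probe {
	return &core.Probe{
		ProbeHandler: core.ProbeHandler{
			HTTPGet: &core.HTTPGetAction{
				Path: "/_api/version",
				Port: intstr.FromInt(8529), // shared.ArangoPort
			},
		},
		InitialDelaySeconds: 1,
		PeriodSeconds:       5,
		FailureThreshold:    720,
	}
}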
@@ -576,6 +586,21 @@ func (testCase *testCaseStruct) createTestPodData(deployment *Deployment, group
groupSpec := testCase.ArangoDeployment.Spec.GetServerGroupSpec(group)
testCase.ExpectedPod.Spec.Tolerations = deployment.resources.CreatePodTolerations(group, groupSpec)
+	if group == api.ServerGroupCoordinators || group == api.ServerGroupDBServers {
+		// Set default startup probes.
+		isSecure := deployment.GetSpec().IsSecure()
+		var auth string
+		var retries int32 = 720 // one hour divided by the 5-second period.
+		if group == api.ServerGroupDBServers {
+			retries = 4320 // six hours divided by the 5-second period.
+		}
+		if deployment.GetSpec().IsAuthenticated() {
+			auth, _ = createTestToken(deployment, testCase, []string{"/_api/version"})
+		}
+		testCase.ExpectedPod.Spec.Containers[0].StartupProbe = createTestStartupProbe(httpProbe, isSecure, auth, retries)
+	}
// Add image info
if member, group, ok := deployment.apiObject.Status.Members.ElementByID(memberStatus.ID); ok {
member.Image = deployment.apiObject.Status.CurrentImage
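The retry counts above translate directly into the maximum wall-clock time the kubelet waits for startup: FailureThreshold × PeriodSeconds. A quick check of the values used in this test, via an illustrative helper that is not part of the commit:

package example

import "time"

// maxStartupWait computes how long the kubelet keeps probing before
// giving up: 720 × 5 s = 3600 s = 1 hour for coordinators, and
// 4320 × 5 s = 21600 s = 6 hours for DB servers.
func maxStartupWait(failureThreshold, periodSeconds int32) time.Duration {
	return time.Duration(failureThreshold) * time.Duration(periodSeconds) * time.Second
}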


@@ -62,12 +62,12 @@ var probeMap = map[api.ServerGroup]probes{
readiness: newProbe(true, false),
},
api.ServerGroupDBServers: {
-	startup: newProbe(true, false),
+	startup: newProbe(true, true),
liveness: newProbe(true, true),
readiness: newProbe(true, false),
},
api.ServerGroupCoordinators: {
-	startup: newProbe(true, false),
+	startup: newProbe(true, true),
liveness: newProbe(true, false),
readiness: newProbe(true, true),
},
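Read together with the rest of the commit, the second argument to newProbe plausibly means "enabled by default" (the first meaning "can be enabled at all"); flipping it to true is what switches startup probes on for DB servers and coordinators without an explicit spec entry. A minimal sketch of that pattern, with all names assumed rather than taken from this diff:

package example

// probe models a per-group probe policy; the field and parameter names
// here are assumptions for illustration only.
type probe struct {
	canBeEnabled     bool
	enabledByDefault bool
}

func newProbe(canBeEnabled, enabledByDefault bool) probe {
	return probe{canBeEnabled: canBeEnabled, enabledByDefault: enabledByDefault}
}

// isEnabled resolves the effective setting: an explicit spec value wins,
// otherwise the group default applies.
func (p probe) isEnabled(specValue *bool) bool {
	if !p.canBeEnabled {
		return false
	}
	if specValue != nil {
		return *specValue
	}
	return p.enabledByDefault
}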


@@ -31,7 +31,6 @@ import (
const (
recentTerminationsSinceGracePeriod = time.Minute * 10
notReadySinceGracePeriod = time.Minute * 5
recentTerminationThreshold = 5
)
@@ -68,23 +67,6 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
continue
}
-		// Check not ready for a long time
-		if !m.Phase.IsFailed() {
-			if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
-				// Member has been not ready for a long time.
-				failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
-				if failureAcceptable {
-					log.Info("Member is not ready for a long time, marking it as failed")
-					m.Phase = api.MemberPhaseFailed
-					status.Members.Update(m, group)
-					updateStatusNeeded = true
-				} else {
-					log.Warn("Member is not ready for a long time, but it is not safe to mark it as failed because: %s", reason)
-				}
-			}
-		}
// Check recent terminations
if !m.Phase.IsFailed() {
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
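With the long-not-ready auto-fail path removed (presumably because the new startup probes now cover slow boots), member failure detection here rests on the recent-terminations counter. Its grace-period cutoff works as in this simplified sketch, assuming RecentTerminationsSince counts termination timestamps newer than the given instant:

package example

import "time"

// countRecentTerminations illustrates the cutoff pattern used above:
// only terminations within the last 10 minutes
// (recentTerminationsSinceGracePeriod) are counted.
func countRecentTerminations(terminations []time.Time) int {
	cutoff := time.Now().Add(-10 * time.Minute)
	n := 0
	for _, ts := range terminations {
		if ts.After(cutoff) {
			n++
		}
	}
	return n
}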


@@ -24,6 +24,7 @@ import (
"fmt"
"os"
"path/filepath"
"time"
core "k8s.io/api/core/v1"
@@ -280,19 +281,12 @@ func (r *Resources) probeBuilderStartupCoreOperator(spec api.DeploymentSpec, gro
return nil, err
}
-	var retries int32
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for WAL replay
-	default:
-		retries = 60
-	}
+	retries, periodSeconds := getProbeRetries(group)
return &probes.CMDProbeConfig{
Command: args,
FailureThreshold: retries,
-		PeriodSeconds: 5,
+		PeriodSeconds: periodSeconds,
InitialDelaySeconds: 1,
}, nil
}
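The operator-command variant presumably renders as an exec-style probe rather than an HTTP one; the CMDProbeConfig-to-core.Probe conversion is not part of this diff, so the mapping below is a sketch:

package example

import core "k8s.io/api/core/v1"

// execStartupProbe sketches the Kubernetes probe a CMDProbeConfig with
// the values above would plausibly produce.
func execStartupProbe(args []string, retries, periodSeconds int32) *core.Probe {
	return &core.Probe{
		ProbeHandler: core.ProbeHandler{
			Exec: &core.ExecAction{Command: args},
		},
		InitialDelaySeconds: 1,
		PeriodSeconds:       periodSeconds,
		FailureThreshold:    retries,
	}
}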
@@ -316,16 +310,8 @@ func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api.
}, nil
}
-func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
-	var retries int32
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for WAL replay
-	default:
-		retries = 60
-	}
+func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
+	retries, periodSeconds := getProbeRetries(group)
authorization := ""
if spec.IsAuthenticated() {
@@ -343,7 +329,7 @@ func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.S
Secure: spec.IsSecure(),
Authorization: authorization,
FailureThreshold: retries,
-		PeriodSeconds: 5,
+		PeriodSeconds: periodSeconds,
InitialDelaySeconds: 1,
}, nil
}
@@ -482,7 +468,7 @@ func (r *Resources) probeBuilderLivenessSync(spec api.DeploymentSpec, group api.
}, nil
}
-func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
+func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
authorization := ""
port := shared.ArangoSyncMasterPort
if group == api.ServerGroupSyncWorkers {
@@ -516,3 +502,19 @@ func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.S
Port: port,
}, nil
}
+// getProbeRetries returns how many probe attempts should be performed and the
+// period in seconds between attempts.
+func getProbeRetries(group api.ServerGroup) (int32, int32) {
+	// Set default values.
+	period, howLong := 5*time.Second, 300*time.Second
+	if group == api.ServerGroupDBServers {
+		// Wait up to 6 hours for WAL replay.
+		howLong = 6 * time.Hour
+	} else if group == api.ServerGroupCoordinators {
+		// Coordinators must wait for agents, which can take longer to spin up.
+		howLong = time.Hour
+	}
+	return int32(howLong / period), int32(period / time.Second)
+}
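A table-driven test pinning down what getProbeRetries returns per group would look like the following; this test is hypothetical, not part of the commit, and assumed to live in the same package as getProbeRetries:

import "testing"

func TestGetProbeRetries(t *testing.T) {
	cases := []struct {
		group           api.ServerGroup
		retries, period int32
	}{
		{api.ServerGroupDBServers, 4320, 5},   // 6 h / 5 s
		{api.ServerGroupCoordinators, 720, 5}, // 1 h / 5 s
		{api.ServerGroupAgents, 60, 5},        // default: 300 s / 5 s
	}
	for _, c := range cases {
		retries, period := getProbeRetries(c.group)
		if retries != c.retries || period != c.period {
			t.Errorf("%v: got (%d, %d), want (%d, %d)",
				c.group, retries, period, c.retries, c.period)
		}
	}
}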