mirror of https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00

[Feature] Startup Coordinator probe (#1059)

This commit is contained in:
parent 1a44b19b7c
commit a8002e1e2b

5 changed files with 51 additions and 41 deletions
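In short: the commit enables startup probes by default for coordinators (and DB servers), and moves the retry/period arithmetic into a shared getProbeRetries helper. For orientation, here is a minimal hand-written sketch of the effective coordinator startup probe this produces — not the operator's actual builder output — assuming the HTTP probe mode used in the tests below, the ProbeHandler field of a Kubernetes 1.23+ core/v1 client, and 8529 as the value of shared.ArangoPort:

package main

import (
	"fmt"

	core "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// coordinatorStartupProbe approximates the probe the operator now attaches
// to coordinator containers; the real object is built by the probe builders
// changed in this commit.
func coordinatorStartupProbe(secure bool) *core.Probe {
	scheme := core.URISchemeHTTP
	if secure {
		scheme = core.URISchemeHTTPS
	}
	return &core.Probe{
		ProbeHandler: core.ProbeHandler{
			HTTPGet: &core.HTTPGetAction{
				Path:   "/_api/version",      // endpoint used in the tests below
				Port:   intstr.FromInt(8529), // assumed value of shared.ArangoPort
				Scheme: scheme,
			},
		},
		InitialDelaySeconds: 1,
		PeriodSeconds:       5,
		FailureThreshold:    720, // one hour of attempts at a 5-second period
	}
}

func main() {
	fmt.Printf("%+v\n", coordinatorStartupProbe(true))
}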
@@ -5,6 +5,7 @@
 - (Feature) (EE) Ordered Member IDs
 - (Refactor) Deprecate ForeachServerGroup, ForeachServerInGroups and ForServerGroup functions and refactor code accordingly
 - (Bugfix) Memory leaks due to incorrect time.After function usage
+- (Feature) Add startup probe for coordinators
 
 ## [1.2.15](https://github.com/arangodb/kube-arangodb/tree/1.2.15) (2022-07-20)
 - (Bugfix) Ensure pod names not too long
@@ -147,6 +147,16 @@ func createTestReadinessProbe(mode string, secure bool, authorization string) *c
 	return p
 }
 
+func createTestStartupProbe(mode string, secure bool, authorization string, failureThreshold int32) *core.Probe {
+	p := getProbeCreator(mode)(secure, authorization, "/_api/version", shared.ArangoPort).Create()
+
+	p.InitialDelaySeconds = 1
+	p.PeriodSeconds = 5
+	p.FailureThreshold = failureThreshold
+
+	return p
+}
+
 type probeCreator func(secure bool, authorization, endpoint string, port int) resources.Probe
 
 const (
@@ -576,6 +586,21 @@ func (testCase *testCaseStruct) createTestPodData(deployment *Deployment, group
 	groupSpec := testCase.ArangoDeployment.Spec.GetServerGroupSpec(group)
 	testCase.ExpectedPod.Spec.Tolerations = deployment.resources.CreatePodTolerations(group, groupSpec)
 
+	if group == api.ServerGroupCoordinators || group == api.ServerGroupDBServers {
+		// Set default startup probes.
+		isSecure := deployment.GetSpec().IsSecure()
+		var auth string
+		var retries int32 = 720 // one hour divided by 5 seconds.
+		if group == api.ServerGroupDBServers {
+			retries = 4320 // 6 hours divided by 5 seconds.
+		}
+		if deployment.GetSpec().IsAuthenticated() {
+			auth, _ = createTestToken(deployment, testCase, []string{"/_api/version"})
+		}
+
+		testCase.ExpectedPod.Spec.Containers[0].StartupProbe = createTestStartupProbe(httpProbe, isSecure, auth, retries)
+	}
+
 	// Add image info
 	if member, group, ok := deployment.apiObject.Status.Members.ElementByID(memberStatus.ID); ok {
 		member.Image = deployment.apiObject.Status.CurrentImage
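The failure thresholds in the test expectations above follow directly from the probe period: the threshold is the total startup budget divided by the 5-second period. A quick standalone check of the numbers (not part of the commit):

package main

import "fmt"

func main() {
	const periodSeconds = 5
	fmt.Println(1 * 60 * 60 / periodSeconds) // 720  -> coordinator budget: one hour
	fmt.Println(6 * 60 * 60 / periodSeconds) // 4320 -> DB server budget: six hours
}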
@@ -62,12 +62,12 @@ var probeMap = map[api.ServerGroup]probes{
 		readiness: newProbe(true, false),
 	},
 	api.ServerGroupDBServers: {
-		startup:   newProbe(true, false),
+		startup:   newProbe(true, true),
 		liveness:  newProbe(true, true),
 		readiness: newProbe(true, false),
 	},
 	api.ServerGroupCoordinators: {
-		startup:   newProbe(true, false),
+		startup:   newProbe(true, true),
 		liveness:  newProbe(true, false),
 		readiness: newProbe(true, true),
 	},
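Only the second argument to newProbe changes here. Judging from the changelog entry and the surrounding map, a plausible reading — an assumption, since the definition is not part of this diff — is that the first flag marks the probe as supported for the group and the second enables it by default:

package resources

// Hypothetical shape of the helper, inferred from its usage in probeMap above;
// the actual definition is not shown in this diff.
type probe struct {
	canBeEnabled     bool // the probe may be switched on for this server group
	enabledByDefault bool // the probe is created even without explicit opt-in
}

func newProbe(canBeEnabled, enabledByDefault bool) probe {
	return probe{canBeEnabled: canBeEnabled, enabledByDefault: enabledByDefault}
}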
@@ -31,7 +31,6 @@ import (
 
 const (
 	recentTerminationsSinceGracePeriod = time.Minute * 10
-	notReadySinceGracePeriod           = time.Minute * 5
 	recentTerminationThreshold         = 5
 )
 
@@ -68,23 +67,6 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
 			continue
 		}
 
-		// Check not ready for a long time
-		if !m.Phase.IsFailed() {
-			if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
-				// Member has terminated too often in recent history.
-
-				failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
-				if failureAcceptable {
-					log.Info("Member is not ready for long time, marking is failed")
-					m.Phase = api.MemberPhaseFailed
-					status.Members.Update(m, group)
-					updateStatusNeeded = true
-				} else {
-					log.Warn("Member is not ready for long time, but it is not safe to mark it a failed because: %s", reason)
-				}
-			}
-		}
-
 		// Check recent terminations
 		if !m.Phase.IsFailed() {
 			count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
@@ -24,6 +24,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"time"
 
 	core "k8s.io/api/core/v1"
 
@@ -280,19 +281,12 @@ func (r *Resources) probeBuilderStartupCoreOperator(spec api.DeploymentSpec, gro
 		return nil, err
 	}
 
-	var retries int32
-
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay
-	default:
-		retries = 60
-	}
+	retries, periodSeconds := getProbeRetries(group)
 
 	return &probes.CMDProbeConfig{
 		Command:             args,
 		FailureThreshold:    retries,
-		PeriodSeconds:       5,
+		PeriodSeconds:       periodSeconds,
 		InitialDelaySeconds: 1,
 	}, nil
 }
@@ -316,16 +310,8 @@ func (r *Resources) probeBuilderLivenessCore(spec api.DeploymentSpec, group api.
 	}, nil
 }
 
-func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
-
-	var retries int32
-
-	switch group {
-	case api.ServerGroupDBServers:
-		retries = 6 * 60 * 60 / 5 // Wait 6 hours for wal replay
-	default:
-		retries = 60
-	}
+func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
+	retries, periodSeconds := getProbeRetries(group)
 
 	authorization := ""
 	if spec.IsAuthenticated() {
@@ -343,7 +329,7 @@ func (r *Resources) probeBuilderStartupCore(spec api.DeploymentSpec, group api.S
 		Secure:              spec.IsSecure(),
 		Authorization:       authorization,
 		FailureThreshold:    retries,
-		PeriodSeconds:       5,
+		PeriodSeconds:       periodSeconds,
 		InitialDelaySeconds: 1,
 	}, nil
 }
@@ -482,7 +468,7 @@ func (r *Resources) probeBuilderLivenessSync(spec api.DeploymentSpec, group api.
 	}, nil
 }
 
-func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, version driver.Version) (Probe, error) {
+func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.ServerGroup, _ driver.Version) (Probe, error) {
 	authorization := ""
 	port := shared.ArangoSyncMasterPort
 	if group == api.ServerGroupSyncWorkers {
@@ -516,3 +502,19 @@ func (r *Resources) probeBuilderStartupSync(spec api.DeploymentSpec, group api.S
 		Port: port,
 	}, nil
 }
+
+// getProbeRetries returns how many attempts should be performed and the period in seconds between those attempts.
+func getProbeRetries(group api.ServerGroup) (int32, int32) {
+	// Set default values.
+	period, howLong := 5*time.Second, 300*time.Second
+
+	if group == api.ServerGroupDBServers {
+		// Wait 6 hours for WAL replay.
+		howLong = 6 * time.Hour
+	} else if group == api.ServerGroupCoordinators {
+		// Coordinators should wait for the agents, which can take longer to spin up.
+		howLong = time.Hour
+	}
+
+	return int32(howLong / period), int32(period / time.Second)
+}
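The helper makes the retry budget explicit: 6h / 5s = 4320 attempts for DB servers, 1h / 5s = 720 for coordinators, and 300s / 5s = 60 for everything else. A standalone sketch of the same logic, with the api.ServerGroup constants stubbed as strings since this snippet lives outside the repository:

package main

import (
	"fmt"
	"time"
)

// serverGroup stands in for the operator's api.ServerGroup type.
type serverGroup string

const (
	dbservers    serverGroup = "dbservers"
	coordinators serverGroup = "coordinators"
	agents       serverGroup = "agents"
)

// getProbeRetries mirrors the helper added in this commit.
func getProbeRetries(group serverGroup) (int32, int32) {
	period, howLong := 5*time.Second, 300*time.Second
	switch group {
	case dbservers:
		howLong = 6 * time.Hour // WAL replay budget
	case coordinators:
		howLong = time.Hour // waiting for agents to spin up
	}
	return int32(howLong / period), int32(period / time.Second)
}

func main() {
	for _, g := range []serverGroup{dbservers, coordinators, agents} {
		retries, period := getProbeRetries(g)
		fmt.Printf("%-12s retries=%d period=%ds\n", g, retries, period)
	}
}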