1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] Parametrize Scheduling Graceful Duration (#1641)

This commit is contained in:
Adam Janikowski 2024-04-08 13:03:01 +02:00 committed by GitHub
parent a4d7331a0c
commit 1d86f4ee29
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 39 additions and 1 deletions

View file

@ -20,6 +20,7 @@
- (Maintenance) Update Go to 1.22.2
- (Feature) Object Checksum
- (Bugfix) Use Rendered Spec in case of scheduling compare
- (Feature) Parametrize Scheduling Graceful Duration
## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11)
- (Feature) Extract Scheduler API

View file

@ -198,6 +198,7 @@ Flags:
--timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s)
--timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s)
--timeout.k8s duration The request timeout to the kubernetes (default 2s)
--timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s)
--timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s)
--timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s)
--timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s)

View file

@ -157,6 +157,7 @@ var (
backupArangoD time.Duration
backupUploadArangoD time.Duration
forcePodDeletionGracePeriod time.Duration
podSchedulingGracePeriod time.Duration
}
operatorImageDiscovery struct {
timeout time.Duration
@ -226,6 +227,7 @@ func init() {
f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls")
f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files")
f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals")
f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable")
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop")
@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) {
globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD)
globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD)
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod)
globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod)
globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay)
globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count)

View file

@ -23,12 +23,14 @@ package reconcile
import (
"context"
"reflect"
"time"
core "k8s.io/api/core/v1"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)
@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
var p api.Plan
if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 {
// Scheduling grace period is not enabled
return nil
}
if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
return p
}
@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
continue
}
if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok {
// Action cant proceed if pod is not scheduled
continue
} else if c.LastTransitionTime.IsZero() {
// LastTransitionTime is not set
continue
} else {
if time.Since(c.LastTransitionTime.Time) <= globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() {
// In grace period
continue
}
}
imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
if !imageFound {
l.Warn("could not find image for already created member")

View file

@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}
} else {
if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") {
updateMemberStatusNeeded = true
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}
if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) {
// Pod cannot be scheduled for to long
log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout")

View file

@ -29,6 +29,7 @@ const (
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute
DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute
DefaultPodSchedulingGracePeriod = 15 * time.Second
BackupDefaultArangoClientTimeout = 30 * time.Second
BackupUploadArangoClientTimeout = 300 * time.Second
@ -61,6 +62,7 @@ var globalObj = &globals{
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout),
podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod),
},
kubernetes: &globalKubernetes{
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
@ -147,6 +149,7 @@ type GlobalTimeouts interface {
Agency() Timeout
ForcePodDeletionGracePeriodTimeout() Timeout
PodSchedulingGracePeriod() Timeout
BackupArangoClientTimeout() Timeout
BackupArangoClientUploadTimeout() Timeout
@ -156,13 +159,17 @@ type globalTimeouts struct {
requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout
backupArangoClientTimeout Timeout
backupArangoClientUploadTimeout Timeout
forcePodDeletionGracePeriodTimeout Timeout
forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout
}
func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout {
return g.forcePodDeletionGracePeriodTimeout
}
func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout {
return g.podSchedulingGracePeriod
}
func (g *globalTimeouts) Agency() Timeout {
return g.agency
}