[Feature] GT-26 Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure (#989)
Parent: 9843a56d47
Commit: 4ff879f4f8
13 changed files with 140 additions and 32 deletions
@@ -8,6 +8,7 @@
 - (Feature) Add `ArangoBackup` CRD auto-installer
 - (Feature) Add `ArangoBackupPolicy` CRD auto-installer
 - (Feature) Add `ArangoJob` CRD auto-installer
+- (Feature) Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure

 ## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
 - (Bugfix) Fix arangosync members state inspection
@@ -97,6 +97,7 @@ Feature-wise production readiness table:
 | Operator Internal Metrics Exporter | 1.2.0 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
 | Operator Internal Metrics Exporter | 1.2.3 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | It is always enabled |
 | Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.ephemeral-volumes | N/A |
+| Pod RestartPolicyAlways | 1.2.13 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.restart-policy-always | N/A |

 ## Release notes for 0.3.16
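The new table row follows the operator's feature-flag naming scheme visible in the other rows: the feature name registered in Go ("restart-policy-always" in the new file below) is prefixed with `--deployment.feature.` to form the CLI flag. A trivial illustration of that mapping (the helper name is ours, not the operator's):

```go
package main

import "fmt"

// cliFlag derives the operator CLI flag from a registered feature
// name, matching rows such as restart-policy-always in the table
// above (illustrative helper, not part of the codebase).
func cliFlag(featureName string) string {
	return "--deployment.feature." + featureName
}

func main() {
	fmt.Println(cliFlag("restart-policy-always")) // --deployment.feature.restart-policy-always
}
```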
pkg/deployment/features/restart_policy_always.go (new file, +37 lines)
@@ -0,0 +1,37 @@
+//
+// DISCLAIMER
+//
+// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package features
+
+func init() {
+	registerFeature(restartPolicyAlways)
+}
+
+var restartPolicyAlways = &feature{
+	name:               "restart-policy-always",
+	description:        "Allow to restart containers with always restart policy",
+	version:            "3.6.0",
+	enterpriseRequired: false,
+	enabledByDefault:   false,
+}
+
+func RestartPolicyAlways() Feature {
+	return restartPolicyAlways
+}
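The `feature` struct, `Feature` interface, and `registerFeature` helper live elsewhere in the `features` package and are not part of this diff; the only method the rest of the commit relies on is `Enabled()`. A minimal sketch of the shapes this file assumes (names and fields are illustrative, not the operator's actual definitions):

```go
package features

// Feature is the read-only view handed out to callers such as
// MemberArangoDPod.GetRestartPolicy (illustrative shape).
type Feature interface {
	Name() string
	Enabled() bool
}

// feature is the package-internal implementation that files like
// restart_policy_always.go instantiate (illustrative shape).
type feature struct {
	name               string
	description        string
	version            string
	enterpriseRequired bool
	enabledByDefault   bool

	enabled bool // assumed to be toggled by the --deployment.feature.<name> CLI flag
}

func (f *feature) Name() string  { return f.name }
func (f *feature) Enabled() bool { return f.enabled }

// registerFeature adds a feature to a package-level registry so the
// operator can expose a CLI flag for it (illustrative).
var registered []*feature

func registerFeature(f *feature) {
	registered = append(registered, f)
}
```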
@@ -279,6 +279,10 @@ func (i *ImageUpdatePod) GetContainerCreator() interfaces.ContainerCreator {
 	return i.containerCreator
 }
 
+func (i *ImageUpdatePod) GetRestartPolicy() core.RestartPolicy {
+	return core.RestartPolicyNever
+}
+
 func (i *ImageUpdatePod) GetAffinityRole() string {
 	return ""
 }
@@ -22,14 +22,16 @@ package reconcile
 
 import (
 	"context"
-
-	"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
-	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
-	"github.com/rs/zerolog"
-	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"time"
 
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
 	"github.com/arangodb/kube-arangodb/pkg/util/errors"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+
+	"github.com/rs/zerolog"
+	core "k8s.io/api/core/v1"
+	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 func init() {
@@ -257,9 +259,21 @@ func (a actionRuntimeContainerImageUpdate) CheckProgress(ctx context.Context) (b
 			return false, false, nil
 		}
 
-		// Pod wont get up and running
+		// Pod won't get up and running
 		return true, false, errors.Newf("Container %s failed during image replacement: (%d) %s: %s", name, s.ExitCode, s.Reason, s.Message)
 	} else if s := cstatus.State.Waiting; s != nil {
+		if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
+			lastTermination := cstatus.LastTerminationState.Terminated
+			if lastTermination != nil {
+				allowedRestartPeriod := time.Now().Add(time.Second * -20)
+				if lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+					return true, false, errors.Newf("Container %s continuously failing during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
+				} else {
+					a.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
+				}
+			}
+		}
+
 		// Pod is still pulling image or pending for pod start
 		return false, false, nil
 	} else if s := cstatus.State.Running; s != nil {
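The 20-second window above is what separates a normal restart from a crash loop during image replacement: if the container is still in Waiting state and its previous run finished more than 20 seconds ago, it is treated as continuously failing. A self-contained sketch of that predicate (the helper name and hard-coded window are ours for illustration):

```go
package main

import (
	"fmt"
	"time"
)

// continuouslyFailing mimics the check above: a container in Waiting
// state whose previous run finished before (now - window) is treated
// as continuously failing rather than merely restarting.
func continuouslyFailing(lastFinishedAt time.Time, window time.Duration) bool {
	allowedRestartPeriod := time.Now().Add(-window)
	return lastFinishedAt.Before(allowedRestartPeriod)
}

func main() {
	window := 20 * time.Second

	// Terminated 5s ago: still within the allowed restart period.
	fmt.Println(continuouslyFailing(time.Now().Add(-5*time.Second), window)) // false
	// Terminated 2m ago and still not running: continuously failing.
	fmt.Println(continuouslyFailing(time.Now().Add(-2*time.Minute), window)) // true
}
```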
@@ -158,6 +158,8 @@ func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.Se
 	case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
 		// Sync masters & workers can be replaced at will
 		return true, "", nil
+	case api.ServerGroupSingle:
+		return false, "ServerGroupSingle can not marked as a failed", nil
 	default:
 		// TODO
 		return false, "TODO", nil
@@ -493,6 +493,13 @@ func (m *MemberArangoDPod) GetContainerCreator() interfaces.ContainerCreator {
 	}
 }
 
+func (m *MemberArangoDPod) GetRestartPolicy() core.RestartPolicy {
+	if features.RestartPolicyAlways().Enabled() {
+		return core.RestartPolicyAlways
+	}
+	return core.RestartPolicyNever
+}
+
 func (m *MemberArangoDPod) createMetricsExporterSidecarInternalExporter() (*core.Container, error) {
 	image := m.GetContainerCreator().GetImage()
@@ -24,24 +24,19 @@ import (
 	"context"
 	"math"
 
-	"github.com/arangodb/kube-arangodb/pkg/util/globals"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/errors"
-
-	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/collection"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
-
-	"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/constants"
-
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
 	"github.com/arangodb/kube-arangodb/pkg/apis/shared"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/features"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
+	"github.com/arangodb/kube-arangodb/pkg/util/collection"
+	"github.com/arangodb/kube-arangodb/pkg/util/constants"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
+	"github.com/arangodb/kube-arangodb/pkg/util/globals"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
 
 	core "k8s.io/api/core/v1"
+	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 const (
@@ -312,6 +307,13 @@ func (m *MemberSyncPod) GetContainerCreator() interfaces.ContainerCreator {
 	}
 }
 
+func (m *MemberSyncPod) GetRestartPolicy() core.RestartPolicy {
+	if features.RestartPolicyAlways().Enabled() {
+		return core.RestartPolicyAlways
+	}
+	return core.RestartPolicyNever
+}
+
 // Init initializes the arangosync pod.
 func (m *MemberSyncPod) Init(ctx context.Context, cachedStatus interfaces.Inspector, pod *core.Pod) error {
 	terminationGracePeriodSeconds := int64(math.Ceil(m.groupSpec.GetTerminationGracePeriod(m.group).Seconds()))
@@ -23,22 +23,20 @@ package resources
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
-	"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
-
-	"github.com/arangodb/kube-arangodb/pkg/util/errors"
-	inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
-
+	core "k8s.io/api/core/v1"
 	v1 "k8s.io/api/core/v1"
 	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 
-	"strings"
-
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
 	"github.com/arangodb/kube-arangodb/pkg/metrics"
 	"github.com/arangodb/kube-arangodb/pkg/util"
+	"github.com/arangodb/kube-arangodb/pkg/util/errors"
 	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+	inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
 	podv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/pod/v1"
 )
@@ -48,11 +46,36 @@
 )
 
 const (
-	podScheduleTimeout              = time.Minute                 // How long we allow the schedule to take scheduling a pod.
+	podScheduleTimeout       = time.Minute       // How long we allow the schedule to take scheduling a pod.
+	terminationRestartPeriod = time.Second * -30 // If previous pod termination happened less than this time ago,
+	// we will mark the pod as scheduled for termination
 	recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
 	maxPodInspectorInterval         = util.Interval(time.Hour)   // Maximum time between Pod inspection (if nothing else happens)
 )
 
+func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
+	containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
+	if exist && containerStatus.State.Terminated != nil {
+		// do not record termination time again in the code below
+		*wasTerminated = true
+
+		termination := containerStatus.State.Terminated.FinishedAt
+		if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
+			memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
+		}
+
+		previousTermination := containerStatus.LastTerminationState.Terminated
+		allowedRestartPeriod := time.Now().Add(terminationRestartPeriod)
+		if previousTermination != nil && !previousTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
+			r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
+			*markAsTerminated = true
+		} else {
+			*markAsTerminated = false
+			r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
+		}
+	}
+}
+
 // InspectPods lists all pods that belong to the given deployment and updates
 // the member status of the deployment accordingly.
 // Returns: Interval_till_next_inspection, error
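handleRestartedPod is what distinguishes a one-off restart from a crash loop: a previous termination inside the 30-second terminationRestartPeriod window means the pod is restarting continuously and should be marked Terminated after all. A self-contained sketch of that decision (helper name and pointer convention are ours, not the operator's):

```go
package main

import (
	"fmt"
	"time"
)

// crashLooping mirrors handleRestartedPod's core test: the pod is
// treated as continuously restarting when its previous container
// termination finished within the last `window` (30s in the diff,
// expressed there as the negative terminationRestartPeriod constant).
func crashLooping(previousFinishedAt *time.Time, window time.Duration) bool {
	if previousFinishedAt == nil {
		return false // first termination ever: allow the restart
	}
	allowedRestartPeriod := time.Now().Add(-window)
	return !previousFinishedAt.Before(allowedRestartPeriod)
}

func main() {
	window := 30 * time.Second

	fmt.Println(crashLooping(nil, window)) // false: no previous termination recorded

	tenSecAgo := time.Now().Add(-10 * time.Second)
	fmt.Println(crashLooping(&tenSecAgo, window)) // true: crashed again 10s after the last one

	fiveMinAgo := time.Now().Add(-5 * time.Minute)
	fmt.Println(crashLooping(&fiveMinAgo, window)) // false: restarts are far apart
}
```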
@@ -102,10 +125,17 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 		if k8sutil.IsPodSucceeded(pod, coreContainers) {
 			// Pod has terminated with exit code 0.
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
-			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
+			markAsTerminated := true
+
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
+				r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
+			}
+
+			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
 				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
 
 				if !wasTerminated {
 					// Record termination time
 					now := meta.Now()
@@ -115,7 +145,13 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 		} else if k8sutil.IsPodFailed(pod, coreContainers) {
 			// Pod has terminated with at least 1 container with a non-zero exit code.
 			wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
-			if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
+			markAsTerminated := true
+
+			if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
+				r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
+			}
+
+			if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
 				if containers := k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses); len(containers) > 0 {
 					for _, container := range containers {
 						switch container {
@@ -171,6 +207,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 				log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
 				updateMemberStatusNeeded = true
 				nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
+
 				if !wasTerminated {
 					// Record termination time
 					now := meta.Now()
@@ -126,6 +126,8 @@ func IsRotationRequired(log zerolog.Logger, acs sutil.ACS, spec api.DeploymentSp
 
 	if mode, plan, err := compare(log, spec, member, group, specTemplate, statusTemplate); err != nil {
 		return SkippedRotation, nil, "", err
+	} else if mode == SkippedRotation {
+		return mode, plan, "No rotation needed", nil
 	} else {
 		return mode, plan, "Pod needs rotation", nil
 	}
@@ -151,7 +151,7 @@ func NewAccessPackageDeletedEvent(apiObject APIObject, apSecretName string) *Eve
 	return event
 }
 
-// NewPlanTimeoutEvent creates an event indicating that an item on a reconciliation plan has been added
+// NewPlanAppendEvent creates an event indicating that an item on a reconciliation plan has been added
 func NewPlanAppendEvent(apiObject APIObject, itemType, memberID, role, reason string) *Event {
 	event := newDeploymentEvent(apiObject)
 	event.Type = v1.EventTypeNormal
@@ -51,6 +51,7 @@ type PodCreator interface {
 	GetPodAntiAffinity() *core.PodAntiAffinity
 	GetPodAffinity() *core.PodAffinity
 	GetNodeAffinity() *core.NodeAffinity
+	GetRestartPolicy() core.RestartPolicy
 	GetContainerCreator() ContainerCreator
 	GetImagePullSecrets() []string
 	IsDeploymentMode() bool
@@ -526,7 +526,7 @@ func NewPod(deploymentName, role, id, podName string, podCreator interfaces.PodC
 		Spec: core.PodSpec{
 			Hostname:           hostname,
 			Subdomain:          CreateHeadlessServiceName(deploymentName),
-			RestartPolicy:      core.RestartPolicyNever,
+			RestartPolicy:      podCreator.GetRestartPolicy(),
 			Tolerations:        podCreator.GetTolerations(),
 			ServiceAccountName: podCreator.GetServiceAccountName(),
 			NodeSelector:       podCreator.GetNodeSelector(),
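Taken together, the new interface method and the NewPod change mean each pod creator now decides its own restart policy, with the feature gate defaulting everything to the old Never behavior (and image-update pods pinned to Never regardless). A condensed, self-contained sketch of that wiring (the types below are simplified stand-ins, not the operator's interfaces package):

```go
package main

import (
	"fmt"

	core "k8s.io/api/core/v1"
)

// podCreator is a simplified stand-in for interfaces.PodCreator,
// reduced to the method this commit adds.
type podCreator interface {
	GetRestartPolicy() core.RestartPolicy
}

// memberPod mimics MemberArangoDPod: Always when the feature gate is
// on, otherwise the pre-commit default of Never.
type memberPod struct {
	restartPolicyAlwaysEnabled bool // stands in for features.RestartPolicyAlways().Enabled()
}

func (m memberPod) GetRestartPolicy() core.RestartPolicy {
	if m.restartPolicyAlwaysEnabled {
		return core.RestartPolicyAlways
	}
	return core.RestartPolicyNever
}

// newPodSpec mirrors the NewPod change: the restart policy is no
// longer hard-coded to Never but asked from the creator.
func newPodSpec(c podCreator) core.PodSpec {
	return core.PodSpec{RestartPolicy: c.GetRestartPolicy()}
}

func main() {
	fmt.Println(newPodSpec(memberPod{restartPolicyAlwaysEnabled: false}).RestartPolicy) // Never
	fmt.Println(newPodSpec(memberPod{restartPolicyAlwaysEnabled: true}).RestartPolicy)  // Always
}
```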