1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] GT-26 Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure (#989)

This commit is contained in:
jwierzbo 2022-06-10 13:30:56 +02:00 committed by GitHub
parent 9843a56d47
commit 4ff879f4f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 140 additions and 32 deletions

View file

@ -8,6 +8,7 @@
- (Feature) Add `ArangoBackup` CRD auto-installer
- (Feature) Add `ArangoBackupPolicy` CRD auto-installer
- (Feature) Add `ArangoJob` CRD auto-installer
- (Feature) Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
- (Bugfix) Fix arangosync members state inspection

View file

@ -97,6 +97,7 @@ Feature-wise production readiness table:
| Operator Internal Metrics Exporter | 1.2.0 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
| Operator Internal Metrics Exporter | 1.2.3 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | It is always enabled |
| Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.ephemeral-volumes | N/A |
| Pod RestartPolicyAlways | 1.2.13 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.restart-policy-always | N/A |
## Release notes for 0.3.16

View file

@ -0,0 +1,37 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package features
// init registers the restartPolicyAlways feature descriptor through
// registerFeature when the features package is initialised.
func init() {
registerFeature(restartPolicyAlways)
}
// restartPolicyAlways is the descriptor of the "restart-policy-always"
// feature. It is switched off by default and is not flagged as
// Enterprise-only.
var restartPolicyAlways = &feature{
	name:               "restart-policy-always",
	version:            "3.6.0",
	description:        "Allow to restart containers with always restart policy",
	enabledByDefault:   false,
	enterpriseRequired: false,
}
// RestartPolicyAlways returns the package-level descriptor of the
// "restart-policy-always" feature.
func RestartPolicyAlways() Feature {
return restartPolicyAlways
}

View file

@ -279,6 +279,10 @@ func (i *ImageUpdatePod) GetContainerCreator() interfaces.ContainerCreator {
return i.containerCreator
}
// GetRestartPolicy returns the restart policy used for the image update pod.
// It is always core.RestartPolicyNever, regardless of any feature flag.
func (i *ImageUpdatePod) GetRestartPolicy() core.RestartPolicy {
return core.RestartPolicyNever
}
// GetAffinityRole returns the affinity role of the image update pod, which is
// always the empty string.
func (i *ImageUpdatePod) GetAffinityRole() string {
return ""
}

View file

@ -22,14 +22,16 @@ package reconcile
import (
"context"
"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"github.com/rs/zerolog"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"time"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"github.com/rs/zerolog"
core "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func init() {
@ -257,9 +259,21 @@ func (a actionRuntimeContainerImageUpdate) CheckProgress(ctx context.Context) (b
return false, false, nil
}
// Pod wont get up and running
// Pod won't get up and running
return true, false, errors.Newf("Container %s failed during image replacement: (%d) %s: %s", name, s.ExitCode, s.Reason, s.Message)
} else if s := cstatus.State.Waiting; s != nil {
if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
lastTermination := cstatus.LastTerminationState.Terminated
if lastTermination != nil {
allowedRestartPeriod := time.Now().Add(time.Second * -20)
if lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
return true, false, errors.Newf("Container %s continuously failing during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
} else {
a.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
}
}
}
// Pod is still pulling image or pending for pod start
return false, false, nil
} else if s := cstatus.State.Running; s != nil {

View file

@ -158,6 +158,8 @@ func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.Se
case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
// Sync masters & workers can be replaced at will
return true, "", nil
case api.ServerGroupSingle:
return false, "ServerGroupSingle can not marked as a failed", nil
default:
// TODO
return false, "TODO", nil

View file

@ -493,6 +493,13 @@ func (m *MemberArangoDPod) GetContainerCreator() interfaces.ContainerCreator {
}
}
// GetRestartPolicy returns the restart policy for the ArangoDB member pod:
// core.RestartPolicyAlways when the restart-policy-always feature is enabled,
// core.RestartPolicyNever otherwise.
func (m *MemberArangoDPod) GetRestartPolicy() core.RestartPolicy {
	policy := core.RestartPolicyNever
	if features.RestartPolicyAlways().Enabled() {
		policy = core.RestartPolicyAlways
	}
	return policy
}
func (m *MemberArangoDPod) createMetricsExporterSidecarInternalExporter() (*core.Container, error) {
image := m.GetContainerCreator().GetImage()

View file

@ -24,24 +24,19 @@ import (
"context"
"math"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/arangodb/kube-arangodb/pkg/util/collection"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
"github.com/arangodb/kube-arangodb/pkg/util/constants"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/apis/shared"
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
"github.com/arangodb/kube-arangodb/pkg/util/collection"
"github.com/arangodb/kube-arangodb/pkg/util/constants"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const (
@ -312,6 +307,13 @@ func (m *MemberSyncPod) GetContainerCreator() interfaces.ContainerCreator {
}
}
// GetRestartPolicy returns the restart policy for the arangosync member pod:
// core.RestartPolicyAlways when the restart-policy-always feature is enabled,
// core.RestartPolicyNever otherwise.
func (m *MemberSyncPod) GetRestartPolicy() core.RestartPolicy {
	switch {
	case features.RestartPolicyAlways().Enabled():
		return core.RestartPolicyAlways
	default:
		return core.RestartPolicyNever
	}
}
// Init initializes the arangosync pod.
func (m *MemberSyncPod) Init(ctx context.Context, cachedStatus interfaces.Inspector, pod *core.Pod) error {
terminationGracePeriodSeconds := int64(math.Ceil(m.groupSpec.GetTerminationGracePeriod(m.group).Seconds()))

View file

@ -23,22 +23,20 @@ package resources
import (
"context"
"fmt"
"strings"
"time"
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
core "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"strings"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
podv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/pod/v1"
)
@ -48,11 +46,36 @@ var (
)
const (
podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod.
podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod.
terminationRestartPeriod = time.Second * -30 // If previous pod termination happened less than this time ago,
// we will mark the pod as scheduled for termination
// The value is negative because it is added to time.Now() to obtain the cutoff instant.
recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
maxPodInspectorInterval = util.Interval(time.Hour) // Maximum time between Pod inspection (if nothing else happens)
)
// handleRestartedPod inspects the server container of a pod and decides
// whether the member should be marked as terminated.
//
// Nothing is changed when the server container status is missing or the
// container is not currently in the Terminated state. Otherwise:
//   - *wasTerminated is set to true, so the caller does not record the
//     termination time a second time;
//   - the container's FinishedAt timestamp is appended to
//     memberStatus.RecentTerminations unless RecentTerminationsSince already
//     reports an entry for that instant;
//   - *markAsTerminated is set to true only when the previous termination
//     (LastTerminationState) finished no earlier than
//     time.Now().Add(terminationRestartPeriod), and to false otherwise.
func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
	status, found := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
	if !found || status.State.Terminated == nil {
		return
	}

	// The termination is recorded here, so the caller must not record it again.
	*wasTerminated = true

	finishedAt := status.State.Terminated.FinishedAt
	if memberStatus.RecentTerminationsSince(finishedAt.Time) == 0 {
		memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, finishedAt)
	}

	// A previous termination at or after the cutoff means the container keeps crashing.
	cutoff := time.Now().Add(terminationRestartPeriod)
	last := status.LastTerminationState.Terminated
	restartingTooOften := last != nil && !last.FinishedAt.Time.Before(cutoff)

	*markAsTerminated = restartingTooOften
	if restartingTooOften {
		r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
		return
	}
	r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
}
// InspectPods lists all pods that belong to the given deployment and updates
// the member status of the deployment accordingly.
// Returns: Interval_till_next_inspection, error
@ -102,10 +125,17 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
if k8sutil.IsPodSucceeded(pod, coreContainers) {
// Pod has terminated with exit code 0.
wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
markAsTerminated := true
if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
}
if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
updateMemberStatusNeeded = true
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
if !wasTerminated {
// Record termination time
now := meta.Now()
@ -115,7 +145,13 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
} else if k8sutil.IsPodFailed(pod, coreContainers) {
// Pod has terminated with at least 1 container with a non-zero exit code.
wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
markAsTerminated := true
if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
}
if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
if containers := k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses); len(containers) > 0 {
for _, container := range containers {
switch container {
@ -171,6 +207,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
updateMemberStatusNeeded = true
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
if !wasTerminated {
// Record termination time
now := meta.Now()

View file

@ -126,6 +126,8 @@ func IsRotationRequired(log zerolog.Logger, acs sutil.ACS, spec api.DeploymentSp
if mode, plan, err := compare(log, spec, member, group, specTemplate, statusTemplate); err != nil {
return SkippedRotation, nil, "", err
} else if mode == SkippedRotation {
return mode, plan, "No rotation needed", nil
} else {
return mode, plan, "Pod needs rotation", nil
}

View file

@ -151,7 +151,7 @@ func NewAccessPackageDeletedEvent(apiObject APIObject, apSecretName string) *Eve
return event
}
// NewPlanTimeoutEvent creates an event indicating that an item on a reconciliation plan has been added
// NewPlanAppendEvent creates an event indicating that an item on a reconciliation plan has been added
func NewPlanAppendEvent(apiObject APIObject, itemType, memberID, role, reason string) *Event {
event := newDeploymentEvent(apiObject)
event.Type = v1.EventTypeNormal

View file

@ -51,6 +51,7 @@ type PodCreator interface {
GetPodAntiAffinity() *core.PodAntiAffinity
GetPodAffinity() *core.PodAffinity
GetNodeAffinity() *core.NodeAffinity
GetRestartPolicy() core.RestartPolicy
GetContainerCreator() ContainerCreator
GetImagePullSecrets() []string
IsDeploymentMode() bool

View file

@ -526,7 +526,7 @@ func NewPod(deploymentName, role, id, podName string, podCreator interfaces.PodC
Spec: core.PodSpec{
Hostname: hostname,
Subdomain: CreateHeadlessServiceName(deploymentName),
RestartPolicy: core.RestartPolicyNever,
RestartPolicy: podCreator.GetRestartPolicy(),
Tolerations: podCreator.GetTolerations(),
ServiceAccountName: podCreator.GetServiceAccountName(),
NodeSelector: podCreator.GetNodeSelector(),