mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
Detect member failure and replace member if needed
This commit is contained in:
parent
fbc0d1b84c
commit
0fdb0c0c35
18 changed files with 342 additions and 112 deletions
43
pkg/apis/deployment/v1alpha/member_phase.go
Normal file
43
pkg/apis/deployment/v1alpha/member_phase.go
Normal file
|
@ -0,0 +1,43 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package v1alpha
|
||||
|
||||
// MemberPhase is a strongly typed name for the lifetime phase of a
// deployment member. Its underlying representation is a plain string.
type MemberPhase string

// The set of member phases declared in this file.
const (
	// MemberPhaseNone is the empty phase; it indicates that no phase has been set yet.
	MemberPhaseNone = MemberPhase("")
	// MemberPhaseCreated indicates that all resources needed for the member have been created.
	MemberPhaseCreated = MemberPhase("Created")
	// MemberPhaseFailed indicates that the member is gone beyond hope of recovery.
	// Such a member must be replaced with a new member.
	MemberPhaseFailed = MemberPhase("Failed")
	// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out.
	MemberPhaseCleanOut = MemberPhase("CleanOut")
	// MemberPhaseShuttingDown indicates that a member is shutting down.
	MemberPhaseShuttingDown = MemberPhase("ShuttingDown")
	// MemberPhaseRotating indicates that a member is being rotated.
	MemberPhaseRotating = MemberPhase("Rotating")
	// MemberPhaseUpgrading indicates that a member is in the process of
	// upgrading its database data format.
	MemberPhaseUpgrading = MemberPhase("Upgrading")
)
|
|
@ -1,41 +0,0 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package v1alpha
|
||||
|
||||
// MemberState is a strongly typed name for the state of a deployment member.
// Its underlying representation is a plain string.
type MemberState string

// The set of member states declared in this file.
const (
	// MemberStateNone is the empty state; it indicates that the state is not set yet.
	MemberStateNone = MemberState("")
	// MemberStateCreated indicates that all resources needed for the member have been created.
	MemberStateCreated = MemberState("Created")
	// MemberStateCleanOut indicates that a dbserver is in the process of being cleaned out.
	MemberStateCleanOut = MemberState("CleanOut")
	// MemberStateShuttingDown indicates that a member is shutting down.
	MemberStateShuttingDown = MemberState("ShuttingDown")
	// MemberStateRotating indicates that a member is being rotated.
	MemberStateRotating = MemberState("Rotating")
	// MemberStateUpgrading indicates that a member is in the process of
	// upgrading its database data format.
	MemberStateUpgrading = MemberState("Upgrading")
)
|
|
@ -33,8 +33,8 @@ type MemberStatus struct {
|
|||
// ID holds the unique ID of the member.
|
||||
// This id is also used within the ArangoDB cluster to identify this server.
|
||||
ID string `json:"id"`
|
||||
// State holds the current state of this member
|
||||
State MemberState `json:"state"`
|
||||
// Phase holds the current lifetime phase of this member
|
||||
Phase MemberPhase `json:"phase"`
|
||||
// PersistentVolumeClaimName holds the name of the persistent volume claim used for this member (if any).
|
||||
PersistentVolumeClaimName string `json:"persistentVolumeClaimName,omitempty"`
|
||||
// PodName holds the name of the Pod that currently runs this member
|
||||
|
|
|
@ -108,7 +108,7 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
|
|||
if len(l) > 0 {
|
||||
// Try to find a not ready member
|
||||
for _, m := range l {
|
||||
if m.State == MemberStateNone {
|
||||
if m.Phase == MemberPhaseNone {
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
|
@ -116,7 +116,7 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
|
|||
perm := rand.Perm(len(l))
|
||||
for _, idx := range perm {
|
||||
m := l[idx]
|
||||
if m.State == MemberStateCreated {
|
||||
if m.Phase == MemberPhaseCreated {
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ import (
|
|||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/reconcile"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
|
||||
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
|
||||
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
|
||||
|
@ -92,6 +93,7 @@ type Deployment struct {
|
|||
recentInspectionErrors int
|
||||
clusterScalingIntegration *clusterScalingIntegration
|
||||
reconciler *reconcile.Reconciler
|
||||
resilience *resilience.Resilience
|
||||
resources *resources.Resources
|
||||
}
|
||||
|
||||
|
@ -111,6 +113,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
|
|||
clientCache: newClientCache(deps.KubeCli, apiObject),
|
||||
}
|
||||
d.reconciler = reconcile.NewReconciler(deps.Log, d)
|
||||
d.resilience = resilience.NewResilience(deps.Log, d)
|
||||
d.resources = resources.NewResources(deps.Log, d)
|
||||
if d.status.AcceptedSpec == nil {
|
||||
// We've validated the spec, so let's use it from now.
|
||||
|
|
|
@ -76,6 +76,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
|
|||
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
|
||||
}
|
||||
|
||||
// Check members for resilience
|
||||
if err := d.resilience.CheckMemberFailure(); err != nil {
|
||||
hasError = true
|
||||
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
|
||||
}
|
||||
|
||||
// Create scale/update plan
|
||||
if err := d.reconciler.CreatePlan(); err != nil {
|
||||
hasError = true
|
||||
|
|
|
@ -81,7 +81,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding single server")
|
||||
if err := d.status.Members.Single.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
@ -91,7 +91,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding agent")
|
||||
if err := d.status.Members.Agents.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
@ -101,7 +101,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding dbserver")
|
||||
if err := d.status.Members.DBServers.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
@ -111,7 +111,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding coordinator")
|
||||
if err := d.status.Members.Coordinators.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: "",
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
@ -121,7 +121,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding syncmaster")
|
||||
if err := d.status.Members.SyncMasters.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: "",
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
@ -131,7 +131,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
|
|||
log.Debug().Str("id", id).Msg("Adding syncworker")
|
||||
if err := d.status.Members.SyncWorkers.Add(api.MemberStatus{
|
||||
ID: id,
|
||||
State: api.MemberStateNone,
|
||||
Phase: api.MemberPhaseNone,
|
||||
PersistentVolumeClaimName: "",
|
||||
PodName: "",
|
||||
}); err != nil {
|
||||
|
|
|
@ -71,7 +71,7 @@ func (a *actionCleanoutMember) Start(ctx context.Context) (bool, error) {
|
|||
return false, maskAny(err)
|
||||
}
|
||||
// Update status
|
||||
m.State = api.MemberStateCleanOut
|
||||
m.Phase = api.MemberPhaseCleanOut
|
||||
if a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
}
|
||||
|
|
|
@ -83,7 +83,7 @@ func (a *actionRotateMember) Start(ctx context.Context) (bool, error) {
|
|||
}
|
||||
}
|
||||
// Update status
|
||||
m.State = api.MemberStateRotating
|
||||
m.Phase = api.MemberPhaseRotating
|
||||
if err := a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
}
|
||||
|
@ -109,7 +109,7 @@ func (a *actionRotateMember) CheckProgress(ctx context.Context) (bool, error) {
|
|||
return false, maskAny(err)
|
||||
}
|
||||
// Pod is now gone, update the member status
|
||||
m.State = api.MemberStateNone
|
||||
m.Phase = api.MemberPhaseNone
|
||||
m.RecentTerminations = nil // Since we're rotating, we do not care about old terminations.
|
||||
if err := a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
|
|
|
@ -89,7 +89,7 @@ func (a *actionShutdownMember) Start(ctx context.Context) (bool, error) {
|
|||
}
|
||||
}
|
||||
// Update status
|
||||
m.State = api.MemberStateShuttingDown
|
||||
m.Phase = api.MemberPhaseShuttingDown
|
||||
if err := a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
}
|
||||
|
|
|
@ -88,7 +88,7 @@ func (a *actionUpgradeMember) Start(ctx context.Context) (bool, error) {
|
|||
}
|
||||
}
|
||||
// Update status
|
||||
m.State = api.MemberStateRotating // We keep the rotation state here, since only when a new pod is created, it will get the Upgrading state.
|
||||
m.Phase = api.MemberPhaseRotating // We keep the rotation phase here, since only when a new pod is created, it will get the Upgrading phase.
|
||||
if err := a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
}
|
||||
|
@ -105,7 +105,7 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) {
|
|||
log.Error().Msg("No such member")
|
||||
return true, nil
|
||||
}
|
||||
isUpgrading := m.State == api.MemberStateUpgrading
|
||||
isUpgrading := m.Phase == api.MemberPhaseUpgrading
|
||||
log = log.With().
|
||||
Str("pod-name", m.PodName).
|
||||
Bool("is-upgrading", isUpgrading).Logger()
|
||||
|
@ -119,7 +119,7 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) {
|
|||
return false, maskAny(err)
|
||||
}
|
||||
// Pod is now gone, update the member status
|
||||
m.State = api.MemberStateNone
|
||||
m.Phase = api.MemberPhaseNone
|
||||
m.RecentTerminations = nil // Since we're upgrading, we do not care about old terminations.
|
||||
if err := a.actionCtx.UpdateMember(m); err != nil {
|
||||
return false, maskAny(err)
|
||||
|
|
|
@ -85,58 +85,72 @@ func createPlan(log zerolog.Logger, apiObject metav1.Object,
|
|||
// Check for various scenarios
|
||||
var plan api.Plan
|
||||
|
||||
// Check for scale up/down
|
||||
switch spec.GetMode() {
|
||||
case api.DeploymentModeSingle:
|
||||
// Never scale down
|
||||
case api.DeploymentModeResilientSingle:
|
||||
// Only scale singles
|
||||
plan = append(plan, createScalePlan(log, status.Members.Single, api.ServerGroupSingle, spec.Single.GetCount())...)
|
||||
case api.DeploymentModeCluster:
|
||||
// Scale dbservers, coordinators, syncmasters & syncworkers
|
||||
plan = append(plan, createScalePlan(log, status.Members.DBServers, api.ServerGroupDBServers, spec.DBServers.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.Coordinators, api.ServerGroupCoordinators, spec.Coordinators.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.SyncMasters, api.ServerGroupSyncMasters, spec.SyncMasters.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.SyncWorkers, api.ServerGroupSyncWorkers, spec.SyncWorkers.GetCount())...)
|
||||
}
|
||||
|
||||
// Check for the need to rotate one or more members
|
||||
getPod := func(podName string) *v1.Pod {
|
||||
for _, p := range pods {
|
||||
if p.GetName() == podName {
|
||||
return &p
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// Check for members in failed state
|
||||
status.Members.ForeachServerGroup(func(group api.ServerGroup, members *api.MemberStatusList) error {
|
||||
for _, m := range *members {
|
||||
if len(plan) > 0 {
|
||||
// Only 1 change at a time
|
||||
continue
|
||||
}
|
||||
if m.State != api.MemberStateCreated {
|
||||
// Only rotate when state is created
|
||||
continue
|
||||
}
|
||||
if podName := m.PodName; podName != "" {
|
||||
if p := getPod(podName); p != nil {
|
||||
// Got pod, compare it with what it should be
|
||||
decision := podNeedsUpgrading(*p, spec, status.Images)
|
||||
if decision.UpgradeNeeded && decision.UpgradeAllowed {
|
||||
plan = append(plan, createUpgradeMemberPlan(log, m, group, "Version upgrade")...)
|
||||
} else {
|
||||
rotNeeded, reason := podNeedsRotation(*p, apiObject, spec, group, status.Members.Agents, m.ID)
|
||||
if rotNeeded {
|
||||
plan = append(plan, createRotateMemberPlan(log, m, group, reason)...)
|
||||
}
|
||||
}
|
||||
}
|
||||
if m.Phase == api.MemberPhaseFailed && len(plan) == 0 {
|
||||
plan = append(plan, api.NewAction(api.ActionTypeRemoveMember, group, m.ID))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Check for scale up/down
|
||||
if len(plan) == 0 {
|
||||
switch spec.GetMode() {
|
||||
case api.DeploymentModeSingle:
|
||||
// Never scale down
|
||||
case api.DeploymentModeResilientSingle:
|
||||
// Only scale singles
|
||||
plan = append(plan, createScalePlan(log, status.Members.Single, api.ServerGroupSingle, spec.Single.GetCount())...)
|
||||
case api.DeploymentModeCluster:
|
||||
// Scale dbservers, coordinators, syncmasters & syncworkers
|
||||
plan = append(plan, createScalePlan(log, status.Members.DBServers, api.ServerGroupDBServers, spec.DBServers.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.Coordinators, api.ServerGroupCoordinators, spec.Coordinators.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.SyncMasters, api.ServerGroupSyncMasters, spec.SyncMasters.GetCount())...)
|
||||
plan = append(plan, createScalePlan(log, status.Members.SyncWorkers, api.ServerGroupSyncWorkers, spec.SyncWorkers.GetCount())...)
|
||||
}
|
||||
}
|
||||
|
||||
// Check for the need to rotate one or more members
|
||||
if len(plan) == 0 {
|
||||
getPod := func(podName string) *v1.Pod {
|
||||
for _, p := range pods {
|
||||
if p.GetName() == podName {
|
||||
return &p
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
status.Members.ForeachServerGroup(func(group api.ServerGroup, members *api.MemberStatusList) error {
|
||||
for _, m := range *members {
|
||||
if len(plan) > 0 {
|
||||
// Only 1 change at a time
|
||||
continue
|
||||
}
|
||||
if m.Phase != api.MemberPhaseCreated {
|
||||
// Only rotate when phase is created
|
||||
continue
|
||||
}
|
||||
if podName := m.PodName; podName != "" {
|
||||
if p := getPod(podName); p != nil {
|
||||
// Got pod, compare it with what it should be
|
||||
decision := podNeedsUpgrading(*p, spec, status.Images)
|
||||
if decision.UpgradeNeeded && decision.UpgradeAllowed {
|
||||
plan = append(plan, createUpgradeMemberPlan(log, m, group, "Version upgrade")...)
|
||||
} else {
|
||||
rotNeeded, reason := podNeedsRotation(*p, apiObject, spec, group, status.Members.Agents, m.ID)
|
||||
if rotNeeded {
|
||||
plan = append(plan, createRotateMemberPlan(log, m, group, reason)...)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// Return plan
|
||||
return plan, true
|
||||
}
|
||||
|
|
38
pkg/deployment/resilience/context.go
Normal file
38
pkg/deployment/resilience/context.go
Normal file
|
@ -0,0 +1,38 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package resilience
|
||||
|
||||
import (
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
|
||||
)
|
||||
|
||||
// Context provides methods to the resilience package.
|
||||
type Context interface {
|
||||
// GetSpec returns the current specification of the deployment
|
||||
GetSpec() api.DeploymentSpec
|
||||
// GetStatus returns the current status of the deployment
|
||||
GetStatus() api.DeploymentStatus
|
||||
// UpdateStatus replaces the status of the deployment with the given status and
|
||||
// updates the resources in k8s.
|
||||
UpdateStatus(status api.DeploymentStatus, force ...bool) error
|
||||
}
|
29
pkg/deployment/resilience/errors.go
Normal file
29
pkg/deployment/resilience/errors.go
Normal file
|
@ -0,0 +1,29 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package resilience
|
||||
|
||||
import "github.com/pkg/errors"
|
||||
|
||||
var (
|
||||
maskAny = errors.WithStack
|
||||
)
|
98
pkg/deployment/resilience/member_failure.go
Normal file
98
pkg/deployment/resilience/member_failure.go
Normal file
|
@ -0,0 +1,98 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package resilience
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
|
||||
)
|
||||
|
||||
const (
	// recentTerminationsSinceGracePeriod is how far back in time terminations
	// of a member are counted when checking for member failure.
	recentTerminationsSinceGracePeriod = 10 * time.Minute
	// recentTerminationThreshold is the number of terminations within the
	// grace period at (or above) which a member is considered for failure.
	recentTerminationThreshold = 5
)
|
||||
|
||||
// CheckMemberFailure performs a check for members that should be in failed state because:
|
||||
// - They are frequently restarted
|
||||
// - They cannot be scheduled for a long time (TODO)
|
||||
func (r *Resilience) CheckMemberFailure() error {
|
||||
status := r.context.GetStatus()
|
||||
updateStatusNeeded := false
|
||||
if err := status.Members.ForeachServerGroup(func(group api.ServerGroup, list *api.MemberStatusList) error {
|
||||
for _, m := range *list {
|
||||
log := r.log.With().
|
||||
Str("id", m.ID).
|
||||
Str("role", group.AsRole()).
|
||||
Logger()
|
||||
// Check current state
|
||||
if m.Phase != api.MemberPhaseCreated {
|
||||
continue
|
||||
}
|
||||
// Check if pod is ready
|
||||
if m.Conditions.IsTrue(api.ConditionTypeReady) {
|
||||
continue
|
||||
}
|
||||
// Check recent terminations
|
||||
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
|
||||
if count >= recentTerminationThreshold {
|
||||
// Member has terminated too often in recent history.
|
||||
failureAcceptable, reason, err := r.isMemberFailureAcceptable(status, group, m)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to check is member failure is acceptable")
|
||||
} else if failureAcceptable {
|
||||
log.Info().Msg("Member has terminated too often in recent history, marking is failed")
|
||||
m.Phase = api.MemberPhaseFailed
|
||||
list.Update(m)
|
||||
updateStatusNeeded = true
|
||||
} else {
|
||||
log.Warn().Msgf("Member has terminated too often in recent history, but it is not safe to mark it a failed because: %s", reason)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}); err != nil {
|
||||
return maskAny(err)
|
||||
}
|
||||
if updateStatusNeeded {
|
||||
if err := r.context.UpdateStatus(status); err != nil {
|
||||
return maskAny(err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member
|
||||
// to failed, which means that it will be replaced.
|
||||
// Return: failureAcceptable, notAcceptableReason, error
|
||||
func (r *Resilience) isMemberFailureAcceptable(status api.DeploymentStatus, group api.ServerGroup, m api.MemberStatus) (bool, string, error) {
|
||||
switch group {
|
||||
case api.ServerGroupCoordinators:
|
||||
return true, "", nil
|
||||
default:
|
||||
// TODO
|
||||
return false, "TODO", nil
|
||||
}
|
||||
}
|
40
pkg/deployment/resilience/resilience.go
Normal file
40
pkg/deployment/resilience/resilience.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Author Ewout Prangsma
|
||||
//
|
||||
|
||||
package resilience
|
||||
|
||||
import "github.com/rs/zerolog"
|
||||
|
||||
// Resilience is the service that inspects the overall state of the deployment
|
||||
// to improve resilience.
|
||||
type Resilience struct {
|
||||
log zerolog.Logger
|
||||
context Context
|
||||
}
|
||||
|
||||
// NewResilience creates a new resilience with given context.
|
||||
func NewResilience(log zerolog.Logger, context Context) *Resilience {
|
||||
return &Resilience{
|
||||
log: log,
|
||||
context: context,
|
||||
}
|
||||
}
|
|
@ -317,7 +317,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
|
|||
roleAbbr := group.AsRoleAbbreviated()
|
||||
podSuffix := createPodSuffix(spec)
|
||||
m.PodName = k8sutil.CreatePodName(apiObject.GetName(), roleAbbr, m.ID, podSuffix)
|
||||
newState := api.MemberStateCreated
|
||||
newPhase := api.MemberPhaseCreated
|
||||
// Create pod
|
||||
if group.IsArangod() {
|
||||
// Find image ID
|
||||
|
@ -329,7 +329,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
|
|||
// Prepare arguments
|
||||
autoUpgrade := m.Conditions.IsTrue(api.ConditionTypeAutoUpgrade)
|
||||
if autoUpgrade {
|
||||
newState = api.MemberStateUpgrading
|
||||
newPhase = api.MemberPhaseUpgrading
|
||||
}
|
||||
args := createArangodArgs(apiObject, spec, group, status.Members.Agents, m.ID, autoUpgrade)
|
||||
env := make(map[string]k8sutil.EnvValue)
|
||||
|
@ -393,8 +393,8 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
|
|||
}
|
||||
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
|
||||
}
|
||||
// Record new member state
|
||||
m.State = newState
|
||||
// Record new member phase
|
||||
m.Phase = newPhase
|
||||
m.Conditions.Remove(api.ConditionTypeReady)
|
||||
m.Conditions.Remove(api.ConditionTypeTerminated)
|
||||
m.Conditions.Remove(api.ConditionTypeAutoUpgrade)
|
||||
|
@ -416,7 +416,7 @@ func (r *Resources) EnsurePods() error {
|
|||
status := r.context.GetStatus()
|
||||
if err := iterator.ForeachServerGroup(func(group api.ServerGroup, groupSpec api.ServerGroupSpec, status *api.MemberStatusList) error {
|
||||
for _, m := range *status {
|
||||
if m.State != api.MemberStateNone {
|
||||
if m.Phase != api.MemberPhaseNone {
|
||||
continue
|
||||
}
|
||||
spec := r.context.GetSpec()
|
||||
|
|
|
@ -137,10 +137,10 @@ func (r *Resources) InspectPods() error {
|
|||
for _, m := range *members {
|
||||
if podName := m.PodName; podName != "" {
|
||||
if !podExists(podName) {
|
||||
switch m.State {
|
||||
case api.MemberStateNone:
|
||||
switch m.Phase {
|
||||
case api.MemberPhaseNone:
|
||||
// Do nothing
|
||||
case api.MemberStateShuttingDown, api.MemberStateRotating, api.MemberStateUpgrading:
|
||||
case api.MemberPhaseShuttingDown, api.MemberPhaseRotating, api.MemberPhaseUpgrading, api.MemberPhaseFailed:
|
||||
// Shutdown was intended, so there is no need to do anything here.
|
||||
// Just mark terminated
|
||||
if m.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Terminated", "") {
|
||||
|
@ -150,7 +150,7 @@ func (r *Resources) InspectPods() error {
|
|||
}
|
||||
default:
|
||||
log.Debug().Str("pod-name", podName).Msg("Pod is gone")
|
||||
m.State = api.MemberStateNone // This is trigger a recreate of the pod.
|
||||
m.Phase = api.MemberPhaseNone // This is trigger a recreate of the pod.
|
||||
// Create event
|
||||
events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject))
|
||||
if m.Conditions.Update(api.ConditionTypeReady, false, "Pod Does Not Exist", "") {
|
||||
|
|
Loading…
Reference in a new issue