
Detect member failure and replace member if needed

Ewout Prangsma 2018-03-29 11:56:57 +02:00
parent fbc0d1b84c
commit 0fdb0c0c35
18 changed files with 342 additions and 112 deletions

@ -0,0 +1,43 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package v1alpha
// MemberPhase is a strongly typed lifetime phase of a deployment member
type MemberPhase string
const (
// MemberPhaseNone indicates that the state is not set yet
MemberPhaseNone MemberPhase = ""
// MemberPhaseCreated indicates that all resources needed for the member have been created
MemberPhaseCreated MemberPhase = "Created"
// MemberPhaseFailed indicates that the member is gone beyond hope of recovery. It must be replaced with a new member.
MemberPhaseFailed MemberPhase = "Failed"
// MemberPhaseCleanOut indicates that a dbserver is in the process of being cleaned out
MemberPhaseCleanOut MemberPhase = "CleanOut"
// MemberPhaseShuttingDown indicates that a member is shutting down
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
// MemberPhaseRotating indicates that a member is being rotated
MemberPhaseRotating MemberPhase = "Rotating"
// MemberPhaseUpgrading indicates that a member is in the process of upgrading its database data format
MemberPhaseUpgrading MemberPhase = "Upgrading"
)
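
For context, a hedged illustration (not part of this commit) of how these phase values are typically consumed: callers compare a member's Phase against the constants above. The helper name below is hypothetical.

// memberNeedsReplacement is a hypothetical helper, not part of this commit:
// per the comment on MemberPhaseFailed, a failed member is beyond recovery
// and must be replaced rather than restarted.
func memberNeedsReplacement(m MemberStatus) bool {
	return m.Phase == MemberPhaseFailed
}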

@ -1,41 +0,0 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package v1alpha
// MemberState is a strongly typed state of a deployment member
type MemberState string
const (
// MemberStateNone indicates that the state is not set yet
MemberStateNone MemberState = ""
// MemberStateCreated indicates that all resources needed for the member have been created
MemberStateCreated MemberState = "Created"
// MemberStateCleanOut indicates that a dbserver is in the process of being cleaned out
MemberStateCleanOut MemberState = "CleanOut"
// MemberStateShuttingDown indicates that a member is shutting down
MemberStateShuttingDown MemberState = "ShuttingDown"
// MemberStateRotating indicates that a member is being rotated
MemberStateRotating MemberState = "Rotating"
// MemberStateUpgrading indicates that a member is in the process of upgrading its database data format
MemberStateUpgrading MemberState = "Upgrading"
)

@ -33,8 +33,8 @@ type MemberStatus struct {
// ID holds the unique ID of the member.
// This id is also used within the ArangoDB cluster to identify this server.
ID string `json:"id"`
// State holds the current state of this member
State MemberState `json:"state"`
// Phase holds the current lifetime phase of this member
Phase MemberPhase `json:"phase"`
// PersistentVolumeClaimName holds the name of the persistent volume claim used for this member (if any).
PersistentVolumeClaimName string `json:"persistentVolumeClaimName,omitempty"`
// PodName holds the name of the Pod that currently runs this member

@ -108,7 +108,7 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
if len(l) > 0 {
// Try to find a not ready member
for _, m := range l {
if m.State == MemberStateNone {
if m.Phase == MemberPhaseNone {
return m, nil
}
}
@ -116,7 +116,7 @@ func (l MemberStatusList) SelectMemberToRemove() (MemberStatus, error) {
perm := rand.Perm(len(l))
for _, idx := range perm {
m := l[idx]
if m.State == MemberStateCreated {
if m.Phase == MemberPhaseCreated {
return m, nil
}
}

@ -36,6 +36,7 @@ import (
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/deployment/reconcile"
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
@ -92,6 +93,7 @@ type Deployment struct {
recentInspectionErrors int
clusterScalingIntegration *clusterScalingIntegration
reconciler *reconcile.Reconciler
resilience *resilience.Resilience
resources *resources.Resources
}
@ -111,6 +113,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
clientCache: newClientCache(deps.KubeCli, apiObject),
}
d.reconciler = reconcile.NewReconciler(deps.Log, d)
d.resilience = resilience.NewResilience(deps.Log, d)
d.resources = resources.NewResources(deps.Log, d)
if d.status.AcceptedSpec == nil {
// We've validated the spec, so let's use it from now.

@ -76,6 +76,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
}
// Check members for resilience
if err := d.resilience.CheckMemberFailure(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Member failure detection failed", err, d.apiObject))
}
// Create scale/update plan
if err := d.reconciler.CreatePlan(); err != nil {
hasError = true

@ -81,7 +81,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding single server")
if err := d.status.Members.Single.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
PodName: "",
}); err != nil {
@ -91,7 +91,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding agent")
if err := d.status.Members.Agents.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
PodName: "",
}); err != nil {
@ -101,7 +101,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding dbserver")
if err := d.status.Members.DBServers.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: k8sutil.CreatePersistentVolumeClaimName(deploymentName, role, id),
PodName: "",
}); err != nil {
@ -111,7 +111,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding coordinator")
if err := d.status.Members.Coordinators.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: "",
PodName: "",
}); err != nil {
@ -121,7 +121,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding syncmaster")
if err := d.status.Members.SyncMasters.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: "",
PodName: "",
}); err != nil {
@ -131,7 +131,7 @@ func (d *Deployment) createMember(group api.ServerGroup, apiObject *api.ArangoDe
log.Debug().Str("id", id).Msg("Adding syncworker")
if err := d.status.Members.SyncWorkers.Add(api.MemberStatus{
ID: id,
State: api.MemberStateNone,
Phase: api.MemberPhaseNone,
PersistentVolumeClaimName: "",
PodName: "",
}); err != nil {

@ -71,7 +71,7 @@ func (a *actionCleanoutMember) Start(ctx context.Context) (bool, error) {
return false, maskAny(err)
}
// Update status
m.State = api.MemberStateCleanOut
m.Phase = api.MemberPhaseCleanOut
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)
}

@ -83,7 +83,7 @@ func (a *actionRotateMember) Start(ctx context.Context) (bool, error) {
}
}
// Update status
m.State = api.MemberStateRotating
m.Phase = api.MemberPhaseRotating
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)
}
@ -109,7 +109,7 @@ func (a *actionRotateMember) CheckProgress(ctx context.Context) (bool, error) {
return false, maskAny(err)
}
// Pod is now gone, update the member status
m.State = api.MemberStateNone
m.Phase = api.MemberPhaseNone
m.RecentTerminations = nil // Since we're rotating, we do not care about old terminations.
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)

@ -89,7 +89,7 @@ func (a *actionShutdownMember) Start(ctx context.Context) (bool, error) {
}
}
// Update status
m.State = api.MemberStateShuttingDown
m.Phase = api.MemberPhaseShuttingDown
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)
}

@ -88,7 +88,7 @@ func (a *actionUpgradeMember) Start(ctx context.Context) (bool, error) {
}
}
// Update status
m.State = api.MemberStateRotating // We keep the rotation state here, since only when a new pod is created, it will get the Upgrading state.
m.Phase = api.MemberPhaseRotating // We keep the rotation phase here, since only when a new pod is created, it will get the Upgrading phase.
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)
}
@ -105,7 +105,7 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) {
log.Error().Msg("No such member")
return true, nil
}
isUpgrading := m.State == api.MemberStateUpgrading
isUpgrading := m.Phase == api.MemberPhaseUpgrading
log = log.With().
Str("pod-name", m.PodName).
Bool("is-upgrading", isUpgrading).Logger()
@ -119,7 +119,7 @@ func (a *actionUpgradeMember) CheckProgress(ctx context.Context) (bool, error) {
return false, maskAny(err)
}
// Pod is now gone, update the member status
m.State = api.MemberStateNone
m.Phase = api.MemberPhaseNone
m.RecentTerminations = nil // Since we're upgrading, we do not care about old terminations.
if err := a.actionCtx.UpdateMember(m); err != nil {
return false, maskAny(err)

@ -85,58 +85,72 @@ func createPlan(log zerolog.Logger, apiObject metav1.Object,
// Check for various scenarios
var plan api.Plan
// Check for scale up/down
switch spec.GetMode() {
case api.DeploymentModeSingle:
// Never scale down
case api.DeploymentModeResilientSingle:
// Only scale singles
plan = append(plan, createScalePlan(log, status.Members.Single, api.ServerGroupSingle, spec.Single.GetCount())...)
case api.DeploymentModeCluster:
// Scale dbservers, coordinators, syncmasters & syncworkers
plan = append(plan, createScalePlan(log, status.Members.DBServers, api.ServerGroupDBServers, spec.DBServers.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.Coordinators, api.ServerGroupCoordinators, spec.Coordinators.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.SyncMasters, api.ServerGroupSyncMasters, spec.SyncMasters.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.SyncWorkers, api.ServerGroupSyncWorkers, spec.SyncWorkers.GetCount())...)
}
// Check for the need to rotate one or more members
getPod := func(podName string) *v1.Pod {
for _, p := range pods {
if p.GetName() == podName {
return &p
}
}
return nil
}
// Check for members in failed state
status.Members.ForeachServerGroup(func(group api.ServerGroup, members *api.MemberStatusList) error {
for _, m := range *members {
if len(plan) > 0 {
// Only 1 change at a time
continue
}
if m.State != api.MemberStateCreated {
// Only rotate when state is created
continue
}
if podName := m.PodName; podName != "" {
if p := getPod(podName); p != nil {
// Got pod, compare it with what it should be
decision := podNeedsUpgrading(*p, spec, status.Images)
if decision.UpgradeNeeded && decision.UpgradeAllowed {
plan = append(plan, createUpgradeMemberPlan(log, m, group, "Version upgrade")...)
} else {
rotNeeded, reason := podNeedsRotation(*p, apiObject, spec, group, status.Members.Agents, m.ID)
if rotNeeded {
plan = append(plan, createRotateMemberPlan(log, m, group, reason)...)
}
}
}
if m.Phase == api.MemberPhaseFailed && len(plan) == 0 {
plan = append(plan, api.NewAction(api.ActionTypeRemoveMember, group, m.ID))
}
}
return nil
})
// Check for scale up/down
if len(plan) == 0 {
switch spec.GetMode() {
case api.DeploymentModeSingle:
// Never scale down
case api.DeploymentModeResilientSingle:
// Only scale singles
plan = append(plan, createScalePlan(log, status.Members.Single, api.ServerGroupSingle, spec.Single.GetCount())...)
case api.DeploymentModeCluster:
// Scale dbservers, coordinators, syncmasters & syncworkers
plan = append(plan, createScalePlan(log, status.Members.DBServers, api.ServerGroupDBServers, spec.DBServers.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.Coordinators, api.ServerGroupCoordinators, spec.Coordinators.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.SyncMasters, api.ServerGroupSyncMasters, spec.SyncMasters.GetCount())...)
plan = append(plan, createScalePlan(log, status.Members.SyncWorkers, api.ServerGroupSyncWorkers, spec.SyncWorkers.GetCount())...)
}
}
// Check for the need to rotate one or more members
if len(plan) == 0 {
getPod := func(podName string) *v1.Pod {
for _, p := range pods {
if p.GetName() == podName {
return &p
}
}
return nil
}
status.Members.ForeachServerGroup(func(group api.ServerGroup, members *api.MemberStatusList) error {
for _, m := range *members {
if len(plan) > 0 {
// Only 1 change at a time
continue
}
if m.Phase != api.MemberPhaseCreated {
// Only rotate when phase is created
continue
}
if podName := m.PodName; podName != "" {
if p := getPod(podName); p != nil {
// Got pod, compare it with what it should be
decision := podNeedsUpgrading(*p, spec, status.Images)
if decision.UpgradeNeeded && decision.UpgradeAllowed {
plan = append(plan, createUpgradeMemberPlan(log, m, group, "Version upgrade")...)
} else {
rotNeeded, reason := podNeedsRotation(*p, apiObject, spec, group, status.Members.Agents, m.ID)
if rotNeeded {
plan = append(plan, createRotateMemberPlan(log, m, group, reason)...)
}
}
}
}
}
return nil
})
}
// Return plan
return plan, true
}
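
The reworked createPlan applies at most one kind of change per reconciliation pass: replacing failed members takes priority over scaling, and scaling takes priority over rotation/upgrade. A simplified sketch of that ordering (hypothetical helper, not the actual implementation):

// nextChange is a hypothetical illustration of the plan priority introduced
// above; the real code builds an api.Plan instead of returning a string.
func nextChange(hasFailedMember, needsScaling, needsRotationOrUpgrade bool) string {
	switch {
	case hasFailedMember:
		return "remove failed member"
	case needsScaling:
		return "scale up/down"
	case needsRotationOrUpgrade:
		return "rotate or upgrade member"
	}
	return "nothing to do"
}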

@ -0,0 +1,38 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package resilience
import (
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
)
// Context provides methods to the resilience package.
type Context interface {
// GetSpec returns the current specification of the deployment
GetSpec() api.DeploymentSpec
// GetStatus returns the current status of the deployment
GetStatus() api.DeploymentStatus
// UpdateStatus replaces the status of the deployment with the given status and
// updates the resources in k8s.
UpdateStatus(status api.DeploymentStatus, force ...bool) error
}
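
A minimal sketch of a test double for this interface (hypothetical, not part of this commit), assuming the resilience checks only need these three methods:

// fakeContext is a hypothetical in-memory Context implementation for tests.
type fakeContext struct {
	spec   api.DeploymentSpec
	status api.DeploymentStatus
}

func (c *fakeContext) GetSpec() api.DeploymentSpec     { return c.spec }
func (c *fakeContext) GetStatus() api.DeploymentStatus { return c.status }
func (c *fakeContext) UpdateStatus(status api.DeploymentStatus, force ...bool) error {
	// Record the new status instead of writing it to Kubernetes.
	c.status = status
	return nil
}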

@ -0,0 +1,29 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package resilience
import "github.com/pkg/errors"
var (
maskAny = errors.WithStack
)

@ -0,0 +1,98 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package resilience
import (
"time"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
)
const (
recentTerminationsSinceGracePeriod = time.Minute * 10
recentTerminationThreshold = 5
)
// CheckMemberFailure performs a check for members that should be in failed state because:
// - They are frequently restarted
// - They cannot be scheduled for a long time (TODO)
func (r *Resilience) CheckMemberFailure() error {
status := r.context.GetStatus()
updateStatusNeeded := false
if err := status.Members.ForeachServerGroup(func(group api.ServerGroup, list *api.MemberStatusList) error {
for _, m := range *list {
log := r.log.With().
Str("id", m.ID).
Str("role", group.AsRole()).
Logger()
// Check current state
if m.Phase != api.MemberPhaseCreated {
continue
}
// Check if pod is ready
if m.Conditions.IsTrue(api.ConditionTypeReady) {
continue
}
// Check recent terminations
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
if count >= recentTerminationThreshold {
// Member has terminated too often in recent history.
failureAcceptable, reason, err := r.isMemberFailureAcceptable(status, group, m)
if err != nil {
log.Warn().Err(err).Msg("Failed to check if member failure is acceptable")
} else if failureAcceptable {
log.Info().Msg("Member has terminated too often in recent history, marking it failed")
m.Phase = api.MemberPhaseFailed
list.Update(m)
updateStatusNeeded = true
} else {
log.Warn().Msgf("Member has terminated too often in recent history, but it is not safe to mark it as failed because: %s", reason)
}
}
}
return nil
}); err != nil {
return maskAny(err)
}
if updateStatusNeeded {
if err := r.context.UpdateStatus(status); err != nil {
return maskAny(err)
}
}
return nil
}
// isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member
// to failed, which means that it will be replaced.
// Return: failureAcceptable, notAcceptableReason, error
func (r *Resilience) isMemberFailureAcceptable(status api.DeploymentStatus, group api.ServerGroup, m api.MemberStatus) (bool, string, error) {
switch group {
case api.ServerGroupCoordinators:
return true, "", nil
default:
// TODO
return false, "TODO", nil
}
}
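
A hypothetical test-style sketch (not part of this commit) of the current acceptance rule: only coordinator failures are considered safe to act on, while every other group keeps its member and a reason is logged instead. It assumes the standard "testing" import alongside the api and zerolog imports of this package.

// TestIsMemberFailureAcceptable is a hypothetical test in package resilience.
func TestIsMemberFailureAcceptable(t *testing.T) {
	r := &Resilience{log: zerolog.Nop()}
	ok, _, err := r.isMemberFailureAcceptable(api.DeploymentStatus{}, api.ServerGroupCoordinators, api.MemberStatus{})
	if err != nil || !ok {
		t.Error("expected coordinator failure to be acceptable")
	}
	ok, reason, _ := r.isMemberFailureAcceptable(api.DeploymentStatus{}, api.ServerGroupDBServers, api.MemberStatus{})
	if ok || reason == "" {
		t.Error("expected dbserver failure to be rejected with a reason")
	}
}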

@ -0,0 +1,40 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//
package resilience
import "github.com/rs/zerolog"
// Resilience is the service that inspects the overall state of the deployment
// to improve resilience.
type Resilience struct {
log zerolog.Logger
context Context
}
// NewResilience creates a new resilience with given context.
func NewResilience(log zerolog.Logger, context Context) *Resilience {
return &Resilience{
log: log,
context: context,
}
}

@ -317,7 +317,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
roleAbbr := group.AsRoleAbbreviated()
podSuffix := createPodSuffix(spec)
m.PodName = k8sutil.CreatePodName(apiObject.GetName(), roleAbbr, m.ID, podSuffix)
newState := api.MemberStateCreated
newPhase := api.MemberPhaseCreated
// Create pod
if group.IsArangod() {
// Find image ID
@ -329,7 +329,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
// Prepare arguments
autoUpgrade := m.Conditions.IsTrue(api.ConditionTypeAutoUpgrade)
if autoUpgrade {
newState = api.MemberStateUpgrading
newPhase = api.MemberPhaseUpgrading
}
args := createArangodArgs(apiObject, spec, group, status.Members.Agents, m.ID, autoUpgrade)
env := make(map[string]k8sutil.EnvValue)
@ -393,8 +393,8 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
}
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
}
// Record new member state
m.State = newState
// Record new member phase
m.Phase = newPhase
m.Conditions.Remove(api.ConditionTypeReady)
m.Conditions.Remove(api.ConditionTypeTerminated)
m.Conditions.Remove(api.ConditionTypeAutoUpgrade)
@ -416,7 +416,7 @@ func (r *Resources) EnsurePods() error {
status := r.context.GetStatus()
if err := iterator.ForeachServerGroup(func(group api.ServerGroup, groupSpec api.ServerGroupSpec, status *api.MemberStatusList) error {
for _, m := range *status {
if m.State != api.MemberStateNone {
if m.Phase != api.MemberPhaseNone {
continue
}
spec := r.context.GetSpec()

@ -137,10 +137,10 @@ func (r *Resources) InspectPods() error {
for _, m := range *members {
if podName := m.PodName; podName != "" {
if !podExists(podName) {
switch m.State {
case api.MemberStateNone:
switch m.Phase {
case api.MemberPhaseNone:
// Do nothing
case api.MemberStateShuttingDown, api.MemberStateRotating, api.MemberStateUpgrading:
case api.MemberPhaseShuttingDown, api.MemberPhaseRotating, api.MemberPhaseUpgrading, api.MemberPhaseFailed:
// Shutdown was intended, so no need to do anything here.
// Just mark terminated
if m.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Terminated", "") {
@ -150,7 +150,7 @@ func (r *Resources) InspectPods() error {
}
default:
log.Debug().Str("pod-name", podName).Msg("Pod is gone")
m.State = api.MemberStateNone // This will trigger a recreate of the pod.
m.Phase = api.MemberPhaseNone // This will trigger a recreate of the pod.
// Create event
events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject))
if m.Conditions.Update(api.ConditionTypeReady, false, "Pod Does Not Exist", "") {