mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
[Improvement] GT-248 restart non-scheduled pod (#1376)
This commit is contained in:
parent
2d7ee7ad50
commit
d697216e08
8 changed files with 133 additions and 8 deletions
|
@ -29,6 +29,7 @@
|
||||||
- (Maintenance) Bump K8S Version to 1.24.16
|
- (Maintenance) Bump K8S Version to 1.24.16
|
||||||
- (Maintenance) Bump K8S Version to 1.25.12
|
- (Maintenance) Bump K8S Version to 1.25.12
|
||||||
- (Maintenance) Bump Go to 1.20.7
|
- (Maintenance) Bump Go to 1.20.7
|
||||||
|
- (Improvement) Remove PodSchedulingFailure condition instead of setting to false, restart pod if it could not be scheduled
|
||||||
|
|
||||||
## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
|
## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
|
||||||
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode
|
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
| JWTRefresh | no | 10m0s | no | Enterprise Only | Refresh current JWT secrets on the member |
|
| JWTRefresh | no | 10m0s | no | Enterprise Only | Refresh current JWT secrets on the member |
|
||||||
| JWTSetActive | no | 10m0s | no | Enterprise Only | Change active JWT key on the cluster |
|
| JWTSetActive | no | 10m0s | no | Enterprise Only | Change active JWT key on the cluster |
|
||||||
| JWTStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of JWT propagation |
|
| JWTStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of JWT propagation |
|
||||||
| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod 9put pod in Terminating state) |
|
| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod (put pod in Terminating state) |
|
||||||
| LicenseSet | no | 10m0s | no | Community & Enterprise | Update Cluster license (3.9+) |
|
| LicenseSet | no | 10m0s | no | Community & Enterprise | Update Cluster license (3.9+) |
|
||||||
| MarkToRemoveMember | no | 10m0s | no | Community & Enterprise | Marks member to be removed. Used when member Pod is annotated with replace annotation |
|
| MarkToRemoveMember | no | 10m0s | no | Community & Enterprise | Marks member to be removed. Used when member Pod is annotated with replace annotation |
|
||||||
| MemberPhaseUpdate | no | 10m0s | no | Community & Enterprise | Change member phase |
|
| MemberPhaseUpdate | no | 10m0s | no | Community & Enterprise | Change member phase |
|
||||||
|
|
|
@ -30,7 +30,10 @@ actions:
|
||||||
timeout: 30m
|
timeout: 30m
|
||||||
optional: true
|
optional: true
|
||||||
KillMemberPod:
|
KillMemberPod:
|
||||||
description: Execute Delete on Pod 9put pod in Terminating state)
|
description: Execute Delete on Pod (put pod in Terminating state)
|
||||||
|
scopes:
|
||||||
|
- Normal
|
||||||
|
- High
|
||||||
RotateMember:
|
RotateMember:
|
||||||
description: Waits for Pod restart and recreation
|
description: Waits for Pod restart and recreation
|
||||||
timeout: 15m
|
timeout: 15m
|
||||||
|
|
|
@ -374,7 +374,7 @@ const (
|
||||||
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
|
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
|
||||||
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
|
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
|
||||||
|
|
||||||
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
|
// ActionTypeKillMemberPod in scopes High and Normal. Execute Delete on Pod (put pod in Terminating state)
|
||||||
ActionTypeKillMemberPod ActionType = "KillMemberPod"
|
ActionTypeKillMemberPod ActionType = "KillMemberPod"
|
||||||
|
|
||||||
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
|
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
|
||||||
|
@ -776,7 +776,7 @@ func (a ActionType) Priority() ActionPriority {
|
||||||
case ActionTypeJWTStatusUpdate:
|
case ActionTypeJWTStatusUpdate:
|
||||||
return ActionPriorityNormal
|
return ActionPriorityNormal
|
||||||
case ActionTypeKillMemberPod:
|
case ActionTypeKillMemberPod:
|
||||||
return ActionPriorityNormal
|
return ActionPriorityHigh
|
||||||
case ActionTypeLicenseSet:
|
case ActionTypeLicenseSet:
|
||||||
return ActionPriorityNormal
|
return ActionPriorityNormal
|
||||||
case ActionTypeMarkToRemoveMember:
|
case ActionTypeMarkToRemoveMember:
|
||||||
|
|
|
@ -374,7 +374,7 @@ const (
|
||||||
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
|
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
|
||||||
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
|
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
|
||||||
|
|
||||||
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
|
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod (put pod in Terminating state)
|
||||||
ActionTypeKillMemberPod ActionType = "KillMemberPod"
|
ActionTypeKillMemberPod ActionType = "KillMemberPod"
|
||||||
|
|
||||||
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
|
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
|
||||||
|
|
|
@ -54,6 +54,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
|
||||||
ApplyIfEmpty(r.updateMemberUpdateConditionsPlan).
|
ApplyIfEmpty(r.updateMemberUpdateConditionsPlan).
|
||||||
ApplyIfEmpty(r.updateMemberRotationConditionsPlan).
|
ApplyIfEmpty(r.updateMemberRotationConditionsPlan).
|
||||||
ApplyIfEmpty(r.createMemberRecreationConditionsPlan).
|
ApplyIfEmpty(r.createMemberRecreationConditionsPlan).
|
||||||
|
ApplyIfEmpty(r.createMemberPodSchedulingFailurePlan).
|
||||||
ApplyIfEmpty(r.createRotateServerStoragePVCPendingResizeConditionPlan).
|
ApplyIfEmpty(r.createRotateServerStoragePVCPendingResizeConditionPlan).
|
||||||
ApplyIfEmpty(r.createChangeMemberArchPlan).
|
ApplyIfEmpty(r.createChangeMemberArchPlan).
|
||||||
ApplyIfEmpty(r.createRotateServerStorageResizePlanRuntime).
|
ApplyIfEmpty(r.createRotateServerStorageResizePlanRuntime).
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
//
|
||||||
|
// DISCLAIMER
|
||||||
|
//
|
||||||
|
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
//
|
||||||
|
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||||
|
//
|
||||||
|
|
||||||
|
package reconcile
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
core "k8s.io/api/core/v1"
|
||||||
|
|
||||||
|
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||||
|
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
|
||||||
|
"github.com/arangodb/kube-arangodb/pkg/util"
|
||||||
|
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
|
||||||
|
)
|
||||||
|
|
||||||
|
// createMemberPodSchedulingFailurePlan creates plan actions which are required when
|
||||||
|
// some pod has failed to schedule and scheduling parameters already changed
|
||||||
|
func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
|
||||||
|
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
|
||||||
|
|
||||||
|
var p api.Plan
|
||||||
|
if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, m := range status.Members.AsList() {
|
||||||
|
l := r.log.Str("id", m.Member.ID).Str("role", m.Group.AsRole())
|
||||||
|
|
||||||
|
if m.Member.Phase != api.MemberPhaseCreated || m.Member.Pod.GetName() == "" {
|
||||||
|
// Act only when phase is created
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.Member.Conditions.IsTrue(api.ConditionTypeScheduled) || m.Member.Conditions.IsTrue(api.ConditionTypeTerminating) {
|
||||||
|
// Action is needed only for pods which are not scheduled yet
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
|
||||||
|
if !imageFound {
|
||||||
|
l.Warn("could not find image for already created member")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
renderedPod, err := context.RenderPodForMember(ctx, context.ACS(), spec, status, m.Member.ID, imageInfo)
|
||||||
|
if err != nil {
|
||||||
|
l.Err(err).Warn("could not render pod for already created member")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.isSchedulingParametersChanged(renderedPod.Spec, m.Member, context) {
|
||||||
|
l.Info("Adding KillMemberPod action: scheduling failed and parameters already updated")
|
||||||
|
p = append(p,
|
||||||
|
actions.NewAction(api.ActionTypeKillMemberPod, m.Group, m.Member, "Scheduling failed"),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return p
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSchedulingParametersChanged returns true if parameters related to pod scheduling has changed
|
||||||
|
func (r *Reconciler) isSchedulingParametersChanged(expectedSpec core.PodSpec, member api.MemberStatus, context PlanBuilderContext) bool {
|
||||||
|
cache, ok := context.ACS().ClusterCache(member.ClusterID)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
pod, ok := cache.Pod().V1().GetSimple(member.Pod.GetName())
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if r.schedulingParametersAreTheSame(expectedSpec, pod.Spec) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *Reconciler) schedulingParametersAreTheSame(expectedSpec, actualSpec core.PodSpec) bool {
|
||||||
|
if expectedSpec.PriorityClassName != actualSpec.PriorityClassName {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(expectedSpec.Tolerations, actualSpec.Tolerations) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(expectedSpec.NodeSelector, actualSpec.NodeSelector) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// we should use SHA256 here because DeepEqual might be unreliable for Affinity rules
|
||||||
|
if specC, err := util.SHA256FromJSON(expectedSpec.Affinity); err != nil {
|
||||||
|
return true
|
||||||
|
} else {
|
||||||
|
if statusC, err := util.SHA256FromJSON(actualSpec.Affinity); err != nil {
|
||||||
|
return true
|
||||||
|
} else if specC != statusC {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
|
}
|
|
@ -527,9 +527,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
|
||||||
}
|
}
|
||||||
} else if status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) &&
|
} else if status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) &&
|
||||||
len(unscheduledPodNames) == 0 {
|
len(unscheduledPodNames) == 0 {
|
||||||
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, false,
|
if status.Conditions.Remove(api.ConditionTypePodSchedulingFailure) {
|
||||||
"Pods Scheduling Resolved",
|
|
||||||
"No pod reports a scheduling timeout") {
|
|
||||||
r.context.CreateEvent(k8sutil.NewPodsSchedulingResolvedEvent(r.context.GetAPIObject()))
|
r.context.CreateEvent(k8sutil.NewPodsSchedulingResolvedEvent(r.context.GetAPIObject()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue