mirror of https://github.com/arangodb/kube-arangodb.git
[Improvement] GT-248 restart non-scheduled pod (#1376)
This commit is contained in: parent 2d7ee7ad50, commit d697216e08
8 changed files with 133 additions and 8 deletions
@@ -29,6 +29,7 @@
 - (Maintenance) Bump K8S Version to 1.24.16
 - (Maintenance) Bump K8S Version to 1.25.12
 - (Maintenance) Bump Go to 1.20.7
+- (Improvement) Remove PodSchedulingFailure condition instead of setting to false, restart pod if it could not be scheduled

 ## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
 - (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode
@@ -36,7 +36,7 @@
 | JWTRefresh | no | 10m0s | no | Enterprise Only | Refresh current JWT secrets on the member |
 | JWTSetActive | no | 10m0s | no | Enterprise Only | Change active JWT key on the cluster |
 | JWTStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of JWT propagation |
-| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod 9put pod in Terminating state) |
+| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod (put pod in Terminating state) |
 | LicenseSet | no | 10m0s | no | Community & Enterprise | Update Cluster license (3.9+) |
 | MarkToRemoveMember | no | 10m0s | no | Community & Enterprise | Marks member to be removed. Used when member Pod is annotated with replace annotation |
 | MemberPhaseUpdate | no | 10m0s | no | Community & Enterprise | Change member phase |
@@ -30,7 +30,10 @@ actions:
     timeout: 30m
     optional: true
   KillMemberPod:
-    description: Execute Delete on Pod 9put pod in Terminating state)
+    description: Execute Delete on Pod (put pod in Terminating state)
+    scopes:
+      - Normal
+      - High
   RotateMember:
     description: Waits for Pod restart and recreation
     timeout: 15m
@@ -374,7 +374,7 @@ const (
 	// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
 	ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"

-	// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
+	// ActionTypeKillMemberPod in scopes High and Normal. Execute Delete on Pod (put pod in Terminating state)
 	ActionTypeKillMemberPod ActionType = "KillMemberPod"

 	// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
@@ -776,7 +776,7 @@ func (a ActionType) Priority() ActionPriority {
 	case ActionTypeJWTStatusUpdate:
 		return ActionPriorityNormal
 	case ActionTypeKillMemberPod:
-		return ActionPriorityNormal
+		return ActionPriorityHigh
 	case ActionTypeLicenseSet:
 		return ActionPriorityNormal
 	case ActionTypeMarkToRemoveMember:
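An action's priority determines which reconciliation plan it lands in, and high-plan actions run ahead of normal-plan work, which is what lets the operator restart a stuck pod promptly. A minimal, illustrative model of that split (the types below are simplified stand-ins, not the operator's real API):

```go
package main

import "fmt"

type ActionPriority int

const (
	ActionPriorityNormal ActionPriority = iota
	ActionPriorityHigh
)

// Action is a pared-down stand-in for the operator's plan action.
type Action struct {
	Type     string
	Priority ActionPriority
}

// splitPlan sorts actions into high and normal plans; high-plan actions
// are assumed to execute before any normal-plan work.
func splitPlan(actions []Action) (high, normal []Action) {
	for _, a := range actions {
		if a.Priority == ActionPriorityHigh {
			high = append(high, a)
		} else {
			normal = append(normal, a)
		}
	}
	return high, normal
}

func main() {
	high, normal := splitPlan([]Action{
		{Type: "KillMemberPod", Priority: ActionPriorityHigh}, // was Normal before this commit
		{Type: "LicenseSet", Priority: ActionPriorityNormal},
	})
	fmt.Println(len(high), len(normal)) // 1 1
}
```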
@@ -374,7 +374,7 @@ const (
 	// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
 	ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"

-	// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
+	// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod (put pod in Terminating state)
 	ActionTypeKillMemberPod ActionType = "KillMemberPod"

 	// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
@@ -54,6 +54,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
 		ApplyIfEmpty(r.updateMemberUpdateConditionsPlan).
 		ApplyIfEmpty(r.updateMemberRotationConditionsPlan).
 		ApplyIfEmpty(r.createMemberRecreationConditionsPlan).
+		ApplyIfEmpty(r.createMemberPodSchedulingFailurePlan).
 		ApplyIfEmpty(r.createRotateServerStoragePVCPendingResizeConditionPlan).
 		ApplyIfEmpty(r.createChangeMemberArchPlan).
 		ApplyIfEmpty(r.createRotateServerStorageResizePlanRuntime).
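The builder chain above reads naturally if `ApplyIfEmpty` runs each builder only while the plan is still empty, so the first builder to produce actions wins. A self-contained sketch of that pattern under that assumption (`Plan` and `PlanBuilder` here are simplified stand-ins, not the operator's actual types):

```go
package main

import "fmt"

// Plan is a stand-in for api.Plan: an ordered list of action names.
type Plan []string

// PlanBuilder produces a (possibly empty) plan.
type PlanBuilder func() Plan

// ApplyIfEmpty runs the builder only if no earlier builder produced actions.
func (p Plan) ApplyIfEmpty(b PlanBuilder) Plan {
	if len(p) > 0 {
		return p
	}
	return b()
}

func main() {
	plan := Plan{}.
		ApplyIfEmpty(func() Plan { return nil }).                   // e.g. recreation conditions: nothing to do
		ApplyIfEmpty(func() Plan { return Plan{"KillMemberPod"} }). // scheduling-failure builder fires
		ApplyIfEmpty(func() Plan { return Plan{"RotateMember"} })   // short-circuited, never consulted
	fmt.Println(plan) // [KillMemberPod]
}
```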
@@ -0,0 +1,122 @@
+//
+// DISCLAIMER
+//
+// Copyright 2023 ArangoDB GmbH, Cologne, Germany
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright holder is ArangoDB GmbH, Cologne, Germany
+//
+
+package reconcile
+
+import (
+	"context"
+	"reflect"
+
+	core "k8s.io/api/core/v1"
+
+	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
+	"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
+	"github.com/arangodb/kube-arangodb/pkg/util"
+	"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
+)
+
+// createMemberPodSchedulingFailurePlan creates plan actions which are required when
+// some pod has failed to schedule and scheduling parameters already changed
+func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
+	_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
+
+	var p api.Plan
+	if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
+		return p
+	}
+
+	for _, m := range status.Members.AsList() {
+		l := r.log.Str("id", m.Member.ID).Str("role", m.Group.AsRole())
+
+		if m.Member.Phase != api.MemberPhaseCreated || m.Member.Pod.GetName() == "" {
+			// Act only when phase is created
+			continue
+		}
+
+		if m.Member.Conditions.IsTrue(api.ConditionTypeScheduled) || m.Member.Conditions.IsTrue(api.ConditionTypeTerminating) {
+			// Action is needed only for pods which are not scheduled yet
+			continue
+		}
+
+		imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
+		if !imageFound {
+			l.Warn("could not find image for already created member")
+			continue
+		}
+
+		renderedPod, err := context.RenderPodForMember(ctx, context.ACS(), spec, status, m.Member.ID, imageInfo)
+		if err != nil {
+			l.Err(err).Warn("could not render pod for already created member")
+			continue
+		}
+
+		if r.isSchedulingParametersChanged(renderedPod.Spec, m.Member, context) {
+			l.Info("Adding KillMemberPod action: scheduling failed and parameters already updated")
+			p = append(p,
+				actions.NewAction(api.ActionTypeKillMemberPod, m.Group, m.Member, "Scheduling failed"),
+			)
+		}
+	}
+
+	return p
+}
+
+// isSchedulingParametersChanged returns true if parameters related to pod scheduling has changed
+func (r *Reconciler) isSchedulingParametersChanged(expectedSpec core.PodSpec, member api.MemberStatus, context PlanBuilderContext) bool {
+	cache, ok := context.ACS().ClusterCache(member.ClusterID)
+	if !ok {
+		return false
+	}
+	pod, ok := cache.Pod().V1().GetSimple(member.Pod.GetName())
+	if !ok {
+		return false
+	}
+	if r.schedulingParametersAreTheSame(expectedSpec, pod.Spec) {
+		return false
+	}
+	return true
+}
+
+func (r *Reconciler) schedulingParametersAreTheSame(expectedSpec, actualSpec core.PodSpec) bool {
+	if expectedSpec.PriorityClassName != actualSpec.PriorityClassName {
+		return false
+	}
+
+	if !reflect.DeepEqual(expectedSpec.Tolerations, actualSpec.Tolerations) {
+		return false
+	}
+
+	if !reflect.DeepEqual(expectedSpec.NodeSelector, actualSpec.NodeSelector) {
+		return false
+	}
+
+	// we should use SHA256 here because DeepEqual might be unreliable for Affinity rules
+	if specC, err := util.SHA256FromJSON(expectedSpec.Affinity); err != nil {
+		return true
+	} else {
+		if statusC, err := util.SHA256FromJSON(actualSpec.Affinity); err != nil {
+			return true
+		} else if specC != statusC {
+			return false
+		}
+	}
+
+	return true
+}
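Note the comment in `schedulingParametersAreTheSame`: affinity is compared via a SHA-256 over its JSON encoding because `reflect.DeepEqual` can be unreliable for affinity trees. A standalone sketch of the same comparison, with a local `sha256FromJSON` helper standing in for `util.SHA256FromJSON` (assumed here to hash the JSON encoding of its argument):

```go
package main

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"reflect"

	core "k8s.io/api/core/v1"
)

// sha256FromJSON hashes the JSON encoding of a value (a local stand-in for
// what util.SHA256FromJSON is assumed to do).
func sha256FromJSON(v interface{}) (string, error) {
	data, err := json.Marshal(v)
	if err != nil {
		return "", err
	}
	return fmt.Sprintf("%x", sha256.Sum256(data)), nil
}

// schedulingParametersEqual compares only the PodSpec fields that influence
// scheduling, mirroring the checks in schedulingParametersAreTheSame above.
func schedulingParametersEqual(expected, actual core.PodSpec) bool {
	if expected.PriorityClassName != actual.PriorityClassName {
		return false
	}
	if !reflect.DeepEqual(expected.Tolerations, actual.Tolerations) {
		return false
	}
	if !reflect.DeepEqual(expected.NodeSelector, actual.NodeSelector) {
		return false
	}
	// Hash the affinity trees instead of DeepEqual-ing them.
	expHash, err := sha256FromJSON(expected.Affinity)
	if err != nil {
		return true // on hashing errors, treat as unchanged (as the original does)
	}
	actHash, err := sha256FromJSON(actual.Affinity)
	if err != nil {
		return true
	}
	return expHash == actHash
}

func main() {
	a := core.PodSpec{NodeSelector: map[string]string{"disktype": "ssd"}}
	b := core.PodSpec{NodeSelector: map[string]string{"disktype": "hdd"}}
	fmt.Println(schedulingParametersEqual(a, a)) // true
	fmt.Println(schedulingParametersEqual(a, b)) // false
}
```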
@@ -527,9 +527,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
 		}
 	} else if status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) &&
 		len(unscheduledPodNames) == 0 {
-		if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, false,
-			"Pods Scheduling Resolved",
-			"No pod reports a scheduling timeout") {
+		if status.Conditions.Remove(api.ConditionTypePodSchedulingFailure) {
 			r.context.CreateEvent(k8sutil.NewPodsSchedulingResolvedEvent(r.context.GetAPIObject()))
 		}
 	}
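The behavioral shift in this hunk: `Update(..., false, ...)` kept a `PodSchedulingFailure` entry with status false in the condition list, whereas `Remove` drops the entry entirely; readers that treat absence as false see the same answer, but no stale entry lingers. A toy condition list illustrating both behaviors (stand-in types, not the operator's condition API):

```go
package main

import "fmt"

// Condition is a pared-down stand-in for the operator's status condition.
type Condition struct {
	Type   string
	Status bool
	Reason string
}

type ConditionList []Condition

// IsTrue reports whether the condition exists and is true; absence reads as false.
func (l ConditionList) IsTrue(t string) bool {
	for _, c := range l {
		if c.Type == t {
			return c.Status
		}
	}
	return false
}

// Update sets (or adds) the condition, keeping the entry in the list.
func (l *ConditionList) Update(t string, status bool, reason string) bool {
	for i := range *l {
		if (*l)[i].Type == t {
			if (*l)[i].Status == status && (*l)[i].Reason == reason {
				return false // nothing changed
			}
			(*l)[i].Status = status
			(*l)[i].Reason = reason
			return true
		}
	}
	*l = append(*l, Condition{Type: t, Status: status, Reason: reason})
	return true
}

// Remove drops the entry entirely; returns true if something was removed.
func (l *ConditionList) Remove(t string) bool {
	for i := range *l {
		if (*l)[i].Type == t {
			*l = append((*l)[:i], (*l)[i+1:]...)
			return true
		}
	}
	return false
}

func main() {
	conds := ConditionList{{Type: "PodSchedulingFailure", Status: true, Reason: "Timeout"}}

	// Old behaviour: entry stays in the list with Status=false.
	old := append(ConditionList{}, conds...)
	old.Update("PodSchedulingFailure", false, "Pods Scheduling Resolved")
	fmt.Println(len(old), old.IsTrue("PodSchedulingFailure")) // 1 false

	// New behaviour: entry is removed outright, and the event fires on removal.
	if conds.Remove("PodSchedulingFailure") {
		fmt.Println("condition removed, event emitted")
	}
	fmt.Println(len(conds), conds.IsTrue("PodSchedulingFailure")) // 0 false
}
```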