1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Improvement] GT-248 restart non-scheduled pod (#1376)

This commit is contained in:
Nikita Vaniasin 2023-09-15 10:19:46 +02:00 committed by GitHub
parent 2d7ee7ad50
commit d697216e08
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 133 additions and 8 deletions

View file

@ -29,6 +29,7 @@
- (Maintenance) Bump K8S Version to 1.24.16
- (Maintenance) Bump K8S Version to 1.25.12
- (Maintenance) Bump Go to 1.20.7
- (Improvement) Remove PodSchedulingFailure condition instead of setting to false, restart pod if it could not be scheduled
## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode

View file

@ -36,7 +36,7 @@
| JWTRefresh | no | 10m0s | no | Enterprise Only | Refresh current JWT secrets on the member |
| JWTSetActive | no | 10m0s | no | Enterprise Only | Change active JWT key on the cluster |
| JWTStatusUpdate | no | 10m0s | no | Enterprise Only | Update status of JWT propagation |
| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod 9put pod in Terminating state) |
| KillMemberPod | no | 10m0s | no | Community & Enterprise | Execute Delete on Pod (put pod in Terminating state) |
| LicenseSet | no | 10m0s | no | Community & Enterprise | Update Cluster license (3.9+) |
| MarkToRemoveMember | no | 10m0s | no | Community & Enterprise | Marks member to be removed. Used when member Pod is annotated with replace annotation |
| MemberPhaseUpdate | no | 10m0s | no | Community & Enterprise | Change member phase |

View file

@ -30,7 +30,10 @@ actions:
timeout: 30m
optional: true
KillMemberPod:
description: Execute Delete on Pod 9put pod in Terminating state)
description: Execute Delete on Pod (put pod in Terminating state)
scopes:
- Normal
- High
RotateMember:
description: Waits for Pod restart and recreation
timeout: 15m

View file

@ -374,7 +374,7 @@ const (
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
// ActionTypeKillMemberPod in scopes High and Normal. Execute Delete on Pod (put pod in Terminating state)
ActionTypeKillMemberPod ActionType = "KillMemberPod"
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)
@ -776,7 +776,7 @@ func (a ActionType) Priority() ActionPriority {
case ActionTypeJWTStatusUpdate:
return ActionPriorityNormal
case ActionTypeKillMemberPod:
return ActionPriorityNormal
return ActionPriorityHigh
case ActionTypeLicenseSet:
return ActionPriorityNormal
case ActionTypeMarkToRemoveMember:

View file

@ -374,7 +374,7 @@ const (
// ActionTypeJWTStatusUpdate in scopes Normal. Update status of JWT propagation
ActionTypeJWTStatusUpdate ActionType = "JWTStatusUpdate"
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod 9put pod in Terminating state)
// ActionTypeKillMemberPod in scopes Normal. Execute Delete on Pod (put pod in Terminating state)
ActionTypeKillMemberPod ActionType = "KillMemberPod"
// ActionTypeLicenseSet in scopes Normal. Update Cluster license (3.9+)

View file

@ -54,6 +54,7 @@ func (r *Reconciler) createHighPlan(ctx context.Context, apiObject k8sutil.APIOb
ApplyIfEmpty(r.updateMemberUpdateConditionsPlan).
ApplyIfEmpty(r.updateMemberRotationConditionsPlan).
ApplyIfEmpty(r.createMemberRecreationConditionsPlan).
ApplyIfEmpty(r.createMemberPodSchedulingFailurePlan).
ApplyIfEmpty(r.createRotateServerStoragePVCPendingResizeConditionPlan).
ApplyIfEmpty(r.createChangeMemberArchPlan).
ApplyIfEmpty(r.createRotateServerStorageResizePlanRuntime).

View file

@ -0,0 +1,122 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
"context"
"reflect"
core "k8s.io/api/core/v1"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)
// createMemberPodSchedulingFailurePlan creates plan actions which are required when
// some pod has failed to schedule and scheduling parameters already changed
func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {
var p api.Plan
if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
return p
}
for _, m := range status.Members.AsList() {
l := r.log.Str("id", m.Member.ID).Str("role", m.Group.AsRole())
if m.Member.Phase != api.MemberPhaseCreated || m.Member.Pod.GetName() == "" {
// Act only when phase is created
continue
}
if m.Member.Conditions.IsTrue(api.ConditionTypeScheduled) || m.Member.Conditions.IsTrue(api.ConditionTypeTerminating) {
// Action is needed only for pods which are not scheduled yet
continue
}
imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
if !imageFound {
l.Warn("could not find image for already created member")
continue
}
renderedPod, err := context.RenderPodForMember(ctx, context.ACS(), spec, status, m.Member.ID, imageInfo)
if err != nil {
l.Err(err).Warn("could not render pod for already created member")
continue
}
if r.isSchedulingParametersChanged(renderedPod.Spec, m.Member, context) {
l.Info("Adding KillMemberPod action: scheduling failed and parameters already updated")
p = append(p,
actions.NewAction(api.ActionTypeKillMemberPod, m.Group, m.Member, "Scheduling failed"),
)
}
}
return p
}
// isSchedulingParametersChanged returns true if parameters related to pod scheduling has changed
func (r *Reconciler) isSchedulingParametersChanged(expectedSpec core.PodSpec, member api.MemberStatus, context PlanBuilderContext) bool {
cache, ok := context.ACS().ClusterCache(member.ClusterID)
if !ok {
return false
}
pod, ok := cache.Pod().V1().GetSimple(member.Pod.GetName())
if !ok {
return false
}
if r.schedulingParametersAreTheSame(expectedSpec, pod.Spec) {
return false
}
return true
}
func (r *Reconciler) schedulingParametersAreTheSame(expectedSpec, actualSpec core.PodSpec) bool {
if expectedSpec.PriorityClassName != actualSpec.PriorityClassName {
return false
}
if !reflect.DeepEqual(expectedSpec.Tolerations, actualSpec.Tolerations) {
return false
}
if !reflect.DeepEqual(expectedSpec.NodeSelector, actualSpec.NodeSelector) {
return false
}
// we should use SHA256 here because DeepEqual might be unreliable for Affinity rules
if specC, err := util.SHA256FromJSON(expectedSpec.Affinity); err != nil {
return true
} else {
if statusC, err := util.SHA256FromJSON(actualSpec.Affinity); err != nil {
return true
} else if specC != statusC {
return false
}
}
return true
}

View file

@ -527,9 +527,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
}
} else if status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) &&
len(unscheduledPodNames) == 0 {
if status.Conditions.Update(api.ConditionTypePodSchedulingFailure, false,
"Pods Scheduling Resolved",
"No pod reports a scheduling timeout") {
if status.Conditions.Remove(api.ConditionTypePodSchedulingFailure) {
r.context.CreateEvent(k8sutil.NewPodsSchedulingResolvedEvent(r.context.GetAPIObject()))
}
}