1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00
kube-arangodb/pkg/deployment/reconcile/helper_shutdown.go
2022-11-02 08:29:46 +01:00

300 lines
9.2 KiB
Go

//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package reconcile
import (
"context"
"time"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/client"
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/kerrors"
)
const (
// actionShutdownJobExpiredTermination is the plan-local key passed to BackoffExecution
// by shutdownHelperAPI.CheckProgress before it tries to delete expired jobs.
actionShutdownJobExpiredTermination api.PlanLocalKey = "expiredJobTerminationCheck"
// actionShutdownJobExpiredTerminationDelay is the delay passed to BackoffExecution together
// with the key above.
// NOTE(review): presumably the minimum interval between two expired-job cleanups - confirm against BackoffExecution.
actionShutdownJobExpiredTerminationDelay = 10 * time.Second
// ActionShutdownJobExpiredTerminationTimeout is the duration passed to DeleteExpiredJobs.
// NOTE(review): presumably the age after which a job counts as expired - confirm against the client implementation.
ActionShutdownJobExpiredTerminationTimeout = time.Minute
)
// getShutdownHelper selects the action used to shut down the pod of the member
// referenced by the given action.
//
// It returns the selected action, the member status it was selected for, and a
// flag. The flag is false (with a nil action and an empty status) when the member
// status cannot be found or the cache of the member's cluster is not available.
//
// There are 3 possibilities to shut down the pod: immediately (shutdownNow),
// through the API (shutdownHelperAPI), or with the standard Kubernetes delete API
// (shutdownHelperDelete). When the pod does not exist, or ifPodUIDMismatch reports
// a mismatch, a success action (which always succeeds) is returned.
func getShutdownHelper(a actionImpl) (ActionCore, api.MemberStatus, bool) {
	m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
	if !ok {
		// m is the zero value here, so its pod name carries no information.
		// Identify the member by the ID which was looked up instead.
		a.log.Str("member-id", a.action.MemberID).Warn("member is already gone")
		return nil, api.MemberStatus{}, false
	}

	cache, ok := a.actionCtx.ACS().ClusterCache(m.ClusterID)
	if !ok {
		a.log.Str("pod-name", m.Pod.GetName()).Warn("Cluster is not ready")
		return nil, api.MemberStatus{}, false
	}

	if ifPodUIDMismatch(m, a.action, cache) {
		a.log.Error("Member UID is changed")
		return NewActionSuccess(), m, true
	}

	pod, ok := cache.Pod().V1().GetSimple(m.Pod.GetName())
	if !ok {
		a.log.Str("pod-name", m.Pod.GetName()).Warn("pod is already gone")
		// Pod does not exist, so create success action to finish it immediately.
		return NewActionSuccess(), m, true
	}

	if _, ok := pod.GetAnnotations()[deployment.ArangoDeploymentPodDeleteNow]; ok {
		// The pod contains annotation, so pod must be deleted immediately.
		return shutdownNow{actionImpl: a, memberStatus: m}, m, true
	}

	// With the graceful shutdown feature enabled the API helper is always used,
	// regardless of the shutdown method configured for the server group.
	if features.GracefulShutdown().Enabled() {
		return shutdownHelperAPI{actionImpl: a, memberStatus: m}, m, true
	}

	serverGroup := a.actionCtx.GetSpec().GetServerGroupSpec(a.action.Group)

	switch serverGroup.ShutdownMethod.Get() {
	case api.ServerGroupShutdownMethodDelete:
		return shutdownHelperDelete{actionImpl: a, memberStatus: m}, m, true
	default:
		return shutdownHelperAPI{actionImpl: a, memberStatus: m}, m, true
	}
}
// shutdownHelperAPI shuts a member down by calling the shutdown endpoint of the server
// (arangod groups) or by deleting the pod (arangosync groups).
type shutdownHelperAPI struct {
actionImpl
// memberStatus is the member status handed over by getShutdownHelper when the helper is created.
memberStatus api.MemberStatus
}
// Start initiates the shutdown of the member.
//
// When the graceful shutdown feature is disabled, the finalizers of the pod are
// removed first. Afterwards arangod members are asked to shut down through the
// shutdown endpoint, while pods of arangosync members are deleted.
//
// The first result is true when nothing is left to wait for: the member has no
// pod name, or the shutdown call failed but CheckProgress already reports the
// member as done. It is also true, together with an error, when the cluster
// cache is not available. Otherwise false is returned and the progress has to be
// observed with CheckProgress.
func (s shutdownHelperAPI) Start(ctx context.Context) (bool, error) {
	s.log.Info("Using API to shutdown member")

	group := s.action.Group
	podName := s.memberStatus.Pod.GetName()
	if podName == "" {
		s.log.Warn("Pod is empty")
		return true, nil
	}

	cache, ok := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
	if !ok {
		return true, errors.Newf("Cluster is not ready")
	}

	// Remove finalizers, so Kubernetes will quickly terminate the pod
	if !features.GracefulShutdown().Enabled() {
		pod, ok := cache.Pod().V1().GetSimple(podName)
		if ok && len(pod.Finalizers) > 0 {
			pod.Finalizers = nil

			ctxChild, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
			defer cancel()

			if _, err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Update(ctxChild, pod, meta.UpdateOptions{}); err != nil {
				return false, err
			}
		}
	}

	if group.IsArangod() {
		// Invoke shutdown endpoint
		c, err := s.actionCtx.GetMembersState().GetMemberClient(s.action.MemberID)
		if err != nil {
			s.log.Err(err).Debug("Failed to create member client")
			return false, errors.WithStack(err)
		}

		removeFromCluster := false
		s.log.Bool("removeFromCluster", removeFromCluster).Debug("Shutting down member")

		ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
		defer cancel()

		if err := c.ShutdownV2(ctxChild, removeFromCluster, true); err != nil {
			// Shutdown failed. Let's check if we're already done.
			// The err declared in the initializer below is scoped to that if statement only,
			// so the error logged and returned afterwards is still the ShutdownV2 error.
			if ready, _, err := s.CheckProgress(ctxChild); err == nil && ready {
				// We're done
				return true, nil
			}
			s.log.Err(err).Debug("Failed to shutdown member")
			return false, errors.WithStack(err)
		}
	} else if group.IsArangosync() {
		// Terminate pod
		if err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Delete(ctx, podName, meta.DeleteOptions{}); err != nil {
			// A pod which is already gone is what we want to achieve, so it is not a failure.
			// This matches the handling in shutdownHelperDelete.Start and shutdownNow.CheckProgress.
			if !kerrors.IsNotFound(err) {
				return false, errors.WithStack(err)
			}
		}
	}

	return false, nil
}
// CheckProgress reports, through its first result, whether the member carries the
// Terminated condition. The second result is always false and the error is always nil.
//
// While the member is not terminated and belongs to the serving group of the
// deployment mode, expired jobs are deleted on the member, gated by
// BackoffExecution. Failures of that cleanup are only logged.
func (s shutdownHelperAPI) CheckProgress(ctx context.Context) (bool, bool, error) {
	if s.memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
		return true, false, nil
	}

	// The cleanup below is attempted only for members of the serving group.
	if s.action.Group != s.actionCtx.GetMode().ServingGroup() {
		return false, false, nil
	}

	if !s.actionCtx.BackoffExecution(s.action, actionShutdownJobExpiredTermination, actionShutdownJobExpiredTerminationDelay) {
		return false, false, nil
	}

	// Lets try to run termination
	memberClient, err := s.actionCtx.GetMembersState().GetMemberClient(s.action.MemberID)
	if err != nil {
		s.log.Err(err).Warn("Failed to create member client")
		return false, false, nil
	}

	jobsClient := client.NewClient(memberClient.Connection(), s.log)
	if err := jobsClient.DeleteExpiredJobs(ctx, ActionShutdownJobExpiredTerminationTimeout); err != nil {
		s.log.Err(err).Warn("Unable to kill async jobs on member")
	}

	return false, false, nil
}
// shutdownHelperDelete shuts a member down by deleting its pod with the Kubernetes API.
type shutdownHelperDelete struct {
actionImpl
// memberStatus is the member status handed over by getShutdownHelper when the helper is created.
memberStatus api.MemberStatus
}
// Start requests deletion of the member's pod through the Kubernetes API.
//
// The first result is true when the member has no pod name, and also (together
// with an error) when the cluster cache is not available. In every other case
// false is returned; a pod which is reported as not found is not an error.
func (s shutdownHelperDelete) Start(ctx context.Context) (bool, error) {
	s.log.Info("Using Pod Delete to shutdown member")

	name := s.memberStatus.Pod.GetName()
	if name == "" {
		s.log.Warn("Pod is empty")
		return true, nil
	}

	clusterCache, found := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
	if !found {
		return true, errors.Newf("Cluster is not ready")
	}

	// Terminate pod
	pods := clusterCache.Client().Kubernetes().CoreV1().Pods(clusterCache.Namespace())
	err := pods.Delete(ctx, name, meta.DeleteOptions{})
	if err != nil && !kerrors.IsNotFound(err) {
		return false, errors.WithStack(err)
	}

	return false, nil
}
// CheckProgress reports, through its first result, whether the pod is removed:
// the member must carry the Terminated condition and, when a pod name is set,
// the pod must no longer be present in the cluster cache.
// The second result is always false and the error is always nil.
func (s shutdownHelperDelete) CheckProgress(ctx context.Context) (bool, bool, error) {
	// Check that pod is removed
	if terminated := s.memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated); !terminated {
		// Pod is not yet terminated
		s.log.Warn("Pod not yet terminated")
		return false, false, nil
	}

	clusterCache, found := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
	if !found {
		s.log.Warn("Cluster is not ready")
		return false, false, nil
	}

	if name := s.memberStatus.Pod.GetName(); name != "" {
		if _, exists := clusterCache.Pod().V1().GetSimple(name); exists {
			s.log.Warn("Pod still exists")
			return false, false, nil
		}
	}

	return true, false, nil
}
// shutdownNow shuts a member down forcefully: it removes the finalizers of the pod
// and deletes the pod with a grace period of one second.
type shutdownNow struct {
actionImpl
// memberStatus is the member status handed over by getShutdownHelper when the helper is created.
memberStatus api.MemberStatus
}
// Start begins the forceful removal of the pod by delegating to CheckProgress
// and returns its readiness flag and error.
func (s shutdownNow) Start(ctx context.Context) (bool, error) {
	s.log.Info("Using shutdown now method")

	// CheckProgress does the actual work, because a removal which started
	// gracefully can be switched to a forced shutdown later on.
	done, _, err := s.CheckProgress(ctx)
	return done, err
}
// CheckProgress forcefully removes the pod and reports, through its first result,
// whether the removal is done. The second result is always false.
//
// The removal counts as done when the pod is not in the cluster cache, when the
// cached pod has a different UID than the one in the member status, or once the
// delete request has been issued (a not-found answer included). When the cluster
// cache is not available, false is returned without an error.
func (s shutdownNow) CheckProgress(ctx context.Context) (bool, bool, error) {
	name := s.memberStatus.Pod.GetName()

	clusterCache, found := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
	if !found {
		s.log.Warn("Cluster is not ready")
		return false, false, nil
	}

	current, exists := clusterCache.Pod().V1().GetSimple(name)
	switch {
	case !exists:
		s.log.Info("Using shutdown now method completed because pod is gone")
		return true, false, nil
	case s.memberStatus.Pod.GetUID() != current.GetUID():
		s.log.Info("Using shutdown now method completed because it is already rotated")
		// The new pod has been started already.
		return true, false, nil
	}

	// Remove finalizers forcefully.
	if len(current.Finalizers) > 0 {
		current.Finalizers = nil

		updateCtx, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
		defer cancel()

		_, err := clusterCache.Client().Kubernetes().CoreV1().Pods(clusterCache.Namespace()).Update(updateCtx, current, meta.UpdateOptions{})
		if err != nil {
			return false, false, err
		}
	}

	// Terminate pod.
	deleteOptions := meta.DeleteOptions{
		// Leave one second to clean a PVC.
		GracePeriodSeconds: util.NewInt64(1),
	}
	err := clusterCache.Client().Kubernetes().CoreV1().Pods(clusterCache.Namespace()).Delete(ctx, name, deleteOptions)
	if err != nil && !kerrors.IsNotFound(err) {
		return false, false, errors.WithStack(err)
	}

	s.log.Info("Using shutdown now method completed")
	return true, false, nil
}