mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
300 lines
9.2 KiB
Go
300 lines
9.2 KiB
Go
//
|
|
// DISCLAIMER
|
|
//
|
|
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
|
//
|
|
|
|
package reconcile
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
|
|
"github.com/arangodb/kube-arangodb/pkg/apis/deployment"
|
|
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
|
"github.com/arangodb/kube-arangodb/pkg/deployment/client"
|
|
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
|
|
"github.com/arangodb/kube-arangodb/pkg/util"
|
|
"github.com/arangodb/kube-arangodb/pkg/util/errors"
|
|
"github.com/arangodb/kube-arangodb/pkg/util/globals"
|
|
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/kerrors"
|
|
)
|
|
|
|
const (
|
|
actionShutdownJobExpiredTermination api.PlanLocalKey = "expiredJobTerminationCheck"
|
|
actionShutdownJobExpiredTerminationDelay = 10 * time.Second
|
|
ActionShutdownJobExpiredTerminationTimeout = time.Minute
|
|
)
|
|
|
|
// getShutdownHelper returns an action to shut down a pod according to the settings.
|
|
// Returns true when member status exists.
|
|
// There are 3 possibilities to shut down the pod: immediately, gracefully, standard kubernetes delete API.
|
|
// When pod does not exist then success action (which always successes) is returned.
|
|
func getShutdownHelper(a actionImpl) (ActionCore, api.MemberStatus, bool) {
|
|
m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
|
|
if !ok {
|
|
a.log.Str("pod-name", m.Pod.GetName()).Warn("member is already gone")
|
|
|
|
return nil, api.MemberStatus{}, false
|
|
}
|
|
|
|
cache, ok := a.actionCtx.ACS().ClusterCache(m.ClusterID)
|
|
if !ok {
|
|
a.log.Str("pod-name", m.Pod.GetName()).Warn("Cluster is not ready")
|
|
|
|
return nil, api.MemberStatus{}, false
|
|
}
|
|
|
|
if ifPodUIDMismatch(m, a.action, cache) {
|
|
a.log.Error("Member UID is changed")
|
|
return NewActionSuccess(), m, true
|
|
}
|
|
|
|
pod, ok := cache.Pod().V1().GetSimple(m.Pod.GetName())
|
|
if !ok {
|
|
a.log.Str("pod-name", m.Pod.GetName()).Warn("pod is already gone")
|
|
// Pod does not exist, so create success action to finish it immediately.
|
|
return NewActionSuccess(), m, true
|
|
}
|
|
|
|
if _, ok := pod.GetAnnotations()[deployment.ArangoDeploymentPodDeleteNow]; ok {
|
|
// The pod contains annotation, so pod must be deleted immediately.
|
|
return shutdownNow{actionImpl: a, memberStatus: m}, m, true
|
|
}
|
|
|
|
if features.GracefulShutdown().Enabled() {
|
|
return shutdownHelperAPI{actionImpl: a, memberStatus: m}, m, true
|
|
}
|
|
|
|
serverGroup := a.actionCtx.GetSpec().GetServerGroupSpec(a.action.Group)
|
|
|
|
switch serverGroup.ShutdownMethod.Get() {
|
|
case api.ServerGroupShutdownMethodDelete:
|
|
return shutdownHelperDelete{actionImpl: a, memberStatus: m}, m, true
|
|
default:
|
|
return shutdownHelperAPI{actionImpl: a, memberStatus: m}, m, true
|
|
}
|
|
}
|
|
|
|
type shutdownHelperAPI struct {
|
|
actionImpl
|
|
memberStatus api.MemberStatus
|
|
}
|
|
|
|
func (s shutdownHelperAPI) Start(ctx context.Context) (bool, error) {
|
|
s.log.Info("Using API to shutdown member")
|
|
|
|
group := s.action.Group
|
|
podName := s.memberStatus.Pod.GetName()
|
|
if podName == "" {
|
|
s.log.Warn("Pod is empty")
|
|
return true, nil
|
|
}
|
|
|
|
cache, ok := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
|
|
if !ok {
|
|
return true, errors.Newf("Cluster is not ready")
|
|
}
|
|
|
|
// Remove finalizers, so Kubernetes will quickly terminate the pod
|
|
if !features.GracefulShutdown().Enabled() {
|
|
pod, ok := cache.Pod().V1().GetSimple(podName)
|
|
if ok && len(pod.Finalizers) > 0 {
|
|
pod.Finalizers = nil
|
|
|
|
ctxChild, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
|
|
defer cancel()
|
|
|
|
if _, err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Update(ctxChild, pod, meta.UpdateOptions{}); err != nil {
|
|
return false, err
|
|
}
|
|
}
|
|
}
|
|
|
|
if group.IsArangod() {
|
|
// Invoke shutdown endpoint
|
|
c, err := s.actionCtx.GetMembersState().GetMemberClient(s.action.MemberID)
|
|
if err != nil {
|
|
s.log.Err(err).Debug("Failed to create member client")
|
|
return false, errors.WithStack(err)
|
|
}
|
|
removeFromCluster := false
|
|
s.log.Bool("removeFromCluster", removeFromCluster).Debug("Shutting down member")
|
|
ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
|
|
defer cancel()
|
|
if err := c.ShutdownV2(ctxChild, removeFromCluster, true); err != nil {
|
|
// Shutdown failed. Let's check if we're already done
|
|
if ready, _, err := s.CheckProgress(ctxChild); err == nil && ready {
|
|
// We're done
|
|
return true, nil
|
|
}
|
|
s.log.Err(err).Debug("Failed to shutdown member")
|
|
return false, errors.WithStack(err)
|
|
}
|
|
} else if group.IsArangosync() {
|
|
// Terminate pod
|
|
if err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Delete(ctx, podName, meta.DeleteOptions{}); err != nil {
|
|
return false, errors.WithStack(err)
|
|
}
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// CheckProgress returns true when pod is terminated.
|
|
func (s shutdownHelperAPI) CheckProgress(ctx context.Context) (bool, bool, error) {
|
|
if s.memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
|
|
return true, false, nil
|
|
}
|
|
|
|
if s.action.Group == s.actionCtx.GetMode().ServingGroup() {
|
|
if s.actionCtx.BackoffExecution(s.action, actionShutdownJobExpiredTermination, actionShutdownJobExpiredTerminationDelay) {
|
|
// Lets try to run termination
|
|
c, err := s.actionCtx.GetMembersState().GetMemberClient(s.action.MemberID)
|
|
if err != nil {
|
|
s.log.Err(err).Warn("Failed to create member client")
|
|
} else {
|
|
internal := client.NewClient(c.Connection(), s.log)
|
|
|
|
if err := internal.DeleteExpiredJobs(ctx, ActionShutdownJobExpiredTerminationTimeout); err != nil {
|
|
s.log.Err(err).Warn("Unable to kill async jobs on member")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, false, nil
|
|
}
|
|
|
|
type shutdownHelperDelete struct {
|
|
actionImpl
|
|
memberStatus api.MemberStatus
|
|
}
|
|
|
|
func (s shutdownHelperDelete) Start(ctx context.Context) (bool, error) {
|
|
s.log.Info("Using Pod Delete to shutdown member")
|
|
|
|
podName := s.memberStatus.Pod.GetName()
|
|
if podName == "" {
|
|
s.log.Warn("Pod is empty")
|
|
return true, nil
|
|
}
|
|
|
|
cache, ok := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
|
|
if !ok {
|
|
return true, errors.Newf("Cluster is not ready")
|
|
}
|
|
|
|
// Terminate pod
|
|
if err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Delete(ctx, podName, meta.DeleteOptions{}); err != nil {
|
|
if !kerrors.IsNotFound(err) {
|
|
return false, errors.WithStack(err)
|
|
}
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
func (s shutdownHelperDelete) CheckProgress(ctx context.Context) (bool, bool, error) {
|
|
// Check that pod is removed
|
|
if !s.memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
|
|
// Pod is not yet terminated
|
|
s.log.Warn("Pod not yet terminated")
|
|
return false, false, nil
|
|
}
|
|
|
|
cache, ok := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
|
|
if !ok {
|
|
s.log.Warn("Cluster is not ready")
|
|
return false, false, nil
|
|
}
|
|
|
|
podName := s.memberStatus.Pod.GetName()
|
|
if podName != "" {
|
|
if _, ok := cache.Pod().V1().GetSimple(podName); ok {
|
|
s.log.Warn("Pod still exists")
|
|
return false, false, nil
|
|
}
|
|
}
|
|
|
|
return true, false, nil
|
|
}
|
|
|
|
type shutdownNow struct {
|
|
actionImpl
|
|
memberStatus api.MemberStatus
|
|
}
|
|
|
|
// Start starts removing pod forcefully.
|
|
func (s shutdownNow) Start(ctx context.Context) (bool, error) {
|
|
// Check progress is used here because removing pod can start gracefully,
|
|
// and then it can be changed to force shutdown.
|
|
s.log.Info("Using shutdown now method")
|
|
ready, _, err := s.CheckProgress(ctx)
|
|
return ready, err
|
|
}
|
|
|
|
// CheckProgress starts removing pod forcefully and checks if has it been removed.
|
|
func (s shutdownNow) CheckProgress(ctx context.Context) (bool, bool, error) {
|
|
podName := s.memberStatus.Pod.GetName()
|
|
|
|
cache, ok := s.actionCtx.ACS().ClusterCache(s.memberStatus.ClusterID)
|
|
if !ok {
|
|
s.log.Warn("Cluster is not ready")
|
|
return false, false, nil
|
|
}
|
|
|
|
pod, ok := cache.Pod().V1().GetSimple(podName)
|
|
if !ok {
|
|
s.log.Info("Using shutdown now method completed because pod is gone")
|
|
return true, false, nil
|
|
}
|
|
|
|
if s.memberStatus.Pod.GetUID() != pod.GetUID() {
|
|
s.log.Info("Using shutdown now method completed because it is already rotated")
|
|
// The new pod has been started already.
|
|
return true, false, nil
|
|
}
|
|
|
|
// Remove finalizers forcefully.
|
|
if len(pod.Finalizers) > 0 {
|
|
pod.Finalizers = nil
|
|
|
|
ctxChild, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
|
|
defer cancel()
|
|
|
|
if _, err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Update(ctxChild, pod, meta.UpdateOptions{}); err != nil {
|
|
return false, false, err
|
|
}
|
|
}
|
|
|
|
// Terminate pod.
|
|
options := meta.DeleteOptions{
|
|
// Leave one second to clean a PVC.
|
|
GracePeriodSeconds: util.NewInt64(1),
|
|
}
|
|
if err := cache.Client().Kubernetes().CoreV1().Pods(cache.Namespace()).Delete(ctx, podName, options); err != nil {
|
|
if !kerrors.IsNotFound(err) {
|
|
return false, false, errors.WithStack(err)
|
|
}
|
|
}
|
|
|
|
s.log.Info("Using shutdown now method completed")
|
|
return true, false, nil
|
|
}
|