mirror of https://github.com/arangodb/kube-arangodb.git
Monitor sync status concurrently. Only create new upgrade plan if everything is good.
This commit is contained in:
parent
8749bfaa9a
commit
90b1d87daa
9 changed files with 114 additions and 24 deletions
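In short: the shard-sync check moves out of the per-member wait action into a dedicated background loop, and the plan builder now only applies an upgrade plan when all shards are in sync and the deployment is Ready. The standalone Go sketch below illustrates that pattern under simplified assumptions; syncMonitor, the fetch callback, and the ready flag are illustrative stand-ins, not the operator's actual types.

// Minimal sketch of the pattern introduced by this commit: a background
// goroutine keeps a mutex-guarded "all shards in sync" flag up to date,
// and an upgrade is only allowed when that flag and a readiness flag agree.
package main

import (
	"fmt"
	"sync"
	"time"
)

// syncMonitor caches the last observed shard-sync state (hypothetical type).
type syncMonitor struct {
	mutex     sync.Mutex
	allInSync bool
}

// run polls fetch at the given interval until stopCh is closed,
// mirroring the role of RunDeploymentShardSyncLoop.
func (m *syncMonitor) run(stopCh <-chan struct{}, interval time.Duration, fetch func() (bool, error)) {
	for {
		if inSync, err := fetch(); err == nil {
			m.mutex.Lock()
			m.allInSync = inSync
			m.mutex.Unlock()
		}
		select {
		case <-time.After(interval):
			// Continue polling
		case <-stopCh:
			// We're done
			return
		}
	}
}

// shardSyncStatus mirrors GetShardSyncStatus: a cheap, lock-protected read.
func (m *syncMonitor) shardSyncStatus() bool {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	return m.allInSync
}

// clusterReadyForUpgrade mirrors the new gate in the plan builder:
// all shards in sync and the deployment reported as ready.
func clusterReadyForUpgrade(m *syncMonitor, ready bool) bool {
	return m.shardSyncStatus() && ready
}

func main() {
	stopCh := make(chan struct{})
	m := &syncMonitor{}
	go m.run(stopCh, 100*time.Millisecond, func() (bool, error) { return true, nil })

	time.Sleep(250 * time.Millisecond)
	fmt.Println("upgrade allowed:", clusterReadyForUpgrade(m, true))
	close(stopCh)
}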
@@ -380,3 +380,8 @@ func (d *Deployment) GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 	agents api.MemberStatusList, id string, version driver.Version) []string {
 	return d.resources.GetExpectedPodArguments(apiObject, deplSpec, group, agents, id, version)
 }
+
+// GetShardSyncStatus returns true if all shards are in sync
+func (d *Deployment) GetShardSyncStatus() bool {
+	return d.resources.GetShardSyncStatus()
+}
@@ -141,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
 		d.clusterScalingIntegration = ci
 		go ci.ListenForClusterEvents(d.stopCh)
 		go d.resources.RunDeploymentHealthLoop(d.stopCh)
+		go d.resources.RunDeploymentShardSyncLoop(d.stopCh)
 	}
 	if config.AllowChaos {
 		d.chaosMonkey = chaos.NewMonkey(deps.Log, d)
@@ -167,26 +167,6 @@ func (a *actionWaitForMemberUp) checkProgressCluster(ctx context.Context) (bool,
 		log.Debug().Str("status", string(sh.Status)).Msg("Member set status not yet good")
 		return false, false, nil
 	}
-	if a.action.Group == api.ServerGroupDBServers {
-		dbs, err := c.Databases(ctx)
-		if err != nil {
-			return false, false, err
-		}
-		for _, db := range dbs {
-			inv, err := cluster.DatabaseInventory(ctx, db)
-			if err != nil {
-				return false, false, err
-			}
-
-			for _, col := range inv.Collections {
-				if !col.AllInSync {
-					log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
-					return false, false, nil
-				}
-			}
-		}
-
-	}
 	// Wait for the member to become ready from a kubernetes point of view
 	// otherwise the coordinators may be rotated to fast and thus none of them
 	// is ready resulting in a short downtime
@@ -93,4 +93,6 @@ type Context interface {
 	// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
 	GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 		agents api.MemberStatusList, id string, version driver.Version) []string
+	// GetShardSyncStatus returns true if all shards are in sync
+	GetShardSyncStatus() bool
 }
@@ -209,12 +209,17 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
 			})
 			return newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense
 		}
 
 		if newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense := createRotateOrUpgradePlan(); upgradeNotAllowed {
 			// Upgrade is needed, but not allowed
 			context.CreateEvent(k8sutil.NewUpgradeNotAllowedEvent(apiObject, fromVersion, toVersion, fromLicense, toLicense))
-		} else {
-			// Use the new plan
-			plan = newPlan
+		} else if len(newPlan) > 0 {
+			if clusterReadyForUpgrade(context) {
+				// Use the new plan
+				plan = newPlan
+			} else {
+				log.Info().Msg("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready.")
+			}
+		}
 	}
 
@@ -237,6 +242,15 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
 	return plan, true
 }
 
+// clusterReadyForUpgrade returns true if the cluster is ready for the next update, that is:
+// - all shards are in sync
+// - all members are ready and fine
+func clusterReadyForUpgrade(context PlanBuilderContext) bool {
+	status, _ := context.GetStatus()
+	allInSync := context.GetShardSyncStatus()
+	return allInSync && status.Conditions.IsTrue(api.ConditionTypeReady)
+}
+
 // podNeedsUpgrading decides if an upgrade of the pod is needed (to comply with
 // the given spec) and if that is allowed.
 func podNeedsUpgrading(log zerolog.Logger, p v1.Pod, spec api.DeploymentSpec, images api.ImageInfoList) upgradeDecision {
@@ -46,6 +46,10 @@ type PlanBuilderContext interface {
 	// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
 	GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 		agents api.MemberStatusList, id string, version driver.Version) []string
+	// GetShardSyncStatus returns true if all shards are in sync
+	GetShardSyncStatus() bool
+	// GetStatus returns the current status of the deployment
+	GetStatus() (api.DeploymentStatus, int32)
 }
 
 // newPlanBuilderContext creates a PlanBuilderContext from the given context
@@ -33,5 +33,5 @@ const (
 	rotateMemberTimeout    = time.Minute * 30
 	shutdownMemberTimeout  = time.Minute * 30
 	upgradeMemberTimeout   = time.Hour * 6
-	waitForMemberUpTimeout = time.Minute * 15
+	waitForMemberUpTimeout = time.Minute * 45
 )
@@ -28,10 +28,12 @@ import (
 
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
 	"github.com/arangodb/kube-arangodb/pkg/metrics"
+	"github.com/rs/zerolog/log"
 )
 
 var (
 	deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result)
+	deploymentSyncFetchesCounters   = metrics.MustRegisterCounterVec(metricsComponent, "deployment_sync_fetches", "Number of times the sync status of shards of the deployment was fetched", metrics.DeploymentName, metrics.Result)
 )
 
 // RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
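The counter registrations above use the operator's own metrics helper package. As a rough sketch of what such a registration corresponds to with the Prometheus Go client used directly (the namespace, subsystem, and label values below are assumptions for illustration, not the operator's actual wiring):

package metricsexample

import "github.com/prometheus/client_golang/prometheus"

// deploymentSyncFetches counts shard-sync fetches per deployment and result.
// Namespace and subsystem are illustrative assumptions.
var deploymentSyncFetches = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "arangodb_operator",
		Subsystem: "resources",
		Name:      "deployment_sync_fetches",
		Help:      "Number of times the sync status of shards of the deployment was fetched",
	},
	[]string{"deployment", "result"},
)

func init() {
	// Register with the default registry so the counter is exported via promhttp.
	prometheus.MustRegister(deploymentSyncFetches)
}

// recordSyncFetch increments the success or failed counter for one fetch attempt.
func recordSyncFetch(deploymentName string, err error) {
	result := "success"
	if err != nil {
		result = "failed"
	}
	deploymentSyncFetches.WithLabelValues(deploymentName, result).Inc()
}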
@@ -88,3 +90,80 @@ func (r *Resources) fetchDeploymentHealth() error {
 	r.health.timestamp = time.Now()
 	return nil
 }
+
+// RunDeploymentShardSyncLoop creates a loop to fetch the sync status of shards of the deployment.
+// The loop ends when the given channel is closed.
+func (r *Resources) RunDeploymentShardSyncLoop(stopCh <-chan struct{}) {
+	log := r.log
+	deploymentName := r.context.GetAPIObject().GetName()
+
+	if r.context.GetSpec().GetMode() != api.DeploymentModeCluster {
+		// Deployment health is currently only applicable for clusters
+		return
+	}
+
+	for {
+		if err := r.fetchClusterShardSyncState(); err != nil {
+			log.Debug().Err(err).Msg("Failed to fetch deployment shard sync state")
+			deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
+		} else {
+			deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
+		}
+		select {
+		case <-time.After(time.Second * 30):
+			// Continue
+		case <-stopCh:
+			// We're done
+			return
+		}
+	}
+}
+
+// fetchClusterShardSyncState performs a single fetch of the cluster inventory and
+// checks if all shards are in sync
+func (r *Resources) fetchClusterShardSyncState() error {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
+	defer cancel()
+	c, err := r.context.GetDatabaseClient(ctx)
+	if err != nil {
+		return err
+	}
+	cluster, err := c.Cluster(ctx)
+	if err != nil {
+		return err
+	}
+	dbs, err := c.Databases(ctx)
+	if err != nil {
+		return err
+	}
+
+	allInSync := true
+dbloop:
+	for _, db := range dbs {
+		inv, err := cluster.DatabaseInventory(ctx, db)
+		if err != nil {
+			return err
+		}
+
+		for _, col := range inv.Collections {
+			if !col.AllInSync {
+				log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
+				allInSync = false
+				break dbloop
+			}
+		}
+	}
+
+	r.shardSync.mutex.Lock()
+	defer r.shardSync.mutex.Unlock()
+	r.shardSync.allInSync = allInSync
+	r.shardSync.timestamp = time.Now()
+	return nil
+}
+
+// GetShardSyncStatus returns true if all shards are in sync
+func (r *Resources) GetShardSyncStatus() bool {
+	r.shardSync.mutex.Lock()
+	defer r.shardSync.mutex.Unlock()
+	return r.shardSync.allInSync
+}
@@ -40,6 +40,11 @@ type Resources struct {
 		timestamp time.Time  // Timestamp of last fetch of cluster health
 		mutex     sync.Mutex // Mutex guarding fields in this struct
 	}
+	shardSync struct {
+		allInSync bool
+		timestamp time.Time
+		mutex     sync.Mutex
+	}
 }
 
 // NewResources creates a new Resources service, used to