mirror of https://github.com/arangodb/kube-arangodb.git
Monitor sync status concurrently. Only create new upgrade plan if everything is good.
This commit is contained in:
parent
8749bfaa9a
commit
90b1d87daa
9 changed files with 114 additions and 24 deletions
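In short: the shard-sync check moves out of the per-member wait action into a dedicated background loop, and the plan builder now only applies an upgrade plan when all shards are in sync and the deployment is Ready. The standalone Go sketch below illustrates that pattern under simplified assumptions; syncMonitor, the fetch callback, and the ready flag are illustrative stand-ins, not the operator's actual types.

// Minimal sketch of the pattern introduced by this commit: a background
// goroutine keeps a mutex-guarded "all shards in sync" flag up to date,
// and an upgrade is only allowed when that flag and a readiness flag agree.
package main

import (
	"fmt"
	"sync"
	"time"
)

// syncMonitor caches the last observed shard-sync state (hypothetical type).
type syncMonitor struct {
	mutex     sync.Mutex
	allInSync bool
}

// run polls fetch at the given interval until stopCh is closed,
// mirroring the role of RunDeploymentShardSyncLoop.
func (m *syncMonitor) run(stopCh <-chan struct{}, interval time.Duration, fetch func() (bool, error)) {
	for {
		if inSync, err := fetch(); err == nil {
			m.mutex.Lock()
			m.allInSync = inSync
			m.mutex.Unlock()
		}
		select {
		case <-time.After(interval):
			// Continue polling
		case <-stopCh:
			// We're done
			return
		}
	}
}

// shardSyncStatus mirrors GetShardSyncStatus: a cheap, lock-protected read.
func (m *syncMonitor) shardSyncStatus() bool {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	return m.allInSync
}

// clusterReadyForUpgrade mirrors the new gate in the plan builder:
// all shards in sync and the deployment reported as ready.
func clusterReadyForUpgrade(m *syncMonitor, ready bool) bool {
	return m.shardSyncStatus() && ready
}

func main() {
	stopCh := make(chan struct{})
	m := &syncMonitor{}
	go m.run(stopCh, 100*time.Millisecond, func() (bool, error) { return true, nil })

	time.Sleep(250 * time.Millisecond)
	fmt.Println("upgrade allowed:", clusterReadyForUpgrade(m, true))
	close(stopCh)
}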
@@ -380,3 +380,8 @@ func (d *Deployment) GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 	agents api.MemberStatusList, id string, version driver.Version) []string {
 	return d.resources.GetExpectedPodArguments(apiObject, deplSpec, group, agents, id, version)
 }
+
+// GetShardSyncStatus returns true if all shards are in sync
+func (d *Deployment) GetShardSyncStatus() bool {
+	return d.resources.GetShardSyncStatus()
+}
@@ -141,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
 		d.clusterScalingIntegration = ci
 		go ci.ListenForClusterEvents(d.stopCh)
 		go d.resources.RunDeploymentHealthLoop(d.stopCh)
+		go d.resources.RunDeploymentShardSyncLoop(d.stopCh)
 	}
 	if config.AllowChaos {
 		d.chaosMonkey = chaos.NewMonkey(deps.Log, d)
@@ -167,26 +167,6 @@ func (a *actionWaitForMemberUp) checkProgressCluster(ctx context.Context) (bool,
 		log.Debug().Str("status", string(sh.Status)).Msg("Member set status not yet good")
 		return false, false, nil
 	}
-	if a.action.Group == api.ServerGroupDBServers {
-		dbs, err := c.Databases(ctx)
-		if err != nil {
-			return false, false, err
-		}
-		for _, db := range dbs {
-			inv, err := cluster.DatabaseInventory(ctx, db)
-			if err != nil {
-				return false, false, err
-			}
-
-			for _, col := range inv.Collections {
-				if !col.AllInSync {
-					log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
-					return false, false, nil
-				}
-			}
-		}
-
-	}
 	// Wait for the member to become ready from a kubernetes point of view
 	// otherwise the coordinators may be rotated to fast and thus none of them
 	// is ready resulting in a short downtime
@@ -93,4 +93,6 @@ type Context interface {
 	// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
 	GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 		agents api.MemberStatusList, id string, version driver.Version) []string
+	// GetShardSyncStatus returns true if all shards are in sync
+	GetShardSyncStatus() bool
 }
@@ -209,12 +209,17 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
 			})
 			return newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense
 		}
 
 		if newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense := createRotateOrUpgradePlan(); upgradeNotAllowed {
 			// Upgrade is needed, but not allowed
 			context.CreateEvent(k8sutil.NewUpgradeNotAllowedEvent(apiObject, fromVersion, toVersion, fromLicense, toLicense))
-		} else {
-			// Use the new plan
-			plan = newPlan
+		} else if len(newPlan) > 0 {
+			if clusterReadyForUpgrade(context) {
+				// Use the new plan
+				plan = newPlan
+			} else {
+				log.Info().Msg("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready.")
+			}
+		}
 	}
 
@@ -237,6 +242,15 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
 	return plan, true
 }
 
+// clusterReadyForUpgrade returns true if the cluster is ready for the next update, that is:
+// - all shards are in sync
+// - all members are ready and fine
+func clusterReadyForUpgrade(context PlanBuilderContext) bool {
+	status, _ := context.GetStatus()
+	allInSync := context.GetShardSyncStatus()
+	return allInSync && status.Conditions.IsTrue(api.ConditionTypeReady)
+}
+
 // podNeedsUpgrading decides if an upgrade of the pod is needed (to comply with
 // the given spec) and if that is allowed.
 func podNeedsUpgrading(log zerolog.Logger, p v1.Pod, spec api.DeploymentSpec, images api.ImageInfoList) upgradeDecision {
@@ -46,6 +46,10 @@ type PlanBuilderContext interface {
 	// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
 	GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
 		agents api.MemberStatusList, id string, version driver.Version) []string
+	// GetShardSyncStatus returns true if all shards are in sync
+	GetShardSyncStatus() bool
+	// GetStatus returns the current status of the deployment
+	GetStatus() (api.DeploymentStatus, int32)
 }
 
 // newPlanBuilderContext creates a PlanBuilderContext from the given context
@@ -33,5 +33,5 @@ const (
 	rotateMemberTimeout    = time.Minute * 30
 	shutdownMemberTimeout  = time.Minute * 30
 	upgradeMemberTimeout   = time.Hour * 6
-	waitForMemberUpTimeout = time.Minute * 15
+	waitForMemberUpTimeout = time.Minute * 45
 )
@@ -28,10 +28,12 @@ import (
 
 	api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
 	"github.com/arangodb/kube-arangodb/pkg/metrics"
+	"github.com/rs/zerolog/log"
 )
 
 var (
 	deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result)
+	deploymentSyncFetchesCounters   = metrics.MustRegisterCounterVec(metricsComponent, "deployment_sync_fetches", "Number of times the sync status of shards of the deployment was fetched", metrics.DeploymentName, metrics.Result)
 )
 
 // RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
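The counter registrations above use the operator's own metrics helper package. As a rough sketch of what such a registration corresponds to with the Prometheus Go client used directly (the namespace, subsystem, and label values below are assumptions for illustration, not the operator's actual wiring):

package metricsexample

import "github.com/prometheus/client_golang/prometheus"

// deploymentSyncFetches counts shard-sync fetches per deployment and result.
// Namespace and subsystem are illustrative assumptions.
var deploymentSyncFetches = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "arangodb_operator",
		Subsystem: "resources",
		Name:      "deployment_sync_fetches",
		Help:      "Number of times the sync status of shards of the deployment was fetched",
	},
	[]string{"deployment", "result"},
)

func init() {
	// Register with the default registry so the counter is exported via promhttp.
	prometheus.MustRegister(deploymentSyncFetches)
}

// recordSyncFetch increments the success or failed counter for one fetch attempt.
func recordSyncFetch(deploymentName string, err error) {
	result := "success"
	if err != nil {
		result = "failed"
	}
	deploymentSyncFetches.WithLabelValues(deploymentName, result).Inc()
}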
@@ -88,3 +90,80 @@ func (r *Resources) fetchDeploymentHealth() error {
 	r.health.timestamp = time.Now()
 	return nil
 }
+
+// RunDeploymentShardSyncLoop creates a loop to fetch the sync status of shards of the deployment.
+// The loop ends when the given channel is closed.
+func (r *Resources) RunDeploymentShardSyncLoop(stopCh <-chan struct{}) {
+	log := r.log
+	deploymentName := r.context.GetAPIObject().GetName()
+
+	if r.context.GetSpec().GetMode() != api.DeploymentModeCluster {
+		// Deployment health is currently only applicable for clusters
+		return
+	}
+
+	for {
+		if err := r.fetchClusterShardSyncState(); err != nil {
+			log.Debug().Err(err).Msg("Failed to fetch deployment shard sync state")
+			deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
+		} else {
+			deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
+		}
+		select {
+		case <-time.After(time.Second * 30):
+			// Continue
+		case <-stopCh:
+			// We're done
+			return
+		}
+	}
+}
+
+// fetchClusterShardSyncState performs a single fetch of the cluster inventory and
+// checks if all shards are in sync
+func (r *Resources) fetchClusterShardSyncState() error {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
+	defer cancel()
+	c, err := r.context.GetDatabaseClient(ctx)
+	if err != nil {
+		return err
+	}
+	cluster, err := c.Cluster(ctx)
+	if err != nil {
+		return err
+	}
+	dbs, err := c.Databases(ctx)
+	if err != nil {
+		return err
+	}
+
+	allInSync := true
+dbloop:
+	for _, db := range dbs {
+		inv, err := cluster.DatabaseInventory(ctx, db)
+		if err != nil {
+			return err
+		}
+
+		for _, col := range inv.Collections {
+			if !col.AllInSync {
+				log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
+				allInSync = false
+				break dbloop
+			}
+		}
+	}
+
+	r.shardSync.mutex.Lock()
+	defer r.shardSync.mutex.Unlock()
+	r.shardSync.allInSync = allInSync
+	r.shardSync.timestamp = time.Now()
+	return nil
+}
+
+// GetShardSyncStatus returns true if all shards are in sync
+func (r *Resources) GetShardSyncStatus() bool {
+	r.shardSync.mutex.Lock()
+	defer r.shardSync.mutex.Unlock()
+	return r.shardSync.allInSync
+}
@@ -40,6 +40,11 @@ type Resources struct {
 		timestamp time.Time  // Timestamp of last fetch of cluster health
 		mutex     sync.Mutex // Mutex guarding fields in this struct
 	}
+	shardSync struct {
+		allInSync bool
+		timestamp time.Time
+		mutex     sync.Mutex
+	}
 }
 
 // NewResources creates a new Resources service, used to