
Monitor sync status concurrently. Only create new upgrade plan if everything is good.

lamai93 2019-04-04 17:06:27 +02:00
parent 8749bfaa9a
commit 90b1d87daa
9 changed files with 114 additions and 24 deletions
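In short, the commit removes the inline shard-sync check from the wait-for-member-up action and instead runs a background loop that periodically caches an "all shards in sync" flag; plan creation only adopts a new rotate/upgrade plan when that flag and the deployment's Ready condition are both true. The following is a condensed, self-contained Go sketch of that pattern; the names (shardSyncCache, runSyncLoop, readyForUpgrade) and the trivial fetch function are illustrative stand-ins, not the operator's actual code.

package main

import (
	"fmt"
	"sync"
	"time"
)

// shardSyncCache caches the most recent "all shards in sync" observation
// behind a mutex, so upgrade planning never has to query the cluster directly.
type shardSyncCache struct {
	mutex     sync.Mutex
	allInSync bool
	timestamp time.Time
}

func (c *shardSyncCache) set(inSync bool) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	c.allInSync = inSync
	c.timestamp = time.Now()
}

func (c *shardSyncCache) get() bool {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	return c.allInSync
}

// runSyncLoop refreshes the cache every interval until stopCh is closed,
// mirroring the shape of the RunDeploymentShardSyncLoop added in this commit.
func runSyncLoop(cache *shardSyncCache, fetch func() (bool, error), stopCh <-chan struct{}) {
	for {
		if inSync, err := fetch(); err == nil {
			cache.set(inSync)
		}
		select {
		case <-time.After(30 * time.Second):
			// Continue with the next fetch.
		case <-stopCh:
			return
		}
	}
}

// readyForUpgrade gates a new upgrade plan on the cached sync status plus a
// member-readiness flag, analogous to clusterReadyForUpgrade below.
func readyForUpgrade(cache *shardSyncCache, membersReady bool) bool {
	return cache.get() && membersReady
}

func main() {
	cache := &shardSyncCache{}
	stopCh := make(chan struct{})
	// In the operator this fetch would walk the cluster inventory; here it is a stub.
	go runSyncLoop(cache, func() (bool, error) { return true, nil }, stopCh)
	time.Sleep(100 * time.Millisecond)
	fmt.Println("ready for upgrade:", readyForUpgrade(cache, true))
	close(stopCh)
}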

View file

@@ -380,3 +380,8 @@ func (d *Deployment) GetExpectedPodArguments(apiObject metav1.Object, deplSpec a
agents api.MemberStatusList, id string, version driver.Version) []string {
return d.resources.GetExpectedPodArguments(apiObject, deplSpec, group, agents, id, version)
}
// GetShardSyncStatus returns true if all shards are in sync
func (d *Deployment) GetShardSyncStatus() bool {
return d.resources.GetShardSyncStatus()
}

View file

@@ -141,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
d.clusterScalingIntegration = ci
go ci.ListenForClusterEvents(d.stopCh)
go d.resources.RunDeploymentHealthLoop(d.stopCh)
go d.resources.RunDeploymentShardSyncLoop(d.stopCh)
}
if config.AllowChaos {
d.chaosMonkey = chaos.NewMonkey(deps.Log, d)

View file

@@ -167,26 +167,6 @@ func (a *actionWaitForMemberUp) checkProgressCluster(ctx context.Context) (bool,
log.Debug().Str("status", string(sh.Status)).Msg("Member set status not yet good")
return false, false, nil
}
if a.action.Group == api.ServerGroupDBServers {
dbs, err := c.Databases(ctx)
if err != nil {
return false, false, err
}
for _, db := range dbs {
inv, err := cluster.DatabaseInventory(ctx, db)
if err != nil {
return false, false, err
}
for _, col := range inv.Collections {
if !col.AllInSync {
log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
return false, false, nil
}
}
}
}
// Wait for the member to become ready from a kubernetes point of view
// otherwise the coordinators may be rotated too fast and thus none of them
// is ready resulting in a short downtime

View file

@@ -93,4 +93,6 @@ type Context interface {
// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
agents api.MemberStatusList, id string, version driver.Version) []string
// GetShardSyncStatus returns true if all shards are in sync
GetShardSyncStatus() bool
}

View file

@@ -209,12 +209,17 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
})
return newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense
}
if newPlan, upgradeNotAllowed, fromVersion, toVersion, fromLicense, toLicense := createRotateOrUpgradePlan(); upgradeNotAllowed {
// Upgrade is needed, but not allowed
context.CreateEvent(k8sutil.NewUpgradeNotAllowedEvent(apiObject, fromVersion, toVersion, fromLicense, toLicense))
} else {
// Use the new plan
plan = newPlan
} else if len(newPlan) > 0 {
if clusterReadyForUpgrade(context) {
// Use the new plan
plan = newPlan
} else {
log.Info().Msg("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready.")
}
}
}
@@ -237,6 +242,15 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
return plan, true
}
// clusterReadyForUpgrade returns true if the cluster is ready for the next update, that is:
// - all shards are in sync
// - all members are ready and fine
func clusterReadyForUpgrade(context PlanBuilderContext) bool {
status, _ := context.GetStatus()
allInSync := context.GetShardSyncStatus()
return allInSync && status.Conditions.IsTrue(api.ConditionTypeReady)
}
// podNeedsUpgrading decides if an upgrade of the pod is needed (to comply with
// the given spec) and if that is allowed.
func podNeedsUpgrading(log zerolog.Logger, p v1.Pod, spec api.DeploymentSpec, images api.ImageInfoList) upgradeDecision {

View file

@@ -46,6 +46,10 @@ type PlanBuilderContext interface {
// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.
GetExpectedPodArguments(apiObject metav1.Object, deplSpec api.DeploymentSpec, group api.ServerGroup,
agents api.MemberStatusList, id string, version driver.Version) []string
// GetShardSyncStatus returns true if all shards are in sync
GetShardSyncStatus() bool
// GetStatus returns the current status of the deployment
GetStatus() (api.DeploymentStatus, int32)
}
// newPlanBuilderContext creates a PlanBuilderContext from the given context

View file

@@ -33,5 +33,5 @@ const (
rotateMemberTimeout = time.Minute * 30
shutdownMemberTimeout = time.Minute * 30
upgradeMemberTimeout = time.Hour * 6
- waitForMemberUpTimeout = time.Minute * 15
+ waitForMemberUpTimeout = time.Minute * 45
)

View file

@@ -28,10 +28,12 @@ import (
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/metrics"
"github.com/rs/zerolog/log"
)
var (
deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result)
deploymentSyncFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_sync_fetches", "Number of times the sync status of shards of the deployment was fetched", metrics.DeploymentName, metrics.Result)
)
// RunDeploymentHealthLoop creates a loop to fetch the health of the deployment.
@@ -88,3 +90,80 @@ func (r *Resources) fetchDeploymentHealth() error {
r.health.timestamp = time.Now()
return nil
}
// RunDeploymentShardSyncLoop creates a loop to fetch the sync status of shards of the deployment.
// The loop ends when the given channel is closed.
func (r *Resources) RunDeploymentShardSyncLoop(stopCh <-chan struct{}) {
log := r.log
deploymentName := r.context.GetAPIObject().GetName()
if r.context.GetSpec().GetMode() != api.DeploymentModeCluster {
// Shard sync status is currently only applicable for clusters
return
}
for {
if err := r.fetchClusterShardSyncState(); err != nil {
log.Debug().Err(err).Msg("Failed to fetch deployment shard sync state")
deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc()
} else {
deploymentSyncFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc()
}
select {
case <-time.After(time.Second * 30):
// Continue
case <-stopCh:
// We're done
return
}
}
}
// fetchClusterShardSyncState performs a single fetch of the cluster inventory and
// checks if all shards are in sync
func (r *Resources) fetchClusterShardSyncState() error {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
defer cancel()
c, err := r.context.GetDatabaseClient(ctx)
if err != nil {
return err
}
cluster, err := c.Cluster(ctx)
if err != nil {
return err
}
dbs, err := c.Databases(ctx)
if err != nil {
return err
}
allInSync := true
dbloop:
for _, db := range dbs {
inv, err := cluster.DatabaseInventory(ctx, db)
if err != nil {
return err
}
for _, col := range inv.Collections {
if !col.AllInSync {
log.Debug().Str("col", col.Parameters.Name).Msg("Not in sync")
allInSync = false
break dbloop
}
}
}
r.shardSync.mutex.Lock()
defer r.shardSync.mutex.Unlock()
r.shardSync.allInSync = allInSync
r.shardSync.timestamp = time.Now()
return nil
}
// GetShardSyncStatus returns true if all shards are in sync
func (r *Resources) GetShardSyncStatus() bool {
r.shardSync.mutex.Lock()
defer r.shardSync.mutex.Unlock()
return r.shardSync.allInSync
}

View file

@@ -40,6 +40,11 @@ type Resources struct {
timestamp time.Time // Timestamp of last fetch of cluster health
mutex sync.Mutex // Mutex guarding fields in this struct
}
shardSync struct {
allInSync bool // True when all shards of all databases are in sync
timestamp time.Time // Timestamp of last fetch of shard sync status
mutex sync.Mutex // Mutex guarding fields in this struct
}
}
// NewResources creates a new Resources service, used to