1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Bugfix] Check serving members (#1075)

This commit is contained in:
Tomasz Mielech 2022-07-27 15:17:36 +02:00 committed by GitHub
parent a97218af90
commit 5166c58bc7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 63 deletions

View file

@ -8,6 +8,7 @@
- (Feature) Add startup probe for coordinators
- (Feature) Use only connections for healthy members
- (Feature) Set condition to shrink agent volume size
- (Bugfix) Check serving servers
## [1.2.15](https://github.com/arangodb/kube-arangodb/tree/1.2.15) (2022-07-20)
- (Bugfix) Ensure pod names not too long

View file

@ -116,52 +116,32 @@ func (s *stateInspector) RefreshState(ctx context.Context, members api.Deploymen
var h Health
results := make([]State, len(members))
clients := make([]driver.Client, 0, 3)
mode := s.deployment.GetMode()
servingGroup := mode.ServingGroup()
var client driver.Client
members.ForEach(func(id int) {
ctxChild, cancel := globals.GetGlobalTimeouts().ArangoDCheck().WithTimeout(ctx)
defer cancel()
if members[id].Group.IsArangosync() {
results[id] = s.fetchArangosyncMemberState(ctxChild, members[id])
return
}
state := s.fetchServerMemberState(ctxChild, members[id])
if state.IsReachable() && members[id].Group == servingGroup &&
members[id].Member.Conditions.IsTrue(api.ConditionTypeServing) &&
!members[id].Member.Conditions.IsTrue(api.ConditionTypeTerminating) {
// Create slice with reachable clients (it does not mean that they are healthy).
// In the cluster mode it will be checked later which client is healthy.
if mode == api.DeploymentModeActiveFailover {
globals.GetGlobalTimeouts().ArangoDCheck().RunWithTimeout(ctx, func(ctxChild context.Context) error {
if found, _ := arangod.IsServerAvailable(ctxChild, state.client); found {
// Don't check error.
// If error occurs then `clients` slice will be empty and the error `ArangoDB is not reachable`
// will be returned.
clients = append(clients, state.client)
}
return nil
})
} else {
clients = append(clients, state.client)
} else {
results[id] = s.fetchServerMemberState(ctxChild, members[id], servingGroup)
if results[id].IsServing() {
client = results[id].client
}
cs.Version = state.Version
}
results[id] = state
})
if len(clients) > 0 && mode.IsCluster() {
// Get random reachable client.
cli := clients[rand.Intn(len(clients))]
// Clean all clients and rebuild it only with healthy clients.
clients = clients[:0]
if client == nil {
cs.NotReachableErr = errors.New("ArangoDB is not reachable")
} else if !mode.IsCluster() {
// In non-cluster mode take first client which serves.
cs.client = client
} else {
// Fetch health only in cluster mode.
h.Error = globals.GetGlobalTimeouts().ArangoDCheck().RunWithTimeout(ctx, func(ctxChild context.Context) error {
if cluster, err := cli.Cluster(ctxChild); err != nil {
if cluster, err := client.Cluster(ctxChild); err != nil {
return err
} else if health, err := cluster.Health(ctxChild); err != nil {
return err
@ -169,50 +149,37 @@ func (s *stateInspector) RefreshState(ctx context.Context, members api.Deploymen
h.Members = health.Health
}
// Find ArangoDB (not ArangoSync) members which are not healthy and mark them accordingly.
// Mark members if they are healthy.
for i, m := range members {
health, ok := h.Members[driver.ServerID(m.Member.ID)]
if ok && health.SyncStatus == driver.ServerSyncStatusServing && health.Status == driver.ServerStatusGood {
if m.Group == servingGroup {
clients = append(clients, results[i].client)
}
if !results[i].IsReachable() {
continue
}
if results[i].NotReachableErr != nil {
if ok {
results[i].NotReachableErr = errors.Newf("member is not healthy "+
"because syncStatus is %s and status is %s", health.SyncStatus, health.Status)
} else {
results[i].NotReachableErr = errors.Newf("member is unknown in ArangoDB healthy status")
if members[i].Group.IsArangosync() {
// ArangoSync is considered as healthy when it is possible to fetch version.
results[i].IsClusterHealthy = true
continue
}
if v, ok := h.Members[driver.ServerID(m.Member.ID)]; ok {
results[i].IsClusterHealthy = v.Status == driver.ServerStatusGood
if results[i].IsServing() && v.SyncStatus == driver.ServerSyncStatusServing {
if cs.client == nil || rand.Intn(100) > 50 {
// Set client from nil or take next client with 50% probability.
cs.client = results[i].client
cs.Version = results[i].Version
}
}
}
}
return nil
})
if h.Error != nil {
for i := range results {
if results[i].NotReachableErr != nil {
// A member already encountered an error.
continue
}
if results[i].syncClient != nil {
// ArangoSync Member is considered as healthy when version can be fetched.
continue
}
results[i].NotReachableErr = errors.Wrapf(h.Error, "cluster healthy is unknown")
}
}
}
if len(clients) > 0 {
cs.client = clients[rand.Intn(len(clients))]
} else {
if cs.client == nil {
cs.NotReachableErr = errors.New("ArangoDB is not reachable")
}
current := map[string]State{}
for id := range members {
@ -249,7 +216,9 @@ func (s *stateInspector) fetchArangosyncMemberState(ctx context.Context, m api.D
return state
}
func (s *stateInspector) fetchServerMemberState(ctx context.Context, m api.DeploymentStatusMemberElement) State {
func (s *stateInspector) fetchServerMemberState(ctx context.Context, m api.DeploymentStatusMemberElement,
servingGroup api.ServerGroup) State {
// by default, it is not serving. It will be changed if it serves.
var state State
c, err := s.deployment.GetServerClient(ctx, m.Group, m.Member.ID)
if err != nil {
@ -259,10 +228,20 @@ func (s *stateInspector) fetchServerMemberState(ctx context.Context, m api.Deplo
if v, err := c.Version(ctx); err != nil {
state.NotReachableErr = err
return state
} else {
state.Version = v
state.client = c
}
if m.Group == servingGroup {
// Server belongs to group of servers which should serve requests.
globals.GetGlobalTimeouts().ArangoDCheck().RunWithTimeout(ctx, func(ctxChild context.Context) error {
state.serving, _ = arangod.IsServerAvailable(ctxChild, state.client)
return nil
})
}
return state
}
@ -324,12 +303,16 @@ type Health struct {
type State struct {
// NotReachableErr set to non-nil if a member is not reachable.
NotReachableErr error
// IsClusterHealthy describes if member is healthy in a cluster. It is relevant only in cluster mode.
IsClusterHealthy bool
// Version of this specific member.
Version driver.VersionInfo
// client to this specific ArangoDB member.
client driver.Client
// client to this specific ArangoSync member.
syncClient client.API
// serving describes if a member can serve requests.
serving bool
}
// GetDatabaseClient returns client to the database.
@ -349,6 +332,11 @@ func (s State) IsReachable() bool {
return s.NotReachableErr == nil
}
// IsServing returns true when server can serve requests.
func (s State) IsServing() bool {
return s.serving
}
func (s State) WrapLogger(event *zerolog.Event) *zerolog.Event {
return event.Bool("reachable", s.IsReachable()).AnErr("reachableError", s.NotReachableErr)
}