1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] OPS Alerts (#1119)

This commit is contained in:
Adam Janikowski 2022-09-22 11:07:39 +02:00 committed by GitHub
parent 7a416e524d
commit 987cefeab5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 115 additions and 1 deletions

View file

@ -1,12 +1,13 @@
# Change Log
## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
- (Feature) Early connections support
- (Bugfix) Fix and document action timeouts
- (Feature) Propagate sidecars' ports to a member's service
- (Debug Package) Initial commit
- (Feature) Detach PVC from deployment in Ordered indexing method
- (Feature) OPS Alerts
## [1.2.16](https://github.com/arangodb/kube-arangodb/tree/1.2.16) (2022-09-14)
- (Feature) Add ArangoDeployment ServerGroupStatus

View file

@ -14,6 +14,7 @@
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
| [arangodb_operator_engine_ops_alerts](./arangodb_operator_engine_ops_alerts.md) | arangodb_operator | engine | Counter | Counter for actions which requires ops attention |
| [arangodb_operator_engine_panics_recovered](./arangodb_operator_engine_panics_recovered.md) | arangodb_operator | engine | Counter | Number of Panics recovered inside Operator reconciliation loop |
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |

View file

@ -0,0 +1,19 @@
# arangodb_operator_engine_ops_alerts (Counter)
## Description
Counter for actions which requires ops attention
## Labels
| Label | Description |
|:---------:|:---------------------|
| namespace | Deployment Namespace |
| name | Deployment Name |
## Alerting
| Priority | Query | Description |
|:--------:|:--------------------------------------------------:|:--------------------------------------------|
| Warning | irate(arangodb_operator_engine_ops_alerts[1m]) > 1 | Trigger an alert if OPS attention is needed |

View file

@ -221,4 +221,18 @@ namespaces:
labels:
- key: section
description: "Panic Section"
ops_alerts:
shortDescription: "Counter for actions which requires ops attention"
description: "Counter for actions which requires ops attention"
type: "Counter"
labels:
- key: namespace
description: "Deployment Namespace"
- key: name
description: "Deployment Name"
alertingRules:
- priority: Warning
query: irate(arangodb_operator_engine_ops_alerts[1m]) > 1
description: "Trigger an alert if OPS attention is needed"

View file

@ -23,6 +23,7 @@ package deployment
import (
"context"
"crypto/tls"
"fmt"
"net"
nhttp "net/http"
"strconv"
@ -651,3 +652,13 @@ func (d *Deployment) GenerateMemberEndpoint(group api.ServerGroup, member api.Me
// ACS returns the sutil.ACS value held in the deployment's acs field.
func (d *Deployment) ACS() sutil.ACS {
return d.acs
}
// CreateOperatorEngineOpsAlertEvent records an OPS alert for this deployment.
//
// It increments the ArangodbOperatorEngineOpsAlerts metric and then creates an
// OperatorEngineOpsAlert event whose text is message formatted with args
// (fmt.Sprintf semantics). Calling it on a nil *Deployment is a no-op.
func (d *Deployment) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
	if d == nil {
		return
	}

	// Count the alert first, then publish the matching event.
	d.metrics.ArangodbOperatorEngineOpsAlerts++

	text := fmt.Sprintf(message, args...)
	alert := k8sutil.NewOperatorEngineOpsAlertEvent(text, d.GetAPIObject())
	d.CreateEvent(alert)
}

View file

@ -36,6 +36,8 @@ type Metrics struct {
DeploymentValidationErrors, DeploymentImmutableErrors, StatusRestores uint64
}
ArangodbOperatorEngineOpsAlerts int
Deployment struct {
Accepted, UpToDate bool
}

View file

@ -141,6 +141,12 @@ func (a *actionCleanOutMember) CheckProgress(ctx context.Context) (bool, bool, e
}
}
if cache.PlanServers().Contains(agency.Server(m.ID)) {
// Something is wrong, servers is CleanedOut but still exists in the Plan
a.actionCtx.CreateOperatorEngineOpsAlertEvent("DBServer %s still exists in Plan after CleanOut", m.ID)
return false, true, nil
}
// Cleanout completed
return true, false, nil
}

View file

@ -51,6 +51,7 @@ type ActionContext interface {
reconciler.ArangoAgencyGet
reconciler.DeploymentInfoGetter
reconciler.DeploymentDatabaseClient
reconciler.KubernetesEventGenerator
member.StateInspectorGetter
@ -130,6 +131,10 @@ type actionContext struct {
metrics *Metrics
}
// CreateOperatorEngineOpsAlertEvent forwards message and args unchanged to the
// method of the same name on the wrapped ac.context.
func (ac *actionContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
ac.context.CreateOperatorEngineOpsAlertEvent(message, args...)
}
// Metrics returns the *Metrics stored in the action context's metrics field.
func (ac *actionContext) Metrics() *Metrics {
return ac.metrics
}

View file

@ -86,6 +86,11 @@ type testContext struct {
state member.StateInspector
}
// CreateOperatorEngineOpsAlertEvent is an unimplemented stub on testContext:
// it ignores message and args and always panics with "implement me".
func (c *testContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
//TODO implement me
panic("implement me")
}
func (c *testContext) GetAgencyHealth() (agencyCache.Health, bool) {
//TODO implement me
panic("implement me")

View file

@ -138,6 +138,8 @@ type KubernetesEventGenerator interface {
// CreateEvent creates a given event.
// On error, the error is logged.
CreateEvent(evt *k8sutil.Event)
CreateOperatorEngineOpsAlertEvent(message string, args ...interface{})
}
// DeploymentClient provides functionalities to get deployment's clients.

View file

@ -0,0 +1,39 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package metric_descriptions
import "github.com/arangodb/kube-arangodb/pkg/util/metrics"
// arangodbOperatorEngineOpsAlerts is the description of the
// arangodb_operator_engine_ops_alerts metric, labelled by namespace and name.
var arangodbOperatorEngineOpsAlerts = metrics.NewDescription(
	"arangodb_operator_engine_ops_alerts",
	"Counter for actions which requires ops attention",
	[]string{`namespace`, `name`},
	nil,
)
// init passes the arangodb_operator_engine_ops_alerts description to
// registerDescription when the package is loaded.
func init() {
registerDescription(arangodbOperatorEngineOpsAlerts)
}
// ArangodbOperatorEngineOpsAlerts returns the package-level description of the
// arangodb_operator_engine_ops_alerts metric.
func ArangodbOperatorEngineOpsAlerts() metrics.Description {
return arangodbOperatorEngineOpsAlerts
}
// ArangodbOperatorEngineOpsAlertsCounter builds a metrics.Metric for the
// arangodb_operator_engine_ops_alerts description with the given value and the
// namespace and name label values, in that order.
//
// NOTE(review): the function is named "...Counter" but the metric is produced
// via Description.Gauge — confirm this is intentional.
func ArangodbOperatorEngineOpsAlertsCounter(value float64, namespace string, name string) metrics.Metric {
return ArangodbOperatorEngineOpsAlerts().Gauge(value, namespace, name)
}

View file

@ -274,3 +274,12 @@ func newDeploymentEvent(apiObject runtime.Object) *Event {
InvolvedObject: apiObject,
}
}
// NewOperatorEngineOpsAlertEvent creates an event of type OperatorEngineOpsAlert.
//
// The event is built by newDeploymentEvent for apiObject, marked as a warning,
// and carries reason embedded in its message text.
func NewOperatorEngineOpsAlertEvent(reason string, apiObject APIObject) *Event {
	alert := newDeploymentEvent(apiObject)

	alert.Reason = "OperatorEngineOpsAlert"
	alert.Type = core.EventTypeWarning
	alert.Message = fmt.Sprintf("Event OperatorEngineOpsAlert raised, investigation needed: %s", reason)

	return alert
}