mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
[Feature] OPS Alerts (#1119)
This commit is contained in:
parent
7a416e524d
commit
987cefeab5
12 changed files with 115 additions and 1 deletions
|
@ -1,12 +1,13 @@
|
|||
# Change Log
|
||||
|
||||
## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
|
||||
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
|
||||
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
|
||||
- (Feature) Early connections support
|
||||
- (Bugfix) Fix and document action timeouts
|
||||
- (Feature) Propagate sidecars' ports to a member's service
|
||||
- (Debug Package) Initial commit
|
||||
- (Feature) Detach PVC from deployment in Ordered indexing method
|
||||
- (Feature) OPS Alerts
|
||||
|
||||
## [1.2.16](https://github.com/arangodb/kube-arangodb/tree/1.2.16) (2022-09-14)
|
||||
- (Feature) Add ArangoDeployment ServerGroupStatus
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
|
||||
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
|
||||
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
|
||||
| [arangodb_operator_engine_ops_alerts](./arangodb_operator_engine_ops_alerts.md) | arangodb_operator | engine | Counter | Counter for actions which require ops attention |
|
||||
| [arangodb_operator_engine_panics_recovered](./arangodb_operator_engine_panics_recovered.md) | arangodb_operator | engine | Counter | Number of Panics recovered inside Operator reconciliation loop |
|
||||
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
|
||||
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
# arangodb_operator_engine_ops_alerts (Counter)
|
||||
|
||||
## Description
|
||||
|
||||
Counter for actions which require ops attention
|
||||
|
||||
## Labels
|
||||
|
||||
| Label | Description |
|
||||
|:---------:|:---------------------|
|
||||
| namespace | Deployment Namespace |
|
||||
| name | Deployment Name |
|
||||
|
||||
|
||||
## Alerting
|
||||
|
||||
| Priority | Query | Description |
|
||||
|:--------:|:--------------------------------------------------:|:--------------------------------------------|
|
||||
| Warning | irate(arangodb_operator_engine_ops_alerts[1m]) > 1 | Trigger an alert if OPS attention is needed |
|
|
@ -221,4 +221,18 @@ namespaces:
|
|||
labels:
|
||||
- key: section
|
||||
description: "Panic Section"
|
||||
ops_alerts:
|
||||
shortDescription: "Counter for actions which requires ops attention"
|
||||
description: "Counter for actions which requires ops attention"
|
||||
type: "Counter"
|
||||
labels:
|
||||
- key: namespace
|
||||
description: "Deployment Namespace"
|
||||
- key: name
|
||||
description: "Deployment Name"
|
||||
alertingRules:
|
||||
- priority: Warning
|
||||
query: irate(arangodb_operator_engine_ops_alerts[1m]) > 1
|
||||
description: "Trigger an alert if OPS attention is needed"
|
||||
|
||||
|
|
@ -23,6 +23,7 @@ package deployment
|
|||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"net"
|
||||
nhttp "net/http"
|
||||
"strconv"
|
||||
|
@ -651,3 +652,13 @@ func (d *Deployment) GenerateMemberEndpoint(group api.ServerGroup, member api.Me
|
|||
func (d *Deployment) ACS() sutil.ACS {
|
||||
return d.acs
|
||||
}
|
||||
|
||||
func (d *Deployment) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
|
||||
if d == nil {
|
||||
return
|
||||
}
|
||||
|
||||
d.metrics.ArangodbOperatorEngineOpsAlerts++
|
||||
|
||||
d.CreateEvent(k8sutil.NewOperatorEngineOpsAlertEvent(fmt.Sprintf(message, args...), d.GetAPIObject()))
|
||||
}
|
||||
|
|
|
@ -36,6 +36,8 @@ type Metrics struct {
|
|||
DeploymentValidationErrors, DeploymentImmutableErrors, StatusRestores uint64
|
||||
}
|
||||
|
||||
ArangodbOperatorEngineOpsAlerts int
|
||||
|
||||
Deployment struct {
|
||||
Accepted, UpToDate bool
|
||||
}
|
||||
|
|
|
@ -141,6 +141,12 @@ func (a *actionCleanOutMember) CheckProgress(ctx context.Context) (bool, bool, e
|
|||
}
|
||||
}
|
||||
|
||||
if cache.PlanServers().Contains(agency.Server(m.ID)) {
|
||||
// Something is wrong: the server is CleanedOut but still exists in the Plan
|
||||
a.actionCtx.CreateOperatorEngineOpsAlertEvent("DBServer %s still exists in Plan after CleanOut", m.ID)
|
||||
return false, true, nil
|
||||
}
|
||||
|
||||
// Cleanout completed
|
||||
return true, false, nil
|
||||
}
|
||||
|
|
|
@ -51,6 +51,7 @@ type ActionContext interface {
|
|||
reconciler.ArangoAgencyGet
|
||||
reconciler.DeploymentInfoGetter
|
||||
reconciler.DeploymentDatabaseClient
|
||||
reconciler.KubernetesEventGenerator
|
||||
|
||||
member.StateInspectorGetter
|
||||
|
||||
|
@ -130,6 +131,10 @@ type actionContext struct {
|
|||
metrics *Metrics
|
||||
}
|
||||
|
||||
func (ac *actionContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
|
||||
ac.context.CreateOperatorEngineOpsAlertEvent(message, args...)
|
||||
}
|
||||
|
||||
func (ac *actionContext) Metrics() *Metrics {
|
||||
return ac.metrics
|
||||
}
|
||||
|
|
|
@ -86,6 +86,11 @@ type testContext struct {
|
|||
state member.StateInspector
|
||||
}
|
||||
|
||||
func (c *testContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (c *testContext) GetAgencyHealth() (agencyCache.Health, bool) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
|
|
|
@ -138,6 +138,8 @@ type KubernetesEventGenerator interface {
|
|||
// CreateEvent creates a given event.
|
||||
// On error, the error is logged.
|
||||
CreateEvent(evt *k8sutil.Event)
|
||||
|
||||
CreateOperatorEngineOpsAlertEvent(message string, args ...interface{})
|
||||
}
|
||||
|
||||
// DeploymentClient provides functionalities to get deployment's clients.
|
||||
|
|
39
pkg/generated/metric_descriptions/arangodb_operator_engine_ops_alerts.go
generated
Normal file
39
pkg/generated/metric_descriptions/arangodb_operator_engine_ops_alerts.go
generated
Normal file
|
@ -0,0 +1,39 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package metric_descriptions
|
||||
|
||||
import "github.com/arangodb/kube-arangodb/pkg/util/metrics"
|
||||
|
||||
var (
|
||||
arangodbOperatorEngineOpsAlerts = metrics.NewDescription("arangodb_operator_engine_ops_alerts", "Counter for actions which requires ops attention", []string{`namespace`, `name`}, nil)
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerDescription(arangodbOperatorEngineOpsAlerts)
|
||||
}
|
||||
|
||||
func ArangodbOperatorEngineOpsAlerts() metrics.Description {
|
||||
return arangodbOperatorEngineOpsAlerts
|
||||
}
|
||||
|
||||
func ArangodbOperatorEngineOpsAlertsCounter(value float64, namespace string, name string) metrics.Metric {
|
||||
return ArangodbOperatorEngineOpsAlerts().Gauge(value, namespace, name)
|
||||
}
|
|
@ -274,3 +274,12 @@ func newDeploymentEvent(apiObject runtime.Object) *Event {
|
|||
InvolvedObject: apiObject,
|
||||
}
|
||||
}
|
||||
|
||||
// NewOperatorEngineOpsAlertEvent creates an even of type OperatorEngineOpsAlert.
|
||||
func NewOperatorEngineOpsAlertEvent(reason string, apiObject APIObject) *Event {
|
||||
event := newDeploymentEvent(apiObject)
|
||||
event.Type = core.EventTypeWarning
|
||||
event.Reason = "OperatorEngineOpsAlert"
|
||||
event.Message = fmt.Sprintf("Event OperatorEngineOpsAlert raised, investigation needed: %s", reason)
|
||||
return event
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue