diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a9ab1e17..ca4d30f33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,13 @@ # Change Log ## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A) - (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status - (Feature) Early connections support - (Bugfix) Fix and document action timeouts - (Feature) Propagate sidecars' ports to a member's service - (Debug Package) Initial commit - (Feature) Detach PVC from deployment in Ordered indexing method +- (Feature) OPS Alerts ## [1.2.16](https://github.com/arangodb/kube-arangodb/tree/1.2.16) (2022-09-14) - (Feature) Add ArangoDeployment ServerGroupStatus diff --git a/docs/generated/metrics/README.md b/docs/generated/metrics/README.md index 63c3cc205..58da2bc91 100644 --- a/docs/generated/metrics/README.md +++ b/docs/generated/metrics/README.md @@ -14,6 +14,7 @@ | [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable | | [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present | | [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving | +| [arangodb_operator_engine_ops_alerts](./arangodb_operator_engine_ops_alerts.md) | arangodb_operator | engine | Counter | Counter for actions which requires ops attention | | [arangodb_operator_engine_panics_recovered](./arangodb_operator_engine_panics_recovered.md) | arangodb_operator | engine | Counter | Number of Panics recovered inside Operator reconciliation loop | | 
[arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) | | [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled | diff --git a/docs/generated/metrics/arangodb_operator_engine_ops_alerts.md b/docs/generated/metrics/arangodb_operator_engine_ops_alerts.md new file mode 100644 index 000000000..8901cace4 --- /dev/null +++ b/docs/generated/metrics/arangodb_operator_engine_ops_alerts.md @@ -0,0 +1,19 @@ +# arangodb_operator_engine_ops_alerts (Counter) + +## Description + +Counter for actions which requires ops attention + +## Labels + +| Label | Description | +|:---------:|:---------------------| +| namespace | Deployment Namespace | +| name | Deployment Name | + + +## Alerting + +| Priority | Query | Description | +|:--------:|:--------------------------------------------------:|:--------------------------------------------| +| Warning | irate(arangodb_operator_engine_ops_alerts[1m]) > 1 | Trigger an alert if OPS attention is needed | diff --git a/internal/metrics.yaml b/internal/metrics.yaml index d2904e754..0a992c4b8 100644 --- a/internal/metrics.yaml +++ b/internal/metrics.yaml @@ -221,4 +221,18 @@ namespaces: labels: - key: section description: "Panic Section" + ops_alerts: + shortDescription: "Counter for actions which requires ops attention" + description: "Counter for actions which requires ops attention" + type: "Counter" + labels: + - key: namespace + description: "Deployment Namespace" + - key: name + description: "Deployment Name" + alertingRules: + - priority: Warning + query: irate(arangodb_operator_engine_ops_alerts[1m]) > 1 + description: "Trigger an alert if OPS attention is needed" + \ No newline at end of file diff --git a/pkg/deployment/context_impl.go 
b/pkg/deployment/context_impl.go index 7b1a0daea..344262077 100644 --- a/pkg/deployment/context_impl.go +++ b/pkg/deployment/context_impl.go @@ -23,6 +23,7 @@ package deployment import ( "context" "crypto/tls" + "fmt" "net" nhttp "net/http" "strconv" @@ -651,3 +652,13 @@ func (d *Deployment) GenerateMemberEndpoint(group api.ServerGroup, member api.Me func (d *Deployment) ACS() sutil.ACS { return d.acs } + +func (d *Deployment) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) { + if d == nil { + return + } + + d.metrics.ArangodbOperatorEngineOpsAlerts++ + + d.CreateEvent(k8sutil.NewOperatorEngineOpsAlertEvent(fmt.Sprintf(message, args...), d.GetAPIObject())) +} diff --git a/pkg/deployment/metrics.go b/pkg/deployment/metrics.go index d3d241905..39d4eea8d 100644 --- a/pkg/deployment/metrics.go +++ b/pkg/deployment/metrics.go @@ -36,6 +36,8 @@ type Metrics struct { DeploymentValidationErrors, DeploymentImmutableErrors, StatusRestores uint64 } + ArangodbOperatorEngineOpsAlerts int + Deployment struct { Accepted, UpToDate bool } diff --git a/pkg/deployment/reconcile/action_cleanout_member.go b/pkg/deployment/reconcile/action_cleanout_member.go index e93cb6d4b..d89a0336f 100644 --- a/pkg/deployment/reconcile/action_cleanout_member.go +++ b/pkg/deployment/reconcile/action_cleanout_member.go @@ -141,6 +141,12 @@ func (a *actionCleanOutMember) CheckProgress(ctx context.Context) (bool, bool, e } } + if cache.PlanServers().Contains(agency.Server(m.ID)) { + // Something is wrong, servers is CleanedOut but still exists in the Plan + a.actionCtx.CreateOperatorEngineOpsAlertEvent("DBServer %s still exists in Plan after CleanOut", m.ID) + return false, true, nil + } + // Cleanout completed return true, false, nil } diff --git a/pkg/deployment/reconcile/action_context.go b/pkg/deployment/reconcile/action_context.go index 62d0eb748..0e4aa8315 100644 --- a/pkg/deployment/reconcile/action_context.go +++ b/pkg/deployment/reconcile/action_context.go @@ -51,6 +51,7 
@@ type ActionContext interface { reconciler.ArangoAgencyGet reconciler.DeploymentInfoGetter reconciler.DeploymentDatabaseClient + reconciler.KubernetesEventGenerator member.StateInspectorGetter @@ -130,6 +131,10 @@ type actionContext struct { metrics *Metrics } +func (ac *actionContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) { + ac.context.CreateOperatorEngineOpsAlertEvent(message, args...) +} + func (ac *actionContext) Metrics() *Metrics { return ac.metrics } diff --git a/pkg/deployment/reconcile/plan_builder_test.go b/pkg/deployment/reconcile/plan_builder_test.go index 0d9367fb7..866bbc682 100644 --- a/pkg/deployment/reconcile/plan_builder_test.go +++ b/pkg/deployment/reconcile/plan_builder_test.go @@ -86,6 +86,11 @@ type testContext struct { state member.StateInspector } +func (c *testContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) { + //TODO implement me + panic("implement me") +} + func (c *testContext) GetAgencyHealth() (agencyCache.Health, bool) { //TODO implement me panic("implement me") diff --git a/pkg/deployment/reconciler/context.go b/pkg/deployment/reconciler/context.go index 6f1f0b600..dda2b13cf 100644 --- a/pkg/deployment/reconciler/context.go +++ b/pkg/deployment/reconciler/context.go @@ -138,6 +138,8 @@ type KubernetesEventGenerator interface { // CreateEvent creates a given event. // On error, the error is logged. CreateEvent(evt *k8sutil.Event) + + CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) } // DeploymentClient provides functionalities to get deployment's clients. 
diff --git a/pkg/generated/metric_descriptions/arangodb_operator_engine_ops_alerts.go b/pkg/generated/metric_descriptions/arangodb_operator_engine_ops_alerts.go new file mode 100644 index 000000000..911430ee5 --- /dev/null +++ b/pkg/generated/metric_descriptions/arangodb_operator_engine_ops_alerts.go @@ -0,0 +1,39 @@ +// +// DISCLAIMER +// +// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package metric_descriptions + +import "github.com/arangodb/kube-arangodb/pkg/util/metrics" + +var ( + arangodbOperatorEngineOpsAlerts = metrics.NewDescription("arangodb_operator_engine_ops_alerts", "Counter for actions which requires ops attention", []string{`namespace`, `name`}, nil) +) + +func init() { + registerDescription(arangodbOperatorEngineOpsAlerts) +} + +func ArangodbOperatorEngineOpsAlerts() metrics.Description { + return arangodbOperatorEngineOpsAlerts +} + +func ArangodbOperatorEngineOpsAlertsCounter(value float64, namespace string, name string) metrics.Metric { + return ArangodbOperatorEngineOpsAlerts().Gauge(value, namespace, name) +} diff --git a/pkg/util/k8sutil/events.go b/pkg/util/k8sutil/events.go index 21b7f63c6..561048f7c 100644 --- a/pkg/util/k8sutil/events.go +++ b/pkg/util/k8sutil/events.go @@ -274,3 +274,12 @@ func newDeploymentEvent(apiObject runtime.Object) *Event { InvolvedObject: apiObject, } } + +// 
NewOperatorEngineOpsAlertEvent creates an event of type OperatorEngineOpsAlert. +func NewOperatorEngineOpsAlertEvent(reason string, apiObject APIObject) *Event { + event := newDeploymentEvent(apiObject) + event.Type = core.EventTypeWarning + event.Reason = "OperatorEngineOpsAlert" + event.Message = fmt.Sprintf("Event OperatorEngineOpsAlert raised, investigation needed: %s", reason) + return event +}