1
0
Fork 0
mirror of https://github.com/prometheus-operator/prometheus-operator.git synced 2025-04-21 11:48:53 +00:00

pkg: add prometheus_operator_reconcile_operations_total metric

* pkg: add prometheus_operator_reconcile_operations_total metric

We already have the `prometheus_operator_reconcile_errors_total` metric
to track the number of reconciliation attempts that failed, but we lack
the total number of attempts, which makes it harder to alert on failures.
With this change, we can compute the ratio of reconciliations that failed.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>

* Update alert definition with new metric
This commit is contained in:
Simon Pasquier 2020-08-19 16:41:02 +02:00 committed by GitHub
parent c75a7e27c8
commit e64718cb6b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 20 additions and 6 deletions
example/mixin
jsonnet/mixin/alerts
pkg
alertmanager
operator
prometheus
thanos

View file

@ -13,11 +13,11 @@ groups:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
}} Namespace.
description: '{{ $value | humanizePercentage }} of reconciling operations failed
for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
summary: Errors while reconciling controller.
# Ratio of failed to attempted reconcile operations over 5 minutes, per
# controller and namespace; the alert condition is a ratio above 0.1 (10%).
expr: |
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
for: 10m
labels:
severity: warning

View file

@ -21,13 +21,13 @@
{
alert: 'PrometheusOperatorReconcileErrors',
// Ratio of failed to attempted reconcile operations over 5 minutes, per
// controller and namespace; the alert condition is a ratio above 0.1 (10%).
expr: |||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
summary: 'Errors while reconciling controller.',
},
'for': '10m',

View file

@ -282,6 +282,7 @@ func (c *Operator) processNextWorkItem() bool {
}
defer c.queue.Done(key)
c.metrics.ReconcileCounter().Inc()
err := c.sync(key.(string))
if err == nil {
c.queue.Forget(key)

View file

@ -31,6 +31,7 @@ type Metrics struct {
listFailedCounter prometheus.Counter
watchCounter prometheus.Counter
watchFailedCounter prometheus.Counter
reconcileCounter prometheus.Counter
reconcileErrorsCounter prometheus.Counter
stsDeleteCreateCounter prometheus.Counter
// triggerByCounter is a set of counters keeping track of the amount
@ -46,9 +47,13 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
reg := prometheus.WrapRegistererWith(prometheus.Labels{"controller": name}, r)
m := Metrics{
reg: reg,
reconcileCounter: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_operator_reconcile_operations_total",
Help: "Total number of reconcile operations",
}),
reconcileErrorsCounter: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_operator_reconcile_errors_total",
Help: "Number of errors that occurred while reconciling the statefulset",
Help: "Number of errors that occurred during reconcile operations",
}),
triggerByCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "prometheus_operator_triggered_total",
@ -77,6 +82,7 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
}),
}
m.reg.MustRegister(
m.reconcileCounter,
m.reconcileErrorsCounter,
m.triggerByCounter,
m.stsDeleteCreateCounter,
@ -88,6 +94,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
return &m
}
// ReconcileCounter returns the counter backing the
// prometheus_operator_reconcile_operations_total metric, which tracks
// attempted reconciliations. The controllers' work loops increment it before
// calling sync, so it counts every attempt regardless of the outcome.
func (m *Metrics) ReconcileCounter() prometheus.Counter {
return m.reconcileCounter
}
// ReconcileErrorsCounter returns a counter to track reconciliation errors.
func (m *Metrics) ReconcileErrorsCounter() prometheus.Counter {
return m.reconcileErrorsCounter

View file

@ -1093,6 +1093,7 @@ func (c *Operator) processNextWorkItem() bool {
}
defer c.queue.Done(key)
c.metrics.ReconcileCounter().Inc()
err := c.sync(key.(string))
if err == nil {
c.queue.Forget(key)

View file

@ -554,6 +554,7 @@ func (o *Operator) processNextWorkItem() bool {
}
defer o.queue.Done(key)
o.metrics.ReconcileCounter().Inc()
err := o.sync(key.(string))
if err == nil {
o.queue.Forget(key)