mirror of
https://github.com/prometheus-operator/prometheus-operator.git
synced 2025-04-21 11:48:53 +00:00
pkg: add prometheus_operator_reconcile_operations_total metric (#3415)
* pkg: add prometheus_operator_reconcile_operations_total metric We already have the `prometheus_operator_reconcile_errors_total` metric to track the number of reconciliation attempts that failed but we miss the number of attempts which makes it harder to alert on it. With this change, we can compute the ratio of reconciliations that failed. Signed-off-by: Simon Pasquier <spasquie@redhat.com> * Update alert definition with new metric
This commit is contained in:
parent
c75a7e27c8
commit
e64718cb6b
6 changed files with 20 additions and 6 deletions
example/mixin
jsonnet/mixin/alerts
pkg
|
@ -13,11 +13,11 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
description: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
|
||||
}} Namespace.
|
||||
description: '{{ $value | humanizePercentage }} of reconciling operations failed
|
||||
for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
|
||||
summary: Errors while reconciling controller.
|
||||
expr: |
|
||||
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
@ -21,13 +21,13 @@
|
|||
{
|
||||
alert: 'PrometheusOperatorReconcileErrors',
|
||||
expr: |||
|
||||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
|
||||
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
|
||||
summary: 'Errors while reconciling controller.',
|
||||
},
|
||||
'for': '10m',
|
||||
|
|
|
@ -282,6 +282,7 @@ func (c *Operator) processNextWorkItem() bool {
|
|||
}
|
||||
defer c.queue.Done(key)
|
||||
|
||||
c.metrics.ReconcileCounter().Inc()
|
||||
err := c.sync(key.(string))
|
||||
if err == nil {
|
||||
c.queue.Forget(key)
|
||||
|
|
|
@ -31,6 +31,7 @@ type Metrics struct {
|
|||
listFailedCounter prometheus.Counter
|
||||
watchCounter prometheus.Counter
|
||||
watchFailedCounter prometheus.Counter
|
||||
reconcileCounter prometheus.Counter
|
||||
reconcileErrorsCounter prometheus.Counter
|
||||
stsDeleteCreateCounter prometheus.Counter
|
||||
// triggerByCounter is a set of counters keeping track of the amount
|
||||
|
@ -46,9 +47,13 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
|
|||
reg := prometheus.WrapRegistererWith(prometheus.Labels{"controller": name}, r)
|
||||
m := Metrics{
|
||||
reg: reg,
|
||||
reconcileCounter: prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "prometheus_operator_reconcile_operations_total",
|
||||
Help: "Total number of reconcile operations",
|
||||
}),
|
||||
reconcileErrorsCounter: prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "prometheus_operator_reconcile_errors_total",
|
||||
Help: "Number of errors that occurred while reconciling the statefulset",
|
||||
Help: "Number of errors that occurred during reconcile operations",
|
||||
}),
|
||||
triggerByCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "prometheus_operator_triggered_total",
|
||||
|
@ -77,6 +82,7 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
|
|||
}),
|
||||
}
|
||||
m.reg.MustRegister(
|
||||
m.reconcileCounter,
|
||||
m.reconcileErrorsCounter,
|
||||
m.triggerByCounter,
|
||||
m.stsDeleteCreateCounter,
|
||||
|
@ -88,6 +94,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
|
|||
return &m
|
||||
}
|
||||
|
||||
// ReconcileCounter returns a counter to track attempted reconciliations.
|
||||
func (m *Metrics) ReconcileCounter() prometheus.Counter {
|
||||
return m.reconcileCounter
|
||||
}
|
||||
|
||||
// ReconcileErrorsCounter returns a counter to track reconciliation errors.
|
||||
func (m *Metrics) ReconcileErrorsCounter() prometheus.Counter {
|
||||
return m.reconcileErrorsCounter
|
||||
|
|
|
@ -1093,6 +1093,7 @@ func (c *Operator) processNextWorkItem() bool {
|
|||
}
|
||||
defer c.queue.Done(key)
|
||||
|
||||
c.metrics.ReconcileCounter().Inc()
|
||||
err := c.sync(key.(string))
|
||||
if err == nil {
|
||||
c.queue.Forget(key)
|
||||
|
|
|
@ -554,6 +554,7 @@ func (o *Operator) processNextWorkItem() bool {
|
|||
}
|
||||
defer o.queue.Done(key)
|
||||
|
||||
o.metrics.ReconcileCounter().Inc()
|
||||
err := o.sync(key.(string))
|
||||
if err == nil {
|
||||
o.queue.Forget(key)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue