
feat: add metrics tracking status updates

This commit introduces 2 new counters:
* `prometheus_operator_status_update_errors_total`
* `prometheus_operator_status_update_operations_total`

The metrics have a `controller` label whose value is one of
`alertmanager`, `prometheus`, `prometheus-agent` or `thanos`.
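
As a side note, the counters themselves are created without labels (see the Go
changes below); the per-controller value most likely comes from the Registerer
that each controller hands to the reconciler. The stand-alone sketch below only
illustrates that pattern with client_golang's WrapRegistererWith; the registry
setup, the main function and the chosen controller value are assumptions, and
only the metric name and help text come from this commit.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    base := prometheus.NewRegistry()

    // Assumed setup: wrap the registerer so that every metric registered
    // through it carries controller="prometheus" as a constant label.
    reg := prometheus.WrapRegistererWith(
        prometheus.Labels{"controller": "prometheus"},
        base,
    )

    // Name and help text as introduced by this commit.
    statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
        Name: "prometheus_operator_status_update_operations_total",
        Help: "Total number of update operations to status subresources",
    })
    reg.MustRegister(statusTotal)
    statusTotal.Inc()

    // The gathered series now carries the controller label.
    mfs, err := base.Gather()
    if err != nil {
        panic(err)
    }
    fmt.Println(mfs[0].String())
}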

It also adds a new alerting rule `PrometheusOperatorStatusUpdateErrors`
to the mixin, which fires when more than 10% of a controller's status
update operations fail for at least 10 minutes.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
Simon Pasquier 2023-12-22 11:57:25 +01:00
parent 520e2034c7
commit 2e5169a7b7
3 changed files with 42 additions and 3 deletions


@@ -31,12 +31,21 @@ groups:
   - alert: PrometheusOperatorReconcileErrors
     annotations:
       description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
-      summary: Errors while reconciling controller.
+      summary: Errors while reconciling objects.
     expr: |
       (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
     for: 10m
     labels:
       severity: warning
+  - alert: PrometheusOperatorStatusUpdateErrors
+    annotations:
+      description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+      summary: Errors while updating objects status.
+    expr: |
+      (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator"}[5m]))) > 0.1
+    for: 10m
+    labels:
+      severity: warning
   - alert: PrometheusOperatorNodeLookupErrors
     annotations:
       description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.


@@ -56,7 +56,21 @@
           },
           annotations: {
             description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
-            summary: 'Errors while reconciling controller.',
+            summary: 'Errors while reconciling objects.',
           },
           'for': '10m',
         },
+        {
+          alert: 'PrometheusOperatorStatusUpdateErrors',
+          expr: |||
+            (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
+          ||| % $._config,
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
+            summary: 'Errors while updating objects status.',
+          },
+          'for': '10m',
+        },


@@ -74,6 +74,8 @@ type ResourceReconciler struct {
     reconcileTotal    prometheus.Counter
     reconcileErrors   prometheus.Counter
     reconcileDuration prometheus.Histogram
+    statusTotal       prometheus.Counter
+    statusErrors      prometheus.Counter

     metrics ReconcilerMetrics
@@ -113,7 +115,17 @@ func NewResourceReconciler(
         Buckets: []float64{.1, .5, 1, 5, 10},
     })
-    reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration)
+    statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
+        Name: "prometheus_operator_status_update_operations_total",
+        Help: "Total number of update operations to status subresources",
+    })
+
+    statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
+        Name: "prometheus_operator_status_update_errors_total",
+        Help: "Number of errors that occurred during update operations to status subresources",
+    })
+
+    reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration, statusTotal, statusErrors)

     qname := strings.ToLower(kind)
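
For orientation, counters registered against the operator's registry like the
two above end up on the metrics endpoint in the standard text exposition
format. The snippet below is an assumed, stand-alone illustration (registry,
port and handler wiring are not part of this commit) of what that looks like.

package main

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    reg := prometheus.NewRegistry()

    statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
        Name: "prometheus_operator_status_update_operations_total",
        Help: "Total number of update operations to status subresources",
    })
    statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
        Name: "prometheus_operator_status_update_errors_total",
        Help: "Number of errors that occurred during update operations to status subresources",
    })
    reg.MustRegister(statusTotal, statusErrors)

    // GET /metrics then returns lines such as:
    //   # HELP prometheus_operator_status_update_operations_total Total number of update operations to status subresources
    //   # TYPE prometheus_operator_status_update_operations_total counter
    //   prometheus_operator_status_update_operations_total 0
    http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
    _ = http.ListenAndServe(":8080", nil)
}
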
@@ -131,6 +143,8 @@ func NewResourceReconciler(
         reconcileTotal:    reconcileTotal,
         reconcileErrors:   reconcileErrors,
         reconcileDuration: reconcileDuration,
+        statusTotal:       statusTotal,
+        statusErrors:      statusErrors,

         metrics: metrics,
         reconcileQ: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), qname),
@@ -417,12 +431,14 @@ func (rr *ResourceReconciler) processNextStatusItem(ctx context.Context) bool {
     key := item.(string)
     defer rr.statusQ.Done(key)

+    rr.statusTotal.Inc()
     err := rr.syncer.UpdateStatus(ctx, key)
     if err == nil {
         rr.statusQ.Forget(key)
         return true
     }

+    rr.statusErrors.Inc()
     utilruntime.HandleError(fmt.Errorf("status %q failed: %w", key, err))
     rr.statusQ.AddRateLimited(key)
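
To make the increment pattern above concrete, here is a minimal stand-alone
sketch (the fake updateStatus function, the object keys and the shortened
metric names are hypothetical, not the operator's code): the operations
counter is bumped on every attempt and the errors counter only on failure, so
the ratio of errors to operations, which the new
PrometheusOperatorStatusUpdateErrors alert evaluates over 5-minute rates,
stays between 0 and 1.

package main

import (
    "context"
    "errors"
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    statusTotal := prometheus.NewCounter(prometheus.CounterOpts{Name: "status_update_operations_total"})
    statusErrors := prometheus.NewCounter(prometheus.CounterOpts{Name: "status_update_errors_total"})

    // Hypothetical stand-in for syncer.UpdateStatus: fails for one key only.
    updateStatus := func(ctx context.Context, key string) error {
        if key == "monitoring/broken" {
            return errors.New("conflict updating status subresource")
        }
        return nil
    }

    for _, key := range []string{"monitoring/main", "monitoring/broken", "monitoring/main"} {
        statusTotal.Inc() // counted on every attempt, as in processNextStatusItem
        if err := updateStatus(context.Background(), key); err != nil {
            statusErrors.Inc() // counted only when the update fails
        }
    }

    // 1 failure out of 3 attempts -> ~0.33; the alert fires when this ratio,
    // computed from rates, stays above 0.1 for 10 minutes.
    fmt.Println(testutil.ToFloat64(statusErrors) / testutil.ToFloat64(statusTotal))
}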