Mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-16 01:06:27 +00:00)
feat: add metrics tracking status updates
This commit introduces 2 new counters:

* `prometheus_operator_status_update_errors_total`
* `prometheus_operator_status_update_operations_total`

The metrics have a `controller` label whose value is one of `alertmanager`, `prometheus`, `prometheus-agent` or `thanos`.

It also adds a new alerting rule `PrometheusOperatorStatusUpdateErrors` to the mixin which fires when a controller fails to update the status of the objects that it manages.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
parent 520e2034c7
commit 2e5169a7b7

3 changed files with 42 additions and 3 deletions
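The diff below registers the two counters without any labels, so the per-controller `controller` label mentioned in the commit message has to be attached by the registerer that each controller hands to the reconciler. A minimal sketch of that pattern, assuming client_golang's `prometheus.WrapRegistererWith` is what injects the label (the registry, the label value and the standalone main are illustrative, not the operator's actual wiring):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	base := prometheus.NewRegistry()

	// Every metric registered through the wrapped registerer carries a
	// constant controller="prometheus" label (illustrative value).
	reg := prometheus.WrapRegistererWith(
		prometheus.Labels{"controller": "prometheus"},
		base,
	)

	statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_operator_status_update_operations_total",
		Help: "Total number of update operations to status subresources",
	})
	reg.MustRegister(statusTotal)
	statusTotal.Inc()

	// The counter itself stays label-less; the label shows up when the
	// metric is gathered from the underlying registry.
	mfs, err := base.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, lp := range mf.GetMetric()[0].GetLabel() {
			fmt.Printf("%s{%s=%q}\n", mf.GetName(), lp.GetName(), lp.GetValue())
		}
	}
}

That injected label is what lets the alert expressions below group results per controller.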
@@ -31,12 +31,21 @@ groups:
   - alert: PrometheusOperatorReconcileErrors
     annotations:
       description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
-      summary: Errors while reconciling controller.
+      summary: Errors while reconciling objects.
     expr: |
       (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
     for: 10m
     labels:
       severity: warning
+  - alert: PrometheusOperatorStatusUpdateErrors
+    annotations:
+      description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+      summary: Errors while updating objects status.
+    expr: |
+      (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator"}[5m]))) > 0.1
+    for: 10m
+    labels:
+      severity: warning
   - alert: PrometheusOperatorNodeLookupErrors
     annotations:
       description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
@@ -56,7 +56,21 @@
           },
           annotations: {
             description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
-            summary: 'Errors while reconciling controller.',
+            summary: 'Errors while reconciling objects.',
           },
           'for': '10m',
         },
+        {
+          alert: 'PrometheusOperatorStatusUpdateErrors',
+          expr: |||
+            (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
+          ||| % $._config,
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
+            summary: 'Errors while updating objects status.',
+          },
+          'for': '10m',
+        },
@@ -74,6 +74,8 @@ type ResourceReconciler struct {
 	reconcileTotal    prometheus.Counter
 	reconcileErrors   prometheus.Counter
 	reconcileDuration prometheus.Histogram
+	statusTotal       prometheus.Counter
+	statusErrors      prometheus.Counter

 	metrics ReconcilerMetrics

@@ -113,7 +115,17 @@ func NewResourceReconciler(
 		Buckets: []float64{.1, .5, 1, 5, 10},
 	})

-	reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration)
+	statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_operator_status_update_operations_total",
+		Help: "Total number of update operations to status subresources",
+	})
+
+	statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_operator_status_update_errors_total",
+		Help: "Number of errors that occurred during update operations to status subresources",
+	})
+
+	reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration, statusTotal, statusErrors)

 	qname := strings.ToLower(kind)

@@ -131,6 +143,8 @@ func NewResourceReconciler(
 		reconcileTotal:    reconcileTotal,
 		reconcileErrors:   reconcileErrors,
 		reconcileDuration: reconcileDuration,
+		statusTotal:       statusTotal,
+		statusErrors:      statusErrors,
 		metrics:           metrics,

 		reconcileQ: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), qname),
@@ -417,12 +431,14 @@ func (rr *ResourceReconciler) processNextStatusItem(ctx context.Context) bool {
 	key := item.(string)
 	defer rr.statusQ.Done(key)

+	rr.statusTotal.Inc()
 	err := rr.syncer.UpdateStatus(ctx, key)
 	if err == nil {
 		rr.statusQ.Forget(key)
 		return true
 	}

+	rr.statusErrors.Inc()
 	utilruntime.HandleError(fmt.Errorf("status %q failed: %w", key, err))
 	rr.statusQ.AddRateLimited(key)

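To make the new accounting concrete, here is a minimal, self-contained sketch (the `updateStatus` stub is a hypothetical stand-in for `syncer.UpdateStatus`; the work queue, context and rate limiting are omitted) that mirrors the count-every-attempt, count-failures-separately pattern added above and reads the counters back with client_golang's testutil helper:

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// updateStatus is a hypothetical stand-in for syncer.UpdateStatus.
func updateStatus(key string) error {
	if key == "ns/broken" {
		return errors.New("status update conflict")
	}
	return nil
}

func main() {
	statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_operator_status_update_operations_total",
		Help: "Total number of update operations to status subresources",
	})
	statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_operator_status_update_errors_total",
		Help: "Number of errors that occurred during update operations to status subresources",
	})

	for _, key := range []string{"ns/ok", "ns/broken", "ns/ok"} {
		statusTotal.Inc() // count every attempt, as processNextStatusItem does
		if err := updateStatus(key); err != nil {
			statusErrors.Inc() // count only the failed attempts
		}
	}

	// errors / operations is the ratio the PrometheusOperatorStatusUpdateErrors
	// alert computes, except the alert uses rate(...[5m]) of each counter.
	ratio := testutil.ToFloat64(statusErrors) / testutil.ToFloat64(statusTotal)
	fmt.Printf("status update failure ratio: %.2f\n", ratio) // 0.33
}

The alert only fires once that ratio stays above 0.1 for 10 minutes, so a brief burst of status update conflicts does not immediately trigger it.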