mirror of
https://github.com/prometheus-operator/prometheus-operator.git
synced 2025-04-16 09:16:38 +00:00
feat: add metrics tracking status updates
This commit introduces 2 new counters:

* `prometheus_operator_status_update_errors_total`
* `prometheus_operator_status_update_operations_total`

The metrics have a `controller` label whose value is one of `alertmanager`, `prometheus`, `prometheus-agent` or `thanos`.

It also adds a new alerting rule `PrometheusOperatorStatusUpdateErrors` to the mixin which fires when a controller fails to update the status of the objects that it manages.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
520e2034c7
commit
2e5169a7b7
3 changed files with 42 additions and 3 deletions
|
@ -31,12 +31,21 @@ groups:
|
||||||
- alert: PrometheusOperatorReconcileErrors
|
- alert: PrometheusOperatorReconcileErrors
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
|
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
|
||||||
summary: Errors while reconciling controller.
|
summary: Errors while reconciling objects.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
|
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
- alert: PrometheusOperatorStatusUpdateErrors
|
||||||
|
annotations:
|
||||||
|
description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
|
||||||
|
summary: Errors while updating objects status.
|
||||||
|
expr: |
|
||||||
|
(sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator"}[5m]))) > 0.1
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- alert: PrometheusOperatorNodeLookupErrors
|
- alert: PrometheusOperatorNodeLookupErrors
|
||||||
annotations:
|
annotations:
|
||||||
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
|
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
|
||||||
|
|
|
@ -56,7 +56,21 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
|
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
|
||||||
summary: 'Errors while reconciling controller.',
|
summary: 'Errors while reconciling objects.',
|
||||||
|
},
|
||||||
|
'for': '10m',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusOperatorStatusUpdateErrors',
|
||||||
|
expr: |||
|
||||||
|
(sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
|
||||||
|
||| % $._config,
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
|
||||||
|
summary: 'Errors while updating objects status.',
|
||||||
},
|
},
|
||||||
'for': '10m',
|
'for': '10m',
|
||||||
},
|
},
|
||||||
|
|
|
@ -74,6 +74,8 @@ type ResourceReconciler struct {
|
||||||
reconcileTotal prometheus.Counter
|
reconcileTotal prometheus.Counter
|
||||||
reconcileErrors prometheus.Counter
|
reconcileErrors prometheus.Counter
|
||||||
reconcileDuration prometheus.Histogram
|
reconcileDuration prometheus.Histogram
|
||||||
|
statusTotal prometheus.Counter
|
||||||
|
statusErrors prometheus.Counter
|
||||||
|
|
||||||
metrics ReconcilerMetrics
|
metrics ReconcilerMetrics
|
||||||
|
|
||||||
|
@ -113,7 +115,17 @@ func NewResourceReconciler(
|
||||||
Buckets: []float64{.1, .5, 1, 5, 10},
|
Buckets: []float64{.1, .5, 1, 5, 10},
|
||||||
})
|
})
|
||||||
|
|
||||||
reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration)
|
statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_operator_status_update_operations_total",
|
||||||
|
Help: "Total number of update operations to status subresources",
|
||||||
|
})
|
||||||
|
|
||||||
|
statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_operator_status_update_errors_total",
|
||||||
|
Help: "Number of errors that occurred during update operations to status subresources",
|
||||||
|
})
|
||||||
|
|
||||||
|
reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration, statusTotal, statusErrors)
|
||||||
|
|
||||||
qname := strings.ToLower(kind)
|
qname := strings.ToLower(kind)
|
||||||
|
|
||||||
|
@ -131,6 +143,8 @@ func NewResourceReconciler(
|
||||||
reconcileTotal: reconcileTotal,
|
reconcileTotal: reconcileTotal,
|
||||||
reconcileErrors: reconcileErrors,
|
reconcileErrors: reconcileErrors,
|
||||||
reconcileDuration: reconcileDuration,
|
reconcileDuration: reconcileDuration,
|
||||||
|
statusTotal: statusTotal,
|
||||||
|
statusErrors: statusErrors,
|
||||||
metrics: metrics,
|
metrics: metrics,
|
||||||
|
|
||||||
reconcileQ: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), qname),
|
reconcileQ: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), qname),
|
||||||
|
@ -417,12 +431,14 @@ func (rr *ResourceReconciler) processNextStatusItem(ctx context.Context) bool {
|
||||||
key := item.(string)
|
key := item.(string)
|
||||||
defer rr.statusQ.Done(key)
|
defer rr.statusQ.Done(key)
|
||||||
|
|
||||||
|
rr.statusTotal.Inc()
|
||||||
err := rr.syncer.UpdateStatus(ctx, key)
|
err := rr.syncer.UpdateStatus(ctx, key)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
rr.statusQ.Forget(key)
|
rr.statusQ.Forget(key)
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rr.statusErrors.Inc()
|
||||||
utilruntime.HandleError(fmt.Errorf("status %q failed: %w", key, err))
|
utilruntime.HandleError(fmt.Errorf("status %q failed: %w", key, err))
|
||||||
rr.statusQ.AddRateLimited(key)
|
rr.statusQ.AddRateLimited(key)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue