diff --git a/example/mixin/alerts.yaml b/example/mixin/alerts.yaml
index 842d5955b..78373bc4b 100644
--- a/example/mixin/alerts.yaml
+++ b/example/mixin/alerts.yaml
@@ -31,12 +31,21 @@ groups:
   - alert: PrometheusOperatorReconcileErrors
     annotations:
       description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
-      summary: Errors while reconciling controller.
+      summary: Errors while reconciling objects.
     expr: |
       (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
     for: 10m
     labels:
       severity: warning
+  - alert: PrometheusOperatorStatusUpdateErrors
+    annotations:
+      description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
+      summary: Errors while updating objects status.
+    expr: |
+      (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator"}[5m]))) > 0.1
+    for: 10m
+    labels:
+      severity: warning
   - alert: PrometheusOperatorNodeLookupErrors
     annotations:
       description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
diff --git a/jsonnet/mixin/alerts/alerts.libsonnet b/jsonnet/mixin/alerts/alerts.libsonnet
index 665c76ed6..3ed1872ec 100644
--- a/jsonnet/mixin/alerts/alerts.libsonnet
+++ b/jsonnet/mixin/alerts/alerts.libsonnet
@@ -56,7 +56,21 @@
         },
         annotations: {
           description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
-          summary: 'Errors while reconciling controller.',
+          summary: 'Errors while reconciling objects.',
+        },
+        'for': '10m',
+      },
+      {
+        alert: 'PrometheusOperatorStatusUpdateErrors',
+        expr: |||
+          (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (%(groupLabels)s) (rate(prometheus_operator_status_update_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
+        ||| % $._config,
+        labels: {
+          severity: 'warning',
+        },
+        annotations: {
+          description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
+          summary: 'Errors while updating objects status.',
         },
         'for': '10m',
       },
diff --git a/pkg/operator/resource_reconciler.go b/pkg/operator/resource_reconciler.go
index 91237d53f..c5bc74ff9 100644
--- a/pkg/operator/resource_reconciler.go
+++ b/pkg/operator/resource_reconciler.go
@@ -74,6 +74,8 @@ type ResourceReconciler struct {
 	reconcileTotal    prometheus.Counter
 	reconcileErrors   prometheus.Counter
 	reconcileDuration prometheus.Histogram
+	statusTotal       prometheus.Counter
+	statusErrors      prometheus.Counter

 	metrics ReconcilerMetrics

@@ -113,7 +115,17 @@ func NewResourceReconciler(
 		Buckets: []float64{.1, .5, 1, 5, 10},
 	})

-	reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration)
+	statusTotal := prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_operator_status_update_operations_total",
+		Help: "Total number of update operations to status subresources",
+	})
+
+	statusErrors := prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_operator_status_update_errors_total",
+		Help: "Number of errors that occurred during update operations to status subresources",
+	})
+
+	reg.MustRegister(reconcileTotal, reconcileErrors, reconcileDuration, statusTotal, statusErrors)

 	qname := strings.ToLower(kind)

@@ -131,6 +143,8 @@ func NewResourceReconciler(
 		reconcileTotal:    reconcileTotal,
 		reconcileErrors:   reconcileErrors,
 		reconcileDuration: reconcileDuration,
+		statusTotal:       statusTotal,
+		statusErrors:      statusErrors,

 		metrics: metrics,

 		reconcileQ: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), qname),
@@ -417,12 +431,14 @@ func (rr *ResourceReconciler) processNextStatusItem(ctx context.Context) bool {
 	key := item.(string)
 	defer rr.statusQ.Done(key)

+	rr.statusTotal.Inc()
 	err := rr.syncer.UpdateStatus(ctx, key)
 	if err == nil {
 		rr.statusQ.Forget(key)
 		return true
 	}

+	rr.statusErrors.Inc()
 	utilruntime.HandleError(fmt.Errorf("status %q failed: %w", key, err))
 	rr.statusQ.AddRateLimited(key)