Mirror of https://github.com/monitoring-mixins/website.git, synced 2024-12-15 17:50:48 +00:00
assets,site/content: regenerate
commit 7fd2bee5a7, parent df43594957
25 changed files with 1134 additions and 535 deletions
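
Every hunk below makes the same mechanical change: long single-line YAML scalars in the generated alert and recording rules are re-folded at roughly 80 columns. As a rough illustration of where that folding comes from (the site's actual build tooling is not shown in this excerpt, so PyYAML is only a stand-in, and the rule literal is reconstructed from the first hunk):

import yaml

# Reconstructed from the CephMdsMissingReplicas hunk below; illustrative only.
rule = {
    "alert": "CephMdsMissingReplicas",
    "annotations": {
        "description": (
            "Minimum required replicas for storage metadata service not"
            " available. Might affect the working of storage cluster."
        ),
    },
}

# PyYAML wraps plain scalars at its default width (about 80 columns), which
# produces the same kind of folded continuation lines the regenerated assets show.
print(yaml.dump(rule, default_flow_style=False))
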
@@ -27,7 +27,8 @@ groups:
  rules:
  - alert: CephMdsMissingReplicas
    annotations:
-      description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
+      description: Minimum required replicas for storage metadata service not available.
+        Might affect the working of storage cluster.
      message: Insufficient replicas for storage metadata service.
      severity_level: warning
      storage_type: ceph
@@ -51,7 +52,8 @@ groups:
      severity: critical
  - alert: CephMonHighNumberOfLeaderChanges
    annotations:
-      description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
+      description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
+        }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
      message: Storage Cluster has seen many leader changes recently.
      severity_level: warning
      storage_type: ceph
@@ -64,7 +66,8 @@ groups:
  rules:
  - alert: CephNodeDown
    annotations:
-      description: Storage node {{ $labels.node }} went down. Please check the node immediately.
+      description: Storage node {{ $labels.node }} went down. Please check the node
+        immediately.
      message: Storage node {{ $labels.node }} went down
      severity_level: error
      storage_type: ceph
@@ -77,7 +80,9 @@ groups:
  rules:
  - alert: CephOSDCriticallyFull
    annotations:
-      description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
+      description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
+        has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space
+        or expand the storage cluster or contact support.
      message: Back-end storage device is critically full.
      severity_level: error
      storage_type: ceph
@@ -88,7 +93,9 @@ groups:
      severity: critical
  - alert: CephOSDNearFull
    annotations:
-      description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
+      description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
+        has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand
+        the storage cluster or contact support.
      message: Back-end storage device is nearing full.
      severity_level: warning
      storage_type: ceph
@@ -99,7 +106,8 @@ groups:
      severity: warning
  - alert: CephOSDDiskNotResponding
    annotations:
-      description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
+      description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
+        }}.
      message: Disk not responding
      severity_level: error
      storage_type: ceph
@@ -110,7 +118,8 @@ groups:
      severity: critical
  - alert: CephOSDDiskUnavailable
    annotations:
-      description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
+      description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
+        }}.
      message: Disk not accessible
      severity_level: error
      storage_type: ceph
@@ -145,8 +154,10 @@ groups:
  rules:
  - alert: PersistentVolumeUsageNearFull
    annotations:
-      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
-      message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
+      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+        75%. Free up some space or expand the PVC.
+      message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
+        or PVC expansion is required.
      severity_level: warning
      storage_type: ceph
    expr: |
@@ -156,8 +167,10 @@ groups:
      severity: warning
  - alert: PersistentVolumeUsageCritical
    annotations:
-      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
-      message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
+      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+        85%. Free up some space or expand the PVC immediately.
+      message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
+        or PVC expansion is required.
      severity_level: error
      storage_type: ceph
    expr: |
@@ -191,7 +204,8 @@ groups:
      severity: warning
  - alert: CephOSDVersionMismatch
    annotations:
-      description: There are {{ $value }} different versions of Ceph OSD components running.
+      description: There are {{ $value }} different versions of Ceph OSD components
+        running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
@@ -202,7 +216,8 @@ groups:
      severity: warning
  - alert: CephMonVersionMismatch
    annotations:
-      description: There are {{ $value }} different versions of Ceph Mon components running.
+      description: There are {{ $value }} different versions of Ceph Mon components
+        running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
@@ -215,8 +230,10 @@ groups:
  rules:
  - alert: CephClusterNearFull
    annotations:
-      description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
-      message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
+      description: Storage cluster utilization has crossed 75% and will become read-only
+        at 85%. Free up some space or expand the storage cluster.
+      message: Storage cluster is nearing full. Data deletion or cluster expansion
+        is required.
      severity_level: warning
      storage_type: ceph
    expr: |
@@ -226,8 +243,10 @@ groups:
      severity: warning
  - alert: CephClusterCriticallyFull
    annotations:
-      description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
-      message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
+      description: Storage cluster utilization has crossed 80% and will become read-only
+        at 85%. Free up some space or expand the storage cluster immediately.
+      message: Storage cluster is critically full and needs immediate data deletion
+        or cluster expansion.
      severity_level: error
      storage_type: ceph
    expr: |
@@ -237,8 +256,10 @@ groups:
      severity: critical
  - alert: CephClusterReadOnly
    annotations:
-      description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
-      message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
+      description: Storage cluster utilization has crossed 85% and will become read-only
+        now. Free up some space or expand the storage cluster immediately.
+      message: Storage cluster is read-only now and needs immediate data deletion
+        or cluster expansion.
      severity_level: error
      storage_type: ceph
    expr: |
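
Note that the re-wrapping above is purely cosmetic: YAML folds the line break of a plain multi-line scalar into a single space, so the wrapped description parses to exactly the same string as the old single-line form and the rendered alert annotations do not change. A minimal check (PyYAML assumed here purely for illustration):

import yaml

old = 'description: Storage node {{ $labels.node }} went down. Please check the node immediately.\n'
new = (
    'description: Storage node {{ $labels.node }} went down. Please check the node\n'
    '  immediately.\n'
)

# A plain scalar continued on an indented line folds back into one string,
# so both snippets load to the identical annotation text.
assert yaml.safe_load(old) == yaml.safe_load(new)
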
@@ -12,7 +12,8 @@ groups:
      severity: critical
  - alert: CoreDNSLatencyHigh
    annotations:
-      message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
+      message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
+        {{ $labels.server }} zone {{ $labels.zone }} .
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
    expr: |
      histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
@@ -21,7 +22,8 @@ groups:
      severity: critical
  - alert: CoreDNSErrorsHigh
    annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of requests.
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
    expr: |
      sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -32,7 +34,8 @@ groups:
      severity: critical
  - alert: CoreDNSErrorsHigh
    annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of requests.
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
    expr: |
      sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -45,7 +48,8 @@ groups:
  rules:
  - alert: CoreDNSForwardLatencyHigh
    annotations:
-      message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
+      message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
+        requests to {{ $labels.to }}.
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
    expr: |
      histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
@@ -54,7 +58,8 @@ groups:
      severity: critical
  - alert: CoreDNSForwardErrorsHigh
    annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of forward requests to {{ $labels.to }}.
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
    expr: |
      sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -65,7 +70,8 @@ groups:
      severity: critical
  - alert: CoreDNSForwardErrorsHigh
    annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of forward requests to {{ $labels.to }}.
      runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
    expr: |
      sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

@@ -107,7 +107,8 @@ groups:
      severity: warning
  - alert: CortexIngesterRestarts
    annotations:
-      message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.'
+      message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
+        $value }} times in the last 30 mins.'
    expr: |
      changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1
    labels:
@@ -278,7 +279,8 @@ groups:
  rules:
  - alert: CortexGossipMembersMismatch
    annotations:
-      message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.'
+      message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of
+        gossip members.'
    expr: |
      memberlist_client_cluster_members_count
      != on (cluster, namespace) group_left
@@ -290,7 +292,8 @@ groups:
  rules:
  - alert: CortexIngesterHasNotShippedBlocks
    annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
+        not shipped any block in the last 4 hours.
    expr: |
      (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4)
      and
@@ -302,7 +305,8 @@ groups:
      severity: critical
  - alert: CortexIngesterHasNotShippedBlocksSinceStart
    annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
+        not shipped any block in the last 4 hours.
    expr: |
      (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0)
      and
@@ -312,7 +316,8 @@ groups:
      severity: critical
  - alert: CortexIngesterTSDBHeadCompactionFailed
    annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing
+        to compact TSDB head.
    expr: |
      rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
    for: 15m
@@ -320,7 +325,8 @@ groups:
      severity: critical
  - alert: CortexQuerierHasNotScanTheBucket
    annotations:
-      message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.
+      message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not
+        successfully scanned the bucket since {{ $value | humanizeDuration }}.
    expr: |
      (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
      and
@@ -330,7 +336,9 @@ groups:
      severity: critical
  - alert: CortexQuerierHighRefetchRate
    annotations:
-      message: Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.
+      message: Cortex Queries in {{ $labels.namespace }} are refetching series from
+        different store-gateways (because of missing blocks) for the {{ printf "%.0f"
+        $value }}% of queries.
    expr: |
      100 * (
        (
@@ -347,7 +355,9 @@ groups:
      severity: warning
  - alert: CortexStoreGatewayHasNotSyncTheBucket
    annotations:
-      message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.
+      message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }}
+        has not successfully synched the bucket since {{ $value | humanizeDuration
+        }}.
    expr: |
      (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
      and
@@ -359,7 +369,8 @@ groups:
  rules:
  - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
    annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not successfully cleaned up blocks in the last 24 hours.
    expr: |
      (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24)
      and
@@ -369,7 +380,8 @@ groups:
      severity: critical
  - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart
    annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not successfully cleaned up blocks in the last 24 hours.
    expr: |
      cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0
    for: 24h
@@ -377,7 +389,8 @@ groups:
      severity: critical
  - alert: CortexCompactorHasNotUploadedBlocks
    annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not uploaded any block in the last 24 hours.
    expr: |
      (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
      and
@@ -387,7 +400,8 @@ groups:
      severity: critical
  - alert: CortexCompactorHasNotUploadedBlocksSinceStart
    annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not uploaded any block in the last 24 hours.
    expr: |
      thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
    for: 24h

@@ -1,11 +1,14 @@
groups:
- name: cortex_api
  rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_request_duration_seconds:avg
  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
    record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
@@ -13,185 +16,279 @@ groups:
    record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, route))
    record: cluster_job_route:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, route))
    record: cluster_job_route:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
+      / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds:avg
-  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
+      route)
    record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
    record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, namespace, job, route))
    record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
+      job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
+      namespace, job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
-  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
+      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
+      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
+      job, route)
    record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
- name: cortex_cache
  rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
+      job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
+      by (cluster, job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_cache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_cache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
    record: cluster_job:cortex_cache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job)
    record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job)
    record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
+      method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
+      method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
- name: cortex_storage
  rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
+      by (cluster, job, operation)
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, operation)
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
+      by (cluster, job, operation)
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le,
+      cluster, job, operation)
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by
+      (cluster, job, operation)
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, operation)
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation)
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
+      job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
+      job)
    record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le,
+      cluster, job)
    record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
+      job)
    record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
+      job)
    record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
+      by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job)
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
+      by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job)
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
+      by (cluster, job)
    record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
+      by (le, cluster, job))
    record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
    record: cluster_job:cortex_chunk_store_chunks_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster,
+      job)
    record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
  - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
    record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
  - expr: sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
    record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
    record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
+      job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m]))
+      by (cluster, job, method)
    record: cluster_job_method:cortex_database_request_duration_seconds:avg
-  - expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
    record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
    record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, operation))
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, operation))
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
|
||||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||||
|
operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job, operation)
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
|
||||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
|
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job, operation)
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
|
||||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation)
|
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||||
|
operation)
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
|
||||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job,
|
||||||
|
operation)
|
||||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
|
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||||
|
job)
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
||||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
||||||
|
@ -199,11 +296,14 @@ groups:
|
||||||
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
||||||
- name: cortex_queries
|
- name: cortex_queries
|
||||||
rules:
|
rules:
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_query_frontend_retries:99quantile
|
record: cluster_job:cortex_query_frontend_retries:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_query_frontend_retries:50quantile
|
record: cluster_job:cortex_query_frontend_retries:50quantile
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
record: cluster_job:cortex_query_frontend_retries:avg
|
record: cluster_job:cortex_query_frontend_retries:avg
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
||||||
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
||||||
|
@ -211,23 +311,33 @@ groups:
|
||||||
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
||||||
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
||||||
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
||||||
|
(cluster, job)
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
||||||
|
cluster, job)
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||||
|
job)
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
||||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
||||||
|
job)
|
||||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_series:99quantile
|
record: cluster_job:cortex_ingester_queried_series:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_series:50quantile
|
record: cluster_job:cortex_ingester_queried_series:50quantile
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_series:avg
|
record: cluster_job:cortex_ingester_queried_series:avg
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
||||||
|
@ -235,11 +345,14 @@ groups:
|
||||||
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
||||||
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_chunks:99quantile
|
record: cluster_job:cortex_ingester_queried_chunks:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_chunks:50quantile
|
record: cluster_job:cortex_ingester_queried_chunks:50quantile
|
||||||
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_chunks:avg
|
record: cluster_job:cortex_ingester_queried_chunks:avg
|
||||||
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
|
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
|
||||||
|
@ -247,11 +360,14 @@ groups:
|
||||||
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
|
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
|
||||||
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
|
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
|
||||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
||||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
|
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_samples:avg
|
record: cluster_job:cortex_ingester_queried_samples:avg
|
||||||
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
||||||
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
||||||
|
|
|
@@ -18,7 +18,8 @@ groups:
      severity: critical
  - alert: etcdInsufficientMembers
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
    expr: |
      sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
    for: 3m
@@ -26,7 +27,8 @@ groups:
      severity: critical
  - alert: etcdNoLeader
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
    expr: |
      etcd_server_has_leader{job=~".*etcd.*"} == 0
    for: 1m
@@ -34,7 +36,9 @@ groups:
      severity: critical
  - alert: etcdHighNumberOfLeaderChanges
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
    expr: |
      increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
    for: 5m
@@ -42,7 +46,8 @@ groups:
      severity: warning
  - alert: etcdHighNumberOfFailedGRPCRequests
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
    expr: |
      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
        /
@@ -53,7 +58,8 @@ groups:
      severity: warning
  - alert: etcdHighNumberOfFailedGRPCRequests
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
    expr: |
      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
        /
@@ -64,7 +70,8 @@ groups:
      severity: critical
  - alert: etcdGRPCRequestsSlow
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
    expr: |
      histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
      > 0.15
@@ -73,7 +80,8 @@ groups:
      severity: critical
  - alert: etcdMemberCommunicationSlow
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
    expr: |
      histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.15
@@ -82,7 +90,8 @@ groups:
      severity: warning
  - alert: etcdHighNumberOfFailedProposals
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
    expr: |
      rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
    for: 15m
@@ -90,7 +99,8 @@ groups:
      severity: warning
  - alert: etcdHighFsyncDurations
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
    expr: |
      histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.5
@@ -99,7 +109,8 @@ groups:
      severity: warning
  - alert: etcdHighCommitDurations
    annotations:
      message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
    expr: |
      histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.25
@@ -108,7 +119,8 @@ groups:
      severity: warning
  - alert: etcdHighNumberOfFailedHTTPRequests
    annotations:
      message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
    expr: |
      sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
      without (code) > 0.01
@@ -117,7 +129,8 @@ groups:
      severity: warning
  - alert: etcdHighNumberOfFailedHTTPRequests
    annotations:
      message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
    expr: |
      sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
      without (code) > 0.05
@@ -126,7 +139,8 @@ groups:
      severity: critical
  - alert: etcdHTTPRequestsSlow
    annotations:
      message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
    expr: |
      histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15

@@ -49,7 +49,8 @@ groups:
      severity: critical
  - alert: GlusterBrickUtilization
    annotations:
      message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
    expr: |
      100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
        / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
@@ -58,7 +59,8 @@ groups:
      severity: warning
  - alert: GlusterBrickUtilization
    annotations:
      message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
    expr: |
      100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
        / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
@@ -69,7 +71,8 @@ groups:
  rules:
  - alert: GlusterThinpoolDataUtilization
    annotations:
      message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
    expr: |
      gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
    for: 5m
@@ -77,7 +80,8 @@ groups:
      severity: warning
  - alert: GlusterThinpoolDataUtilization
    annotations:
      message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
    expr: |
      gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
    for: 5m
@@ -85,7+89,8 @@ groups:
      severity: critical
  - alert: GlusterThinpoolMetadataUtilization
    annotations:
      message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
    expr: |
      gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
    for: 5m
@@ -93,7 +98,8 @@ groups:
      severity: warning
  - alert: GlusterThinpoolMetadataUtilization
    annotations:
      message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
    expr: |
      gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
    for: 5m

@@ -13,7 +13,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
    expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -21,7 +23,9 @@ groups:
    annotations:
      message: |
        service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
    expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -29,7 +33,9 @@ groups:
    annotations:
      message: |
        agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
    expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -45,7 +51,9 @@ groups:
    annotations:
      message: |
        collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
    expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -53,7 +61,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
    expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -61,7 +71,8 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
    expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
    for: 15m
    labels:
      severity: warning
@@ -69,7 +80,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
    expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -77,7 +90,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
    expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -85,7 +100,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
    expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning
@@ -93,7 +110,9 @@ groups:
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
    expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
    for: 15m
    labels:
      severity: warning

@@ -3,7 +3,8 @@ groups:
  rules:
  - alert: CockroachInstanceFlapping
    annotations:
      message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
    expr: |
      resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
    for: 1m
@@ -29,7 +30,8 @@ groups:
      severity: warning
  - alert: CockroachStoreDiskLow
    annotations:
      message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
    expr: |
      :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
    for: 30m
@@ -61,7 +63,8 @@ groups:
      severity: warning
  - alert: CockroachHighOpenFDCount
    annotations:
      message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
    expr: |
      cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
    for: 10m

@@ -3,7 +3,10 @@ groups:
  rules:
  - alert: KubeStateMetricsListErrors
    annotations:
      description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
      summary: kube-state-metrics is experiencing errors in list operations.
    expr: |
      (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
        /
@@ -14,7 +17,10 @@ groups:
      severity: critical
  - alert: KubeStateMetricsWatchErrors
    annotations:
      description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
      summary: kube-state-metrics is experiencing errors in watch operations.
    expr: |
      (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
        /

@@ -3,7 +3,8 @@ groups:
  rules:
  - alert: KubePodCrashLooping
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
    expr: |
      rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@@ -12,7 +13,8 @@ groups:
      severity: warning
  - alert: KubePodNotReady
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
    expr: |
      sum by (namespace, pod) (
@@ -27,7 +29,9 @@ groups:
      severity: warning
  - alert: KubeDeploymentGenerationMismatch
    annotations:
      message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
    expr: |
      kube_deployment_status_observed_generation{job="kube-state-metrics"}
@@ -38,7 +42,8 @@ groups:
      severity: warning
  - alert: KubeDeploymentReplicasMismatch
    annotations:
      message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
    expr: |
      (
@@ -55,7 +60,8 @@ groups:
      severity: warning
  - alert: KubeStatefulSetReplicasMismatch
    annotations:
      message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
    expr: |
      (
@@ -72,7 +78,9 @@ groups:
      severity: warning
  - alert: KubeStatefulSetGenerationMismatch
    annotations:
      message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
    expr: |
      kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@@ -83,7 +91,8 @@ groups:
      severity: warning
  - alert: KubeStatefulSetUpdateNotRolledOut
    annotations:
      message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
    expr: |
      (
@@ -108,7 +117,8 @@ groups:
      severity: warning
  - alert: KubeDaemonSetRolloutStuck
    annotations:
      message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
    expr: |
      (
@@ -139,7 +149,8 @@ groups:
      severity: warning
  - alert: KubeContainerWaiting
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
    expr: |
      sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@@ -148,7 +159,8 @@ groups:
      severity: warning
  - alert: KubeDaemonSetNotScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
    expr: |
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -159,7 +171,8 @@ groups:
      severity: warning
  - alert: KubeDaemonSetMisScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
    expr: |
      kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -168,7 +181,8 @@ groups:
      severity: warning
  - alert: KubeJobCompletion
    annotations:
      message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
    expr: |
      kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -186,7 +200,8 @@ groups:
      severity: warning
  - alert: KubeHpaReplicasMismatch
    annotations:
      message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
    expr: |
      (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@@ -199,7 +214,8 @@ groups:
      severity: warning
  - alert: KubeHpaMaxedOut
    annotations:
      message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
    expr: |
      kube_hpa_status_current_replicas{job="kube-state-metrics"}
@@ -212,7 +228,8 @@ groups:
  rules:
  - alert: KubeCPUOvercommit
    annotations:
      message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
    expr: |
      sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@@ -225,7 +242,8 @@ groups:
      severity: warning
  - alert: KubeMemoryOvercommit
    annotations:
      message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
    expr: |
      sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@@ -264,7 +282,8 @@ groups:
      severity: warning
  - alert: KubeQuotaFullyUsed
    annotations:
      message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
    expr: |
      kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -276,7 +295,9 @@ groups:
      severity: info
  - alert: CPUThrottlingHigh
    annotations:
      message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||||
|
@ -290,7 +311,9 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
|
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
|
||||||
|
in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
||||||
|
}} free.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
expr: |
|
expr: |
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||||
|
@ -302,7 +325,9 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
|
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
|
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
||||||
|
days. Currently {{ $value | humanizePercentage }} is available.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
|
@ -317,7 +342,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePersistentVolumeErrors
|
- alert: KubePersistentVolumeErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
|
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
||||||
|
$labels.phase }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
||||||
expr: |
|
expr: |
|
||||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||||
|
@ -328,7 +354,8 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeVersionMismatch
|
- alert: KubeVersionMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: There are {{ $value }} different semantic versions of Kubernetes components running.
|
message: There are {{ $value }} different semantic versions of Kubernetes components
|
||||||
|
running.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||||
expr: |
|
expr: |
|
||||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
||||||
|
@ -337,7 +364,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientErrors
|
- alert: KubeClientErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
|
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
||||||
|
}}' is experiencing {{ $value | humanizePercentage }} errors.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
||||||
|
@ -405,7 +433,8 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
|
message: A client certificate used to authenticate to the apiserver is expiring
|
||||||
|
in less than 7.0 days.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
expr: |
|
expr: |
|
||||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
|
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
|
||||||
|
@ -413,7 +442,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
|
message: A client certificate used to authenticate to the apiserver is expiring
|
||||||
|
in less than 24.0 hours.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
expr: |
|
expr: |
|
||||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
|
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
|
||||||
|
@ -421,7 +451,9 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: AggregatedAPIErrors
|
- alert: AggregatedAPIErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
|
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
|
||||||
|
errors. The number of errors have increased for it in the past five minutes.
|
||||||
|
High values indicate that the availability of the service changes too often.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||||
expr: |
|
expr: |
|
||||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||||
|
@ -429,7 +461,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: AggregatedAPIDown
|
- alert: AggregatedAPIDown
|
||||||
annotations:
|
annotations:
|
||||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
|
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
|
||||||
|
only {{ $value | humanize }}% available over the last 5m.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||||
expr: |
|
expr: |
|
||||||
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
||||||
|
@ -466,7 +499,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletTooManyPods
|
- alert: KubeletTooManyPods
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
|
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
|
||||||
|
}} of its Pod capacity.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||||
expr: |
|
expr: |
|
||||||
count by(node) (
|
count by(node) (
|
||||||
|
@ -481,7 +515,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeNodeReadinessFlapping
|
- alert: KubeNodeReadinessFlapping
|
||||||
annotations:
|
annotations:
|
||||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
|
message: The readiness status of node {{ $labels.node }} has changed {{ $value
|
||||||
|
}} times in the last 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||||
expr: |
|
expr: |
|
||||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||||
|
@ -490,7 +525,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletPlegDurationHigh
|
- alert: KubeletPlegDurationHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
|
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
|
||||||
|
of {{ $value }} seconds on node {{ $labels.node }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||||
expr: |
|
expr: |
|
||||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||||
|
@ -499,7 +535,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletPodStartUpLatencyHigh
|
- alert: KubeletPodStartUpLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
|
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||||
|
on node {{ $labels.node }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
|
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
|
||||||
|
|
|
@ -3,7 +3,8 @@ groups:
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
@ -18,7 +19,8 @@ groups:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
@ -33,7 +35,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left.
expr: |
(
@ -46,7 +49,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left.
expr: |
(
@ -59,7 +63,8 @@ groups:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
@ -74,7 +79,8 @@ groups:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
@ -89,7 +95,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
@ -102,7 +109,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
@ -115,7 +123,8 @@ groups:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
@ -124,7 +133,8 @@ groups:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
@ -149,7 +159,8 @@ groups:
severity: warning
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
@ -168,7 +179,8 @@ groups:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is
configured on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0

@ -14,8 +14,10 @@ groups:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
description: Alert notification queue of Prometheus {{$labels.instance}} is
running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -29,8 +31,10 @@ groups:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
summary: Prometheus has encountered more than 1% errors sending alerts to a
specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@ -44,7 +48,8 @@ groups:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
@ -70,7 +75,8 @@ groups:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@ -79,7 +85,8 @@ groups:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@ -97,7 +104,8 @@ groups:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@ -106,7 +114,8 @@ groups:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@ -115,7 +124,8 @@ groups:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
$value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@ -134,7 +144,8 @@ groups:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
$value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@ -150,8 +161,12 @@ groups:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -165,7 +180,8 @@ groups:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
"%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@ -174,7 +190,8 @@ groups:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@ -183,8 +200,10 @@ groups:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m

@ -3,7 +3,8 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: ThanosCompactMultipleRunning
|
- alert: ThanosCompactMultipleRunning
|
||||||
annotations:
|
annotations:
|
||||||
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
|
message: No more than one Thanos Compact instance should be running at once.
|
||||||
|
There are {{ $value }}
|
||||||
expr: sum(up{job=~"thanos-compact.*"}) > 1
|
expr: sum(up{job=~"thanos-compact.*"}) > 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
|
@ -17,7 +18,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosCompactHighCompactionFailures
|
- alert: ThanosCompactHighCompactionFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
|
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
|
||||||
|
}}% of compactions.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
|
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||||
|
@ -30,7 +32,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosCompactBucketHighOperationFailures
|
- alert: ThanosCompactBucketHighOperationFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value
|
||||||
|
| humanize }}% of operations.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||||
|
@ -44,14 +47,16 @@ groups:
|
||||||
- alert: ThanosCompactHasNotRun
|
- alert: ThanosCompactHasNotRun
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||||
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
|
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
|
||||||
|
/ 60 / 60 > 24
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: thanos-query.rules
|
- name: thanos-query.rules
|
||||||
rules:
|
rules:
|
||||||
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
|
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of "query" requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
||||||
|
@ -63,7 +68,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
|
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of "query_range" requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||||
|
@ -75,7 +81,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosQueryGrpcServerErrorRate
|
- alert: ThanosQueryGrpcServerErrorRate
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
||||||
|
@ -88,7 +95,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosQueryGrpcClientErrorRate
|
- alert: ThanosQueryGrpcClientErrorRate
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
|
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||||
|
@ -100,7 +108,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosQueryHighDNSFailures
|
- alert: ThanosQueryHighDNSFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing
|
||||||
|
DNS queries for store endpoints.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||||
|
@ -112,7 +121,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosQueryInstantLatencyHigh
|
- alert: ThanosQueryInstantLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for instant queries.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||||
|
@ -124,7 +134,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosQueryRangeLatencyHigh
|
- alert: ThanosQueryRangeLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for range queries.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||||
|
@ -138,7 +149,8 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||||
|
@ -150,7 +162,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosReceiveHttpRequestLatencyHigh
|
- alert: ThanosReceiveHttpRequestLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
|
||||||
|
$value }} seconds for requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||||
|
@ -162,7 +175,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosReceiveHighReplicationFailures
|
- alert: ThanosReceiveHighReplicationFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
|
||||||
|
humanize }}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
thanos_receive_replication_factor > 1
|
thanos_receive_replication_factor > 1
|
||||||
and
|
and
|
||||||
|
@ -184,7 +198,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosReceiveHighForwardRequestFailures
|
- alert: ThanosReceiveHighForwardRequestFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||||
|
@ -196,7 +211,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosReceiveHighHashringFileRefreshFailures
|
- alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
|
||||||
|
{{ $value | humanize }} of attempts failed.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||||
|
@ -209,14 +225,17 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosReceiveConfigReloadFailure
|
- alert: ThanosReceiveConfigReloadFailure
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
message: Thanos Receive {{$labels.job}} has not been able to reload hashring
|
||||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
configurations.
|
||||||
|
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"})
|
||||||
|
by (job) != 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosReceiveNoUpload
|
- alert: ThanosReceiveNoUpload
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
|
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
|
||||||
|
latest data to object storage.
|
||||||
expr: |
|
expr: |
|
||||||
(up{job=~"thanos-receive.*"} - 1)
|
(up{job=~"thanos-receive.*"} - 1)
|
||||||
+ on (instance) # filters to only alert on current instance last 3h
|
+ on (instance) # filters to only alert on current instance last 3h
|
||||||
|
@ -236,7 +255,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosSidecarUnhealthy
|
- alert: ThanosSidecarUnhealthy
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
|
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{
|
||||||
|
$value }} seconds.
|
||||||
expr: |
|
expr: |
|
||||||
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
|
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
|
||||||
labels:
|
labels:
|
||||||
|
@ -245,7 +265,8 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
- alert: ThanosStoreGrpcErrorRate
|
- alert: ThanosStoreGrpcErrorRate
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
||||||
|
@ -258,7 +279,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosStoreSeriesGateLatencyHigh
|
- alert: ThanosStoreSeriesGateLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
|
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for store series gate requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||||
|
@ -270,7 +292,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosStoreBucketHighOperationFailures
|
- alert: ThanosStoreBucketHighOperationFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value
|
||||||
|
| humanize }}% of operations.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||||
|
@ -283,7 +306,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosStoreObjstoreOperationLatencyHigh
|
- alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
|
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of
|
||||||
|
{{ $value }} seconds for the bucket operations.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||||
|
@ -305,7 +329,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosRuleSenderIsFailingAlerts
|
- alert: ThanosRuleSenderIsFailingAlerts
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
|
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts
|
||||||
|
to alertmanager.
|
||||||
expr: |
|
expr: |
|
||||||
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@ -313,7 +338,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosRuleHighRuleEvaluationFailures
|
- alert: ThanosRuleHighRuleEvaluationFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
|
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
|
||||||
|
rules.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
|
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||||
|
@ -326,7 +352,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosRuleHighRuleEvaluationWarnings
|
- alert: ThanosRuleHighRuleEvaluationWarnings
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
|
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
|
||||||
|
warnings.
|
||||||
expr: |
|
expr: |
|
||||||
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -334,7 +361,8 @@ groups:
|
||||||
severity: info
|
severity: info
|
||||||
- alert: ThanosRuleRuleEvaluationLatencyHigh
|
- alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
|
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
|
||||||
|
than interval for {{$labels.rule_group}}.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
||||||
|
@ -346,7 +374,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosRuleGrpcErrorRate
|
- alert: ThanosRuleGrpcErrorRate
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
||||||
|
@@ -360,13 +389,15 @@ groups:
|
||||||
- alert: ThanosRuleConfigReloadFailure
|
- alert: ThanosRuleConfigReloadFailure
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
|
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by
|
||||||
|
(job) != 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
- alert: ThanosRuleQueryHighDNSFailures
|
- alert: ThanosRuleQueryHighDNSFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
|
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
|
||||||
|
DNS queries for query endpoints.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||||
|
@@ -379,7 +410,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosRuleAlertmanagerHighDNSFailures
|
- alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
|
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
|
||||||
|
DNS queries for Alertmanager endpoints.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||||
|
@@ -392,7 +424,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: ThanosRuleNoEvaluationFor10Intervals
|
- alert: ThanosRuleNoEvaluationFor10Intervals
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
|
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
|
||||||
|
that did not evaluate for at least 10x of their expected interval.
|
||||||
expr: |
|
expr: |
|
||||||
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
||||||
>
|
>
|
||||||
|
@@ -402,7 +435,8 @@ groups:
|
||||||
severity: info
|
severity: info
|
||||||
- alert: ThanosNoRuleEvaluations
|
- alert: ThanosNoRuleEvaluations
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
|
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in
|
||||||
|
the past 2 minutes.
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
||||||
and
|
and
|
||||||
|
@@ -472,7 +506,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosBucketReplicateErrorRate
|
- alert: ThanosBucketReplicateErrorRate
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.
|
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
|
||||||
|
failed.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
|
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
|
||||||
|
@@ -484,7 +519,8 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: ThanosBucketReplicateRunLatency
|
- alert: ThanosBucketReplicateRunLatency
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
|
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
|
||||||
|
$value }} seconds for the replicate operations.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|
||||||
|
|
|
@@ -59,7 +59,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephMdsMissingReplicas
|
alert: CephMdsMissingReplicas
|
||||||
annotations:
|
annotations:
|
||||||
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
|
description: Minimum required replicas for storage metadata service not available.
|
||||||
|
Might affect the working of storage cluster.
|
||||||
message: Insufficient replicas for storage metadata service.
|
message: Insufficient replicas for storage metadata service.
|
||||||
severity_level: warning
|
severity_level: warning
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -93,7 +94,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephMonHighNumberOfLeaderChanges
|
alert: CephMonHighNumberOfLeaderChanges
|
||||||
annotations:
|
annotations:
|
||||||
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
|
||||||
|
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
||||||
message: Storage Cluster has seen many leader changes recently.
|
message: Storage Cluster has seen many leader changes recently.
|
||||||
severity_level: warning
|
severity_level: warning
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -129,7 +131,9 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephOSDCriticallyFull
|
alert: CephOSDCriticallyFull
|
||||||
annotations:
|
annotations:
|
||||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
|
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
|
||||||
|
crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or
|
||||||
|
expand the storage cluster or contact support.
|
||||||
message: Back-end storage device is critically full.
|
message: Back-end storage device is critically full.
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -145,7 +149,9 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephOSDNearFull
|
alert: CephOSDNearFull
|
||||||
annotations:
|
annotations:
|
||||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
|
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
|
||||||
|
crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage
|
||||||
|
cluster or contact support.
|
||||||
message: Back-end storage device is nearing full.
|
message: Back-end storage device is nearing full.
|
||||||
severity_level: warning
|
severity_level: warning
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -161,7 +167,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephOSDDiskNotResponding
|
alert: CephOSDDiskNotResponding
|
||||||
annotations:
|
annotations:
|
||||||
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
|
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
|
||||||
|
}}.
|
||||||
message: Disk not responding
|
message: Disk not responding
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -177,7 +184,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephOSDDiskUnavailable
|
alert: CephOSDDiskUnavailable
|
||||||
annotations:
|
annotations:
|
||||||
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
|
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
|
||||||
|
}}.
|
||||||
message: Disk not accessible
|
message: Disk not accessible
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
|
@@ -227,8 +235,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: PersistentVolumeUsageNearFull
|
alert: PersistentVolumeUsageNearFull
|
||||||
annotations:
|
annotations:
|
||||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
|
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
|
||||||
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
|
Free up some space or expand the PVC.
|
||||||
|
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
|
||||||
|
or PVC expansion is required.
|
||||||
severity_level: warning
|
severity_level: warning
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
expr: |
|
expr: |
|
||||||
|
@@ -243,8 +253,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: PersistentVolumeUsageCritical
|
alert: PersistentVolumeUsageCritical
|
||||||
annotations:
|
annotations:
|
||||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
|
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
|
||||||
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
|
Free up some space or expand the PVC immediately.
|
||||||
|
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
|
||||||
|
or PVC expansion is required.
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
expr: |
|
expr: |
|
||||||
|
@@ -327,8 +339,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephClusterNearFull
|
alert: CephClusterNearFull
|
||||||
annotations:
|
annotations:
|
||||||
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
|
description: Storage cluster utilization has crossed 75% and will become read-only
|
||||||
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
|
at 85%. Free up some space or expand the storage cluster.
|
||||||
|
message: Storage cluster is nearing full. Data deletion or cluster expansion is
|
||||||
|
required.
|
||||||
severity_level: warning
|
severity_level: warning
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
expr: |
|
expr: |
|
||||||
|
@@ -343,8 +357,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephClusterCriticallyFull
|
alert: CephClusterCriticallyFull
|
||||||
annotations:
|
annotations:
|
||||||
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
|
description: Storage cluster utilization has crossed 80% and will become read-only
|
||||||
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
|
at 85%. Free up some space or expand the storage cluster immediately.
|
||||||
|
message: Storage cluster is critically full and needs immediate data deletion or
|
||||||
|
cluster expansion.
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
expr: |
|
expr: |
|
||||||
|
@@ -359,8 +375,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CephClusterReadOnly
|
alert: CephClusterReadOnly
|
||||||
annotations:
|
annotations:
|
||||||
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
|
description: Storage cluster utilization has crossed 85% and will become read-only
|
||||||
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
|
now. Free up some space or expand the storage cluster immediately.
|
||||||
|
message: Storage cluster is read-only now and needs immediate data deletion or cluster
|
||||||
|
expansion.
|
||||||
severity_level: error
|
severity_level: error
|
||||||
storage_type: ceph
|
storage_type: ceph
|
||||||
expr: |
|
expr: |
|
||||||
|
|
|
@@ -39,7 +39,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSLatencyHigh
|
alert: CoreDNSLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
|
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
|
||||||
|
{{ $labels.server }} zone {{ $labels.zone }} .
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
|
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
|
||||||
|
@@ -54,7 +55,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSErrorsHigh
|
alert: CoreDNSErrorsHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
|
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||||
|
requests.
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||||
|
@@ -71,7 +73,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSErrorsHigh
|
alert: CoreDNSErrorsHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
|
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||||
|
requests.
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||||
|
@@ -90,7 +93,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSForwardLatencyHigh
|
alert: CoreDNSForwardLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
|
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
|
||||||
|
requests to {{ $labels.to }}.
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
|
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
|
||||||
|
@@ -105,7 +109,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSForwardErrorsHigh
|
alert: CoreDNSForwardErrorsHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
|
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||||
|
forward requests to {{ $labels.to }}.
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||||
|
@@ -122,7 +127,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CoreDNSForwardErrorsHigh
|
alert: CoreDNSForwardErrorsHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
|
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||||
|
forward requests to {{ $labels.to }}.
|
||||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||||
|
|
File diff suppressed because it is too large
|
@@ -56,7 +56,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdNoLeader
|
alert: etcdNoLeader
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no
|
||||||
|
leader.'
|
||||||
expr: |
|
expr: |
|
||||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||||
for: 1m
|
for: 1m
|
||||||
|
@@ -69,7 +70,9 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfLeaderChanges
|
alert: etcdHighNumberOfLeaderChanges
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the
|
||||||
|
last 15 minutes. Frequent elections may be a sign of insufficient resources, high
|
||||||
|
network latency, or disruptions by other components and should be investigated.'
|
||||||
expr: |
|
expr: |
|
||||||
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@@ -82,7 +85,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfFailedGRPCRequests
|
alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
|
||||||
|
}} failed on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||||
/
|
/
|
||||||
|
@@ -98,7 +102,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfFailedGRPCRequests
|
alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
|
||||||
|
}} failed on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||||
/
|
/
|
||||||
|
@@ -114,7 +119,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdGRPCRequestsSlow
|
alert: etcdGRPCRequestsSlow
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
|
||||||
|
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
|
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
|
||||||
> 0.15
|
> 0.15
|
||||||
|
@@ -128,7 +134,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdMemberCommunicationSlow
|
alert: etcdMemberCommunicationSlow
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
|
||||||
|
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
> 0.15
|
> 0.15
|
||||||
|
@@ -142,7 +149,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfFailedProposals
|
alert: etcdHighNumberOfFailedProposals
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
|
||||||
|
the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@@ -155,7 +163,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighFsyncDurations
|
alert: etcdHighFsyncDurations
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{
|
||||||
|
$value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
> 0.5
|
> 0.5
|
||||||
|
@@ -169,7 +178,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighCommitDurations
|
alert: etcdHighCommitDurations
|
||||||
annotations:
|
annotations:
|
||||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{
|
||||||
|
$value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
> 0.25
|
> 0.25
|
||||||
|
@@ -183,7 +193,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfFailedHTTPRequests
|
alert: etcdHighNumberOfFailedHTTPRequests
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||||
|
{{ $labels.instance }}'
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||||
without (code) > 0.01
|
without (code) > 0.01
|
||||||
|
@@ -197,7 +208,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHighNumberOfFailedHTTPRequests
|
alert: etcdHighNumberOfFailedHTTPRequests
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
|
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||||
|
{{ $labels.instance }}.'
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||||
without (code) > 0.05
|
without (code) > 0.05
|
||||||
|
@@ -211,7 +223,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: etcdHTTPRequestsSlow
|
alert: etcdHTTPRequestsSlow
|
||||||
annotations:
|
annotations:
|
||||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
|
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
||||||
|
}} are slow.
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||||
> 0.15
|
> 0.15
|
||||||
|
|
|
@@ -96,7 +96,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterBrickUtilization
|
alert: GlusterBrickUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
|
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||||
|
than 80%
|
||||||
expr: |
|
expr: |
|
||||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
||||||
|
@@ -110,7 +111,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterBrickUtilization
|
alert: GlusterBrickUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
|
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||||
|
than 90%
|
||||||
expr: |
|
expr: |
|
||||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
||||||
|
@@ -126,7 +128,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterThinpoolDataUtilization
|
alert: GlusterThinpoolDataUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
|
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
|
||||||
|
80%
|
||||||
expr: |
|
expr: |
|
||||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@@ -139,7 +142,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterThinpoolDataUtilization
|
alert: GlusterThinpoolDataUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
|
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
|
||||||
|
90%
|
||||||
expr: |
|
expr: |
|
||||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@@ -152,7 +156,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterThinpoolMetadataUtilization
|
alert: GlusterThinpoolMetadataUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
|
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||||
|
than 80%
|
||||||
expr: |
|
expr: |
|
||||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@@ -165,7 +170,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: GlusterThinpoolMetadataUtilization
|
alert: GlusterThinpoolMetadataUtilization
|
||||||
annotations:
|
annotations:
|
||||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
|
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||||
|
than 90%
|
||||||
expr: |
|
expr: |
|
||||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
|
@@ -38,7 +38,9 @@ alert: JaegerAgentHTTPServerErrs
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
||||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job,
|
||||||
|
namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -51,7 +53,9 @@ alert: JaegerClientSpansDropped
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
|
||||||
|
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -64,7 +68,9 @@ alert: JaegerAgentSpansDropped
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
|
||||||
|
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by
|
||||||
|
(instance, job, namespace)> 1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -90,7 +96,9 @@ alert: JaegerCollectorDroppingSpans
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job,
|
||||||
|
namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
|
||||||
|
job, namespace)> 1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -103,7 +111,9 @@ alert: JaegerSamplingUpdateFailing
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
||||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job,
|
||||||
|
namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -116,7 +126,8 @@ alert: JaegerCollectorPersistenceSlow
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
||||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
|
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
|
||||||
|
> 0.5
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -129,7 +140,9 @@ alert: JaegerThrottlingUpdateFailing
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
||||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job,
|
||||||
|
namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -142,7 +155,9 @@ alert: JaegerQueryReqsFailing
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
|
||||||
|
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
|
||||||
|
namespace)> 1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -155,7 +170,9 @@ alert: JaegerCassandraWritesFailing
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
|
||||||
|
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@@ -168,7 +185,9 @@ alert: JaegerCassandraReadsFailing
|
||||||
annotations:
|
annotations:
|
||||||
message: |
|
message: |
|
||||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
|
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
|
||||||
|
/ sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)>
|
||||||
|
1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
@@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CockroachInstanceFlapping
|
alert: CockroachInstanceFlapping
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
|
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{
|
||||||
|
$value }} time(s) in 10m'
|
||||||
expr: |
|
expr: |
|
||||||
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
|
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
|
||||||
for: 1m
|
for: 1m
|
||||||
|
@@ -64,7 +65,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CockroachStoreDiskLow
|
alert: CockroachStoreDiskLow
|
||||||
annotations:
|
annotations:
|
||||||
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
|
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }}
|
||||||
|
available disk fraction
|
||||||
expr: |
|
expr: |
|
||||||
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
|
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
|
||||||
for: 30m
|
for: 30m
|
||||||
|
@@ -116,7 +118,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: CockroachHighOpenFDCount
|
alert: CockroachHighOpenFDCount
|
||||||
annotations:
|
annotations:
|
||||||
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
|
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }}
|
||||||
|
fraction used'
|
||||||
expr: |
|
expr: |
|
||||||
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
|
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
|
||||||
for: 10m
|
for: 10m
|
||||||
|
|
|
@@ -23,7 +23,10 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeStateMetricsListErrors
|
alert: KubeStateMetricsListErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
description: kube-state-metrics is experiencing errors at an elevated rate in list
|
||||||
|
operations. This is likely causing it to not be able to expose metrics about Kubernetes
|
||||||
|
objects correctly or at all.
|
||||||
|
summary: kube-state-metrics is experiencing errors in list operations.
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
/
|
/
|
||||||
|
@@ -39,7 +42,10 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeStateMetricsWatchErrors
|
alert: KubeStateMetricsWatchErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
description: kube-state-metrics is experiencing errors at an elevated rate in watch
|
||||||
|
operations. This is likely causing it to not be able to expose metrics about Kubernetes
|
||||||
|
objects correctly or at all.
|
||||||
|
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
/
|
/
|
||||||
|
|
|
@@ -24,7 +24,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubePodCrashLooping
|
alert: KubePodCrashLooping
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
|
||||||
|
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||||
expr: |
|
expr: |
|
||||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
||||||
|
@@ -39,7 +40,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubePodNotReady
|
alert: KubePodNotReady
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
|
||||||
|
for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod) (
|
sum by (namespace, pod) (
|
||||||
|
@@ -60,7 +62,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeDeploymentGenerationMismatch
|
alert: KubeDeploymentGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
|
}} does not match, this indicates that the Deployment has failed but has not been
|
||||||
|
rolled back.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
||||||
expr: |
|
expr: |
|
||||||
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
||||||
|
@@ -77,7 +81,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeDeploymentReplicasMismatch
|
alert: KubeDeploymentReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
|
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
|
||||||
|
the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
|
@@ -100,7 +105,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeStatefulSetReplicasMismatch
|
alert: KubeStatefulSetReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
|
||||||
|
the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
|
@@ -123,7 +129,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeStatefulSetGenerationMismatch
|
alert: KubeStatefulSetGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||||
|
}} does not match, this indicates that the StatefulSet has failed but has not
|
||||||
|
been rolled back.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
||||||
expr: |
|
expr: |
|
||||||
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
||||||
|
@@ -140,7 +148,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeStatefulSetUpdateNotRolledOut
|
alert: KubeStatefulSetUpdateNotRolledOut
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
|
||||||
|
not been rolled out.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
|
@@ -171,7 +180,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeDaemonSetRolloutStuck
|
alert: KubeDaemonSetRolloutStuck
|
||||||
annotations:
|
annotations:
|
||||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
|
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
|
||||||
|
or progressed for at least 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
|
@@ -208,7 +218,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeContainerWaiting
|
alert: KubeContainerWaiting
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
|
||||||
|
has been in waiting state for longer than 1 hour.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
||||||
|
@@ -223,7 +234,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeDaemonSetNotScheduled
|
alert: KubeDaemonSetNotScheduled
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are not scheduled.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
||||||
expr: |
|
expr: |
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||||
|
@@ -240,7 +252,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: KubeDaemonSetMisScheduled
|
alert: KubeDaemonSetMisScheduled
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
|
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are running where they are not supposed to run.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||||
expr: |
|
expr: |
|
||||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||||
|
@@ -255,7 +268,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeJobCompletion
 annotations:
-message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
+message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
+12 hours to complete.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
 expr: |
 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -285,7 +299,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeHpaReplicasMismatch
 annotations:
-message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
+message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
+number of replicas for longer than 15 minutes.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
 expr: |
 (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@@ -304,7 +319,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeHpaMaxedOut
 annotations:
-message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
+message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas
+for longer than 15 minutes.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
 expr: |
 kube_hpa_status_current_replicas{job="kube-state-metrics"}
@@ -323,7 +339,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeCPUOvercommit
 annotations:
-message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
+message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
+node failure.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
 expr: |
 sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@@ -342,7 +359,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeMemoryOvercommit
 annotations:
-message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
+message: Cluster has overcommitted memory resource requests for Pods and cannot
+tolerate node failure.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
 expr: |
 sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@@ -399,7 +417,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeQuotaFullyUsed
 annotations:
-message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
+message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
+}} of its {{ $labels.resource }} quota.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
 expr: |
 kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -417,7 +436,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: CPUThrottlingHigh
 annotations:
-message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
+message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
+}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
 expr: |
 sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@@ -437,7 +457,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubePersistentVolumeFillingUp
 annotations:
-message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
+message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in
+Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
 expr: |
 kubelet_volume_stats_available_bytes{job="kubelet"}
@@ -455,7 +476,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubePersistentVolumeFillingUp
 annotations:
-message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
+message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
+}} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
+Currently {{ $value | humanizePercentage }} is available.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
 expr: |
 (
@@ -476,7 +499,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubePersistentVolumeErrors
 annotations:
-message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
+message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase
+}}.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
 expr: |
 kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@@ -493,7 +517,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeVersionMismatch
 annotations:
-message: There are {{ $value }} different semantic versions of Kubernetes components running.
+message: There are {{ $value }} different semantic versions of Kubernetes components
+running.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
 expr: |
 count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@@ -508,7 +533,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeClientErrors
 annotations:
-message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
+message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}'
+is experiencing {{ $value | humanizePercentage }} errors.'
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
 expr: |
 (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@@ -606,7 +632,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeClientCertificateExpiration
 annotations:
-message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
+message: A client certificate used to authenticate to the apiserver is expiring
+in less than 7.0 days.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
 expr: |
 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@@ -620,7 +647,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeClientCertificateExpiration
 annotations:
-message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
+message: A client certificate used to authenticate to the apiserver is expiring
+in less than 24.0 hours.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
 expr: |
 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
@@ -634,7 +662,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: AggregatedAPIErrors
 annotations:
-message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
+message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
+errors. The number of errors have increased for it in the past five minutes. High
+values indicate that the availability of the service changes too often.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
 expr: |
 sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@@ -648,7 +678,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: AggregatedAPIDown
 annotations:
-message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
+message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only
+{{ $value | humanize }}% available over the last 5m.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
 expr: |
 (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@@ -709,7 +740,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeletTooManyPods
 annotations:
-message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
+message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
+}} of its Pod capacity.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
 expr: |
 count by(node) (
@@ -730,7 +762,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeNodeReadinessFlapping
 annotations:
-message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
+message: The readiness status of node {{ $labels.node }} has changed {{ $value }}
+times in the last 15 minutes.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
 expr: |
 sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@@ -745,7 +778,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeletPlegDurationHigh
 annotations:
-message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
+message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
+of {{ $value }} seconds on node {{ $labels.node }}.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
 expr: |
 node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@@ -760,7 +794,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
 {{< code lang="yaml" >}}
 alert: KubeletPodStartUpLatencyHigh
 annotations:
-message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
+message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on
+node {{ $labels.node }}.
 runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
 expr: |
 histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60

@@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
 {{< code lang="yaml" >}}
 alert: NodeFilesystemSpaceFillingUp
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available space left and is filling up.
 summary: Filesystem is predicted to run out of space within the next 24 hours.
 expr: |
 (
@@ -43,7 +44,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemSpaceFillingUp
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available space left and is filling up fast.
 summary: Filesystem is predicted to run out of space within the next 4 hours.
 expr: |
 (
@@ -63,7 +65,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemAlmostOutOfSpace
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available space left.
 summary: Filesystem has less than 5% space left.
 expr: |
 (
@@ -81,7 +84,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemAlmostOutOfSpace
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available space left.
 summary: Filesystem has less than 3% space left.
 expr: |
 (
@@ -99,7 +103,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemFilesFillingUp
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available inodes left and is filling up.
 summary: Filesystem is predicted to run out of inodes within the next 24 hours.
 expr: |
 (
@@ -119,7 +124,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemFilesFillingUp
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available inodes left and is filling up fast.
 summary: Filesystem is predicted to run out of inodes within the next 4 hours.
 expr: |
 (
@@ -139,7 +145,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemAlmostOutOfFiles
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available inodes left.
 summary: Filesystem has less than 5% inodes left.
 expr: |
 (
@@ -157,7 +164,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeFilesystemAlmostOutOfFiles
 annotations:
-description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
+description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
+{{ printf "%.2f" $value }}% available inodes left.
 summary: Filesystem has less than 3% inodes left.
 expr: |
 (
@@ -175,7 +183,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeNetworkReceiveErrs
 annotations:
-description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+{{ printf "%.0f" $value }} receive errors in the last two minutes.'
 summary: Network interface is reporting many receive errors.
 expr: |
 increase(node_network_receive_errs_total[2m]) > 10
@@ -189,7 +198,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeNetworkTransmitErrs
 annotations:
-description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
 summary: Network interface is reporting many transmit errors.
 expr: |
 increase(node_network_transmit_errs_total[2m]) > 10
@@ -229,7 +239,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeClockSkewDetected
 annotations:
-message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
+message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
+NTP is configured correctly on this host.
 summary: Clock skew detected.
 expr: |
 (
@@ -253,7 +264,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: NodeClockNotSynchronising
 annotations:
-message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
+message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
+on this host.
 summary: Clock not synchronising.
 expr: |
 min_over_time(node_timex_sync_status[5m]) == 0

@@ -35,13 +35,15 @@ labels:
 {{< /code >}}
 
 ##### PrometheusNotificationQueueRunningFull
-Prometheus alert notification queue predicted to run full in less than 30m.
+Prometheus alert notification queue predicted to run full in less than
 
 {{< code lang="yaml" >}}
 alert: PrometheusNotificationQueueRunningFull
 annotations:
-description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
-summary: Prometheus alert notification queue predicted to run full in less than 30m.
+description: Alert notification queue of Prometheus {{$labels.instance}} is running
+full.
+summary: Prometheus alert notification queue predicted to run full in less than
+30m.
 expr: |
 # Without min_over_time, failed scrapes could create false negatives, see
 # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -56,14 +58,17 @@ labels:
 {{< /code >}}
 
 ##### PrometheusErrorSendingAlertsToSomeAlertmanagers
-'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
-Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
+Prometheus has encountered more than 1% errors sending alerts to a specific
 
 {{< code lang="yaml" >}}
 alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
 annotations:
-description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
-summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
+{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
+summary: Prometheus has encountered more than 1% errors sending alerts to a specific
+Alertmanager.
 expr: |
 (
 rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@@ -78,13 +83,14 @@ labels:
 {{< /code >}}
 
 ##### PrometheusErrorSendingAlertsToAnyAlertmanager
-'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
+'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
 Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
 
 {{< code lang="yaml" >}}
 alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
-description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
+description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
+Prometheus {{$labels.instance}} to any Alertmanager.'
 summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
 expr: |
 min without(alertmanager) (
@@ -120,7 +126,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusTSDBReloadsFailing
 annotations:
-description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
+description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
+reload failures over the last 3h.
 summary: Prometheus has issues reloading blocks from disk.
 expr: |
 increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@@ -134,7 +141,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusTSDBCompactionsFailing
 annotations:
-description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
+description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
+compaction failures over the last 3h.
 summary: Prometheus has issues compacting blocks.
 expr: |
 increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@@ -162,7 +170,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusDuplicateTimestamps
 annotations:
-description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
+description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
+samples/s with different values but duplicated timestamp.
 summary: Prometheus is dropping samples with duplicate timestamps.
 expr: |
 rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@@ -176,7 +185,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusOutOfOrderTimestamps
 annotations:
-description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
+description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
+samples/s with timestamps arriving out of order.
 summary: Prometheus drops samples with out-of-order timestamps.
 expr: |
 rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@@ -190,7 +200,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusRemoteStorageFailures
 annotations:
-description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
+description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value
+}}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
 summary: Prometheus fails to send samples to remote storage.
 expr: |
 (
@@ -214,7 +225,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusRemoteWriteBehind
 annotations:
-description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
+description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value
+}}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
 summary: Prometheus remote write is behind.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
@@ -235,8 +247,12 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusRemoteWriteDesiredShards
 annotations:
-description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
-summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
+description: Prometheus {{$labels.instance}} remote write desired shards calculation
+wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
+}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
+$labels.instance | query | first | value }}.
+summary: Prometheus remote write desired shards calculation wants to run more than
+configured max shards.
 expr: |
 # Without max_over_time, failed scrapes could create false negatives, see
 # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -255,7 +271,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusRuleFailures
 annotations:
-description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
+description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f"
+$value }} rules in the last 5m.
 summary: Prometheus is failing rule evaluations.
 expr: |
 increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@@ -269,7 +286,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusMissingRuleEvaluations
 annotations:
-description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
+description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
+}} rule group evaluations in the last 5m.
 summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
 expr: |
 increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@@ -283,8 +301,10 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusTargetLimitHit
 annotations:
-description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
-summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
+description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
+}} targets because the number of targets exceeded the configured target_limit.
+summary: Prometheus has dropped targets because some scrape configs have exceeded
+the targets limit.
 expr: |
 increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
 for: 15m
@@ -296,5 +316,5 @@ labels:
 Following dashboards are generated from mixins and hosted on github:
 
 
-- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
 - [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
+- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)

@@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
 {{< code lang="yaml" >}}
 alert: ThanosCompactMultipleRunning
 annotations:
-message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
+message: No more than one Thanos Compact instance should be running at once. There
+are {{ $value }}
 expr: sum(up{job=~"thanos-compact.*"}) > 1
 for: 5m
 labels:
@@ -47,7 +48,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosCompactHighCompactionFailures
 annotations:
-message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
+message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
+}}% of compactions.
 expr: |
 (
 sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@@ -65,7 +67,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosCompactBucketHighOperationFailures
 annotations:
-message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
+message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value |
+humanize }}% of operations.
 expr: |
 (
 sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
@@ -84,7 +87,8 @@ labels:
 alert: ThanosCompactHasNotRun
 annotations:
 message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
-expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
+expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
+/ 60 / 60 > 24
 labels:
 severity: warning
 {{< /code >}}
@@ -96,7 +100,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosQueryHttpRequestQueryErrorRateHigh
 annotations:
-message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
+message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
+}}% of "query" requests.
 expr: |
 (
 sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
@@ -113,7 +118,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
 annotations:
-message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
+message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
+}}% of "query_range" requests.
 expr: |
 (
 sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
@@ -130,7 +136,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosQueryGrpcServerErrorRate
 annotations:
-message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
+message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
+}}% of requests.
 expr: |
 (
 sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
@@ -148,7 +155,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: ThanosQueryGrpcClientErrorRate
 annotations:
-message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
+message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}%
+of requests.
 expr: |
 (
 sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosQueryHighDNSFailures
|
alert: ThanosQueryHighDNSFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS
|
||||||
|
queries for store endpoints.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||||
|
@ -182,7 +191,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosQueryInstantLatencyHigh
|
alert: ThanosQueryInstantLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for instant queries.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||||
|
@ -199,7 +209,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosQueryRangeLatencyHigh
|
alert: ThanosQueryRangeLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for range queries.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||||
|
@ -218,7 +229,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveHttpRequestErrorRateHigh
|
alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||||
|
@ -235,7 +247,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveHttpRequestLatencyHigh
|
alert: ThanosReceiveHttpRequestLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||||
|
}} seconds for requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||||
|
@ -252,7 +265,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveHighReplicationFailures
|
alert: ThanosReceiveHighReplicationFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
thanos_receive_replication_factor > 1
|
thanos_receive_replication_factor > 1
|
||||||
and
|
and
|
||||||
|
@ -279,7 +293,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveHighForwardRequestFailures
|
alert: ThanosReceiveHighForwardRequestFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
|
||||||
|
}}% of requests.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||||
|
@ -296,7 +311,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveHighHashringFileRefreshFailures
|
alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{
|
||||||
|
$value | humanize }} of attempts failed.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||||
|
@ -315,7 +331,8 @@ labels:
|
||||||
alert: ThanosReceiveConfigReloadFailure
|
alert: ThanosReceiveConfigReloadFailure
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
||||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by
|
||||||
|
(job) != 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -326,7 +343,8 @@ labels:
|
||||||
{{< code lang="yaml" >}}
|
{{< code lang="yaml" >}}
|
||||||
alert: ThanosReceiveNoUpload
|
alert: ThanosReceiveNoUpload
|
||||||
annotations:
|
annotations:
|
||||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
|
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
|
||||||
|
latest data to object storage.
|
||||||
expr: |
|
expr: |
|
||||||
(up{job=~"thanos-receive.*"} - 1)
|
(up{job=~"thanos-receive.*"} - 1)
|
||||||
+ on (instance) # filters to only alert on current instance last 3h
|
+ on (instance) # filters to only alert on current instance last 3h
|
||||||
|
@@ -356,7 +374,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosSidecarUnhealthy
annotations:
  message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
expr: |
  time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
labels:
@@ -370,7 +389,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreGrpcErrorRate
annotations:
  message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
  (
  sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
@@ -388,7 +408,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreSeriesGateLatencyHigh
annotations:
  message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
expr: |
  (
  histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@@ -405,7 +426,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreBucketHighOperationFailures
annotations:
  message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
expr: |
  (
  sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
@@ -423,7 +445,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
  message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
expr: |
  (
  histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
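The latency alerts in these hunks all follow the same shape: a `histogram_quantile` over per-`le` bucket rates, aggregated by `job`. A minimal sketch of a recording rule that precomputes the value the `ThanosStoreObjstoreOperationLatencyHigh` expression compares against; the record name and group name are assumptions, not part of the mixin.

{{< code lang="yaml" >}}
groups:
  - name: thanos-store-latency-records   # group name is an assumption
    rules:
      # Hypothetical record precomputing the quantile used by ThanosStoreObjstoreOperationLatencyHigh.
      - record: job:thanos_objstore_bucket_operation_duration_seconds:p90
        expr: |
          histogram_quantile(
            0.9,
            sum by (job, le) (
              rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])
            )
          )
{{< /code >}}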
@@ -452,12 +475,13 @@ labels:
{{< /code >}}

##### ThanosRuleSenderIsFailingAlerts
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.

{{< code lang="yaml" >}}
alert: ThanosRuleSenderIsFailingAlerts
annotations:
  message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
expr: |
  sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
@@ -488,7 +512,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
  message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
expr: |
  sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m
@@ -501,7 +526,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleRuleEvaluationLatencyHigh
annotations:
  message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
expr: |
  (
  sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
@@ -518,7 +544,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleGrpcErrorRate
annotations:
  message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
  (
  sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
@@ -537,7 +564,8 @@ labels:
alert: ThanosRuleConfigReloadFailure
annotations:
  message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
for: 5m
labels:
  severity: info
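Rules like the one above are easy to exercise offline with `promtool test rules`. A minimal sketch of such a unit test follows; the file names and the concrete `job` value are assumptions, and only the rule itself comes from the hunk above.

{{< code lang="yaml" >}}
# thanos-rule-config-reload_test.yaml (hypothetical), run with: promtool test rules <this file>
rule_files:
  - thanos-rule-alerts.yaml   # assumed to contain the ThanosRuleConfigReloadFailure rule above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Config reload keeps failing (gauge stays at 0) for ten minutes.
      - series: 'thanos_rule_config_last_reload_successful{job="thanos-rule", instance="rule-0"}'
        values: '0x10'
    alert_rule_test:
      - eval_time: 6m   # past the 5m "for" duration, so the alert should be firing
        alertname: ThanosRuleConfigReloadFailure
        exp_alerts:
          - exp_labels:
              severity: info
              job: thanos-rule
            exp_annotations:
              message: Thanos Rule thanos-rule has not been able to reload its configuration.
{{< /code >}}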
@@ -548,7 +576,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleQueryHighDNSFailures
annotations:
  message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
expr: |
  (
  sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -566,7 +595,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
  message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
expr: |
  (
  sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -584,7 +614,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
  message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
expr: |
  time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
  >
@@ -599,7 +630,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosNoRuleEvaluations
annotations:
  message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
expr: |
  sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
  and
@@ -726,7 +758,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosBucketReplicateRunLatency
annotations:
  message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
expr: |
  (
  histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20