
assets,site/content: regenerate

paulfantom 2020-08-13 12:50:10 +02:00
parent df43594957
commit 7fd2bee5a7
No known key found for this signature in database
GPG key ID: 12AE0185401674E7
25 changed files with 1134 additions and 535 deletions
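
Every hunk below shows the same mechanical change: the regenerated YAML re-emits long annotation strings and recording-rule expressions as folded multi-line scalars, wrapped at roughly 80 columns, instead of single long lines. Below is a minimal sketch of that folding behaviour, assuming a PyYAML-style dumper purely for illustration (the actual mixin pipeline may use a different YAML serializer, and the rule shown is a hypothetical cut-down example):

# Illustration only: a YAML dumper with a default line width of ~80 columns
# folds a long scalar onto an indented continuation line, which is the
# rewrapping visible in every hunk of this diff.
import yaml

rule = {
    "alert": "CephNodeDown",
    "annotations": {
        "description": "Storage node {{ $labels.node }} went down. "
                       "Please check the node immediately.",
    },
}

# safe_dump breaks the long description at whitespace near column 80.
print(yaml.safe_dump(rule, default_flow_style=False))

The description is folded onto an indented continuation line, matching the two-line description entries in the Ceph hunks that follow; the content of the rules is unchanged, only the line layout differs.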


@@ -27,7 +27,8 @@ groups:
rules:
- alert: CephMdsMissingReplicas
annotations:
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
description: Minimum required replicas for storage metadata service not available.
Might affect the working of storage cluster.
message: Insufficient replicas for storage metadata service.
severity_level: warning
storage_type: ceph
@@ -51,7 +52,8 @@ groups:
severity: critical
- alert: CephMonHighNumberOfLeaderChanges
annotations:
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
message: Storage Cluster has seen many leader changes recently.
severity_level: warning
storage_type: ceph
@@ -64,7 +66,8 @@ groups:
rules:
- alert: CephNodeDown
annotations:
description: Storage node {{ $labels.node }} went down. Please check the node immediately.
description: Storage node {{ $labels.node }} went down. Please check the node
immediately.
message: Storage node {{ $labels.node }} went down
severity_level: error
storage_type: ceph
@@ -77,7 +80,9 @@ groups:
rules:
- alert: CephOSDCriticallyFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space
or expand the storage cluster or contact support.
message: Back-end storage device is critically full.
severity_level: error
storage_type: ceph
@@ -88,7 +93,9 @@ groups:
severity: critical
- alert: CephOSDNearFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand
the storage cluster or contact support.
message: Back-end storage device is nearing full.
severity_level: warning
storage_type: ceph
@@ -99,7 +106,8 @@ groups:
severity: warning
- alert: CephOSDDiskNotResponding
annotations:
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
}}.
message: Disk not responding
severity_level: error
storage_type: ceph
@@ -110,7 +118,8 @@ groups:
severity: critical
- alert: CephOSDDiskUnavailable
annotations:
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
}}.
message: Disk not accessible
severity_level: error
storage_type: ceph
@@ -145,8 +154,10 @@ groups:
rules:
- alert: PersistentVolumeUsageNearFull
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
or PVC expansion is required.
severity_level: warning
storage_type: ceph
expr: |
@@ -156,8 +167,10 @@ groups:
severity: warning
- alert: PersistentVolumeUsageCritical
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
or PVC expansion is required.
severity_level: error
storage_type: ceph
expr: |
@@ -191,7 +204,8 @@ groups:
severity: warning
- alert: CephOSDVersionMismatch
annotations:
description: There are {{ $value }} different versions of Ceph OSD components running.
description: There are {{ $value }} different versions of Ceph OSD components
running.
message: There are multiple versions of storage services running.
severity_level: warning
storage_type: ceph
@@ -202,7 +216,8 @@ groups:
severity: warning
- alert: CephMonVersionMismatch
annotations:
description: There are {{ $value }} different versions of Ceph Mon components running.
description: There are {{ $value }} different versions of Ceph Mon components
running.
message: There are multiple versions of storage services running.
severity_level: warning
storage_type: ceph
@@ -215,8 +230,10 @@ groups:
rules:
- alert: CephClusterNearFull
annotations:
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
description: Storage cluster utilization has crossed 75% and will become read-only
at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion
is required.
severity_level: warning
storage_type: ceph
expr: |
@@ -226,8 +243,10 @@ groups:
severity: warning
- alert: CephClusterCriticallyFull
annotations:
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 80% and will become read-only
at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion
or cluster expansion.
severity_level: error
storage_type: ceph
expr: |
@@ -237,8 +256,10 @@ groups:
severity: critical
- alert: CephClusterReadOnly
annotations:
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 85% and will become read-only
now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion
or cluster expansion.
severity_level: error
storage_type: ceph
expr: |


@@ -12,7 +12,8 @@ groups:
severity: critical
- alert: CoreDNSLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
{{ $labels.server }} zone {{ $labels.zone }} .
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
@@ -21,7 +22,8 @@ groups:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -32,7 +34,8 @@ groups:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -45,7 +48,8 @@ groups:
rules:
- alert: CoreDNSForwardLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
@@ -54,7 +58,8 @@ groups:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -65,7 +70,8 @@ groups:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))


@@ -107,7 +107,8 @@ groups:
severity: warning
- alert: CortexIngesterRestarts
annotations:
message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.'
message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
$value }} times in the last 30 mins.'
expr: |
changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1
labels:
@@ -278,7 +279,8 @@ groups:
rules:
- alert: CortexGossipMembersMismatch
annotations:
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.'
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of
gossip members.'
expr: |
memberlist_client_cluster_members_count
!= on (cluster, namespace) group_left
@@ -290,7 +292,8 @@ groups:
rules:
- alert: CortexIngesterHasNotShippedBlocks
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
not shipped any block in the last 4 hours.
expr: |
(min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4)
and
@@ -302,7 +305,8 @@ groups:
severity: critical
- alert: CortexIngesterHasNotShippedBlocksSinceStart
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
not shipped any block in the last 4 hours.
expr: |
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0)
and
@@ -312,7 +316,8 @@ groups:
severity: critical
- alert: CortexIngesterTSDBHeadCompactionFailed
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing
to compact TSDB head.
expr: |
rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
for: 15m
@@ -320,7 +325,8 @@ groups:
severity: critical
- alert: CortexQuerierHasNotScanTheBucket
annotations:
message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.
message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not
successfully scanned the bucket since {{ $value | humanizeDuration }}.
expr: |
(time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
and
@@ -330,7 +336,9 @@ groups:
severity: critical
- alert: CortexQuerierHighRefetchRate
annotations:
message: Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.
message: Cortex Queries in {{ $labels.namespace }} are refetching series from
different store-gateways (because of missing blocks) for the {{ printf "%.0f"
$value }}% of queries.
expr: |
100 * (
(
@@ -347,7 +355,9 @@ groups:
severity: warning
- alert: CortexStoreGatewayHasNotSyncTheBucket
annotations:
message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.
message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }}
has not successfully synched the bucket since {{ $value | humanizeDuration
}}.
expr: |
(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
and
@@ -359,7 +369,8 @@ groups:
rules:
- alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not successfully cleaned up blocks in the last 24 hours.
expr: |
(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24)
and
@@ -369,7 +380,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not successfully cleaned up blocks in the last 24 hours.
expr: |
cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0
for: 24h
@@ -377,7 +389,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotUploadedBlocks
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not uploaded any block in the last 24 hours.
expr: |
(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
and
@@ -387,7 +400,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotUploadedBlocksSinceStart
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not uploaded any block in the last 24 hours.
expr: |
thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
for: 24h


@@ -1,11 +1,14 @@
groups:
- name: cortex_api
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
by (cluster, job)
record: cluster_job:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
@@ -13,185 +16,279 @@ groups:
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
- name: cortex_cache
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
by (cluster, job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds:avg
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method)
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
- name: cortex_storage
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
by (cluster, job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
- expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
- expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
by (cluster, job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
- expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
- expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le,
cluster, job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by
(cluster, job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
- expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
- expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
/ sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query:avg
- expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
- expr: sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
- expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m]))
by (cluster, job, method)
record: cluster_job_method:cortex_database_request_duration_seconds:avg
- expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
- expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
- expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds:avg
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
@@ -199,11 +296,14 @@ groups:
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
- name: cortex_queries
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:50quantile
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
by (cluster, job)
record: cluster_job:cortex_query_frontend_retries:avg
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
@@ -211,23 +311,33 @@ groups:
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
(cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:50quantile
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_series:avg
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
@@ -235,11 +345,14 @@ groups:
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:50quantile
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks:avg
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
@@ -247,11 +360,14 @@ groups:
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:50quantile
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples:avg
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate


@@ -18,7 +18,8 @@ groups:
severity: critical
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
expr: |
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
@@ -26,7 +27,8 @@ groups:
severity: critical
- alert: etcdNoLeader
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has
no leader.'
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
@@ -34,7 +36,9 @@ groups:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
the last 15 minutes. Frequent elections may be a sign of insufficient resources,
high network latency, or disruptions by other components and should be investigated.'
expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
@@ -42,7 +46,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
@@ -53,7 +58,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
@@ -64,7 +70,8 @@ groups:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
@@ -73,7 +80,8 @@ groups:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -82,7 +90,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
the last 30 minutes on etcd instance {{ $labels.instance }}.'
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
@@ -90,7 +99,8 @@ groups:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are
{{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
@@ -99,7 +109,8 @@ groups:
severity: warning
- alert: etcdHighCommitDurations
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
{{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
@ -108,7 +119,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.01
@ -117,7 +129,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}.'
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.05
@ -126,7 +139,8 @@ groups:
severity: critical
- alert: etcdHTTPRequestsSlow
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15

View file

@ -49,7 +49,8 @@ groups:
severity: critical
- alert: GlusterBrickUtilization
annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 80%
expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
@ -58,7 +59,8 @@ groups:
severity: warning
- alert: GlusterBrickUtilization
annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 90%
expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
@ -69,7 +71,8 @@ groups:
rules:
- alert: GlusterThinpoolDataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
than 80%
expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
for: 5m
@ -77,7 +80,8 @@ groups:
severity: warning
- alert: GlusterThinpoolDataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
than 90%
expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
for: 5m
@ -85,7 +89,8 @@ groups:
severity: critical
- alert: GlusterThinpoolMetadataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 80%
expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
for: 5m
@ -93,7 +98,8 @@ groups:
severity: warning
- alert: GlusterThinpoolMetadataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 90%
expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
for: 5m

View file

@ -13,7 +13,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance,
job, namespace)> 1
for: 15m
labels:
severity: warning
@ -21,7 +23,9 @@ groups:
annotations:
message: |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -29,7 +33,9 @@ groups:
annotations:
message: |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m]))
by (instance, job, namespace)> 1
for: 15m
labels:
severity: warning
@ -45,7 +51,9 @@ groups:
annotations:
message: |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
job, namespace)> 1
for: 15m
labels:
severity: warning
@ -53,7 +61,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -61,7 +71,8 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
> 0.5
for: 15m
labels:
severity: warning
@ -69,7 +80,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job,
namespace)> 1
for: 15m
labels:
severity: warning
@ -77,7 +90,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
namespace)> 1
for: 15m
labels:
severity: warning
@ -85,7 +100,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -93,7 +110,9 @@ groups:
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance,
job, namespace)> 1
for: 15m
labels:
severity: warning

View file

@ -3,7 +3,8 @@ groups:
rules:
- alert: CockroachInstanceFlapping
annotations:
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
{{ $value }} time(s) in 10m'
expr: |
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
for: 1m
@ -29,7 +30,8 @@ groups:
severity: warning
- alert: CockroachStoreDiskLow
annotations:
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
}} available disk fraction
expr: |
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
for: 30m
@ -61,7 +63,8 @@ groups:
severity: warning
- alert: CockroachHighOpenFDCount
annotations:
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
}} fraction used'
expr: |
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
for: 10m

View file

@ -3,7 +3,10 @@ groups:
rules:
- alert: KubeStateMetricsListErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in
list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
@ -14,7 +17,10 @@ groups:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in
watch operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/

View file

@ -3,7 +3,8 @@ groups:
rules:
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@ -12,7 +13,8 @@ groups:
severity: warning
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |
sum by (namespace, pod) (
@ -27,7 +29,9 @@ groups:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match; this indicates that the Deployment has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
@ -38,7 +42,8 @@ groups:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: |
(
@ -55,7 +60,8 @@ groups:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: |
(
@ -72,7 +78,9 @@ groups:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match; this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@ -83,7 +91,8 @@ groups:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: |
(
@ -108,7 +117,8 @@ groups:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: |
(
@ -139,7 +149,8 @@ groups:
severity: warning
- alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@ -148,7 +159,8 @@ groups:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@ -159,7 +171,8 @@ groups:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@ -168,7 +181,8 @@ groups:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@ -186,7 +200,8 @@ groups:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@ -199,7 +214,8 @@ groups:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max
replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"}
@ -212,7 +228,8 @@ groups:
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@ -225,7 +242,8 @@ groups:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@ -264,7 +282,8 @@ groups:
severity: warning
- alert: KubeQuotaFullyUsed
annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
@ -276,7 +295,9 @@ groups:
severity: info
- alert: CPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@ -290,7 +311,9 @@ groups:
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
kubelet_volume_stats_available_bytes{job="kubelet"}
@ -302,7 +325,9 @@ groups:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
(
@ -317,7 +342,8 @@ groups:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@ -328,7 +354,8 @@ groups:
rules:
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components running.
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@ -337,7 +364,8 @@ groups:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@ -405,7 +433,8 @@ groups:
rules:
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@ -413,7 +442,8 @@ groups:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
@ -421,7 +451,9 @@ groups:
severity: critical
- alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors has increased for it in the past five minutes.
High values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@ -429,7 +461,8 @@ groups:
severity: warning
- alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
only {{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@ -466,7 +499,8 @@ groups:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
count by(node) (
@ -481,7 +515,8 @@ groups:
severity: warning
- alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
message: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@ -490,7 +525,8 @@ groups:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -499,7 +535,8 @@ groups:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60

View file

@ -3,7 +3,8 @@ groups:
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
@ -18,7 +19,8 @@ groups:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
@ -33,7 +35,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left.
expr: |
(
@ -46,7 +49,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left.
expr: |
(
@ -59,7 +63,8 @@ groups:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
@ -74,7 +79,8 @@ groups:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
@ -89,7 +95,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
@ -102,7 +109,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
@ -115,7 +123,8 @@ groups:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
@ -124,7 +133,8 @@ groups:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
@ -149,7 +159,8 @@ groups:
severity: warning
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
@ -168,7 +179,8 @@ groups:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is
configured on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0

View file

@ -14,8 +14,10 @@ groups:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
description: Alert notification queue of Prometheus {{$labels.instance}} is
running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -29,8 +31,10 @@ groups:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a
specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@ -44,7 +48,8 @@ groups:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
@ -70,7 +75,8 @@ groups:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@ -79,7 +85,8 @@ groups:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@ -97,7 +104,8 @@ groups:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@ -106,7 +114,8 @@ groups:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@ -115,7 +124,8 @@ groups:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
$value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@ -134,7 +144,8 @@ groups:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
$value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@ -150,8 +161,12 @@ groups:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -165,7 +180,8 @@ groups:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
"%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@ -174,7 +190,8 @@ groups:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@ -183,8 +200,10 @@ groups:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m

View file

@ -3,7 +3,8 @@ groups:
rules:
- alert: ThanosCompactMultipleRunning
annotations:
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }} running.
message: No more than one Thanos Compact instance should be running at once.
There are {{ $value }} running.
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
@ -17,7 +18,8 @@ groups:
severity: warning
- alert: ThanosCompactHighCompactionFailures
annotations:
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
}}% of compactions.
expr: |
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@ -30,7 +32,8 @@ groups:
severity: warning
- alert: ThanosCompactBucketHighOperationFailures
annotations:
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value
| humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
@ -44,14 +47,16 @@ groups:
- alert: ThanosCompactHasNotRun
annotations:
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
/ 60 / 60 > 24
labels:
severity: warning
- name: thanos-query.rules
rules:
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of "query" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
@ -63,7 +68,8 @@ groups:
severity: critical
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of "query_range" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
@ -75,7 +81,8 @@ groups:
severity: critical
- alert: ThanosQueryGrpcServerErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
@ -88,7 +95,8 @@ groups:
severity: warning
- alert: ThanosQueryGrpcClientErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize
}}% of requests.
expr: |
(
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
@ -100,7 +108,8 @@ groups:
severity: warning
- alert: ThanosQueryHighDNSFailures
annotations:
message: Thanos Query {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for store endpoints.
message: Thanos Query {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for store endpoints.
expr: |
(
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
@ -112,7 +121,8 @@ groups:
severity: warning
- alert: ThanosQueryInstantLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for instant queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
@ -124,7 +134,8 @@ groups:
severity: critical
- alert: ThanosQueryRangeLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for range queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
@ -138,7 +149,8 @@ groups:
rules:
- alert: ThanosReceiveHttpRequestErrorRateHigh
annotations:
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
@ -150,7 +162,8 @@ groups:
severity: critical
- alert: ThanosReceiveHttpRequestLatencyHigh
annotations:
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
$value }} seconds for requests.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
@ -162,7 +175,8 @@ groups:
severity: critical
- alert: ThanosReceiveHighReplicationFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
humanize }}% of requests.
expr: |
thanos_receive_replication_factor > 1
and
@ -184,7 +198,8 @@ groups:
severity: warning
- alert: ThanosReceiveHighForwardRequestFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
}}% of requests.
expr: |
(
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
@ -196,7 +211,8 @@ groups:
severity: warning
- alert: ThanosReceiveHighHashringFileRefreshFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
{{ $value | humanize }} of attempts failed.
expr: |
(
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
@ -209,14 +225,17 @@ groups:
severity: warning
- alert: ThanosReceiveConfigReloadFailure
annotations:
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
message: Thanos Receive {{$labels.job}} has not been able to reload hashring
configurations.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"})
by (job) != 1
for: 5m
labels:
severity: warning
- alert: ThanosReceiveNoUpload
annotations:
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
latest data to object storage.
expr: |
(up{job=~"thanos-receive.*"} - 1)
+ on (instance) # filters to only alert on current instance last 3h
@ -236,7 +255,8 @@ groups:
severity: critical
- alert: ThanosSidecarUnhealthy
annotations:
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{
$value }} seconds.
expr: |
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
labels:
@ -245,7 +265,8 @@ groups:
rules:
- alert: ThanosStoreGrpcErrorRate
annotations:
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
@ -258,7 +279,8 @@ groups:
severity: warning
- alert: ThanosStoreSeriesGateLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for store series gate requests.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -270,7 +292,8 @@ groups:
severity: warning
- alert: ThanosStoreBucketHighOperationFailures
annotations:
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value
| humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
@ -283,7 +306,8 @@ groups:
severity: warning
- alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of
{{ $value }} seconds for the bucket operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -305,7 +329,8 @@ groups:
severity: critical
- alert: ThanosRuleSenderIsFailingAlerts
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts
to alertmanager.
expr: |
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
@ -313,7 +338,8 @@ groups:
severity: critical
- alert: ThanosRuleHighRuleEvaluationFailures
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
rules.
expr: |
(
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
@ -326,7 +352,8 @@ groups:
severity: critical
- alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has a high number of evaluation warnings.
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has a high number of evaluation
warnings.
expr: |
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m
@ -334,7 +361,8 @@ groups:
severity: info
- alert: ThanosRuleRuleEvaluationLatencyHigh
annotations:
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
than interval for {{$labels.rule_group}}.
expr: |
(
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
@ -346,7 +374,8 @@ groups:
severity: warning
- alert: ThanosRuleGrpcErrorRate
annotations:
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
@ -360,13 +389,15 @@ groups:
- alert: ThanosRuleConfigReloadFailure
annotations:
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by
(job) != 1
for: 5m
labels:
severity: info
- alert: ThanosRuleQueryHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for query endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -379,7 +410,8 @@ groups:
severity: warning
- alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for Alertmanager endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -392,7 +424,8 @@ groups:
severity: warning
- alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
that did not evaluate for at least 10x of their expected interval.
expr: |
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
>
@ -402,7 +435,8 @@ groups:
severity: info
- alert: ThanosNoRuleEvaluations
annotations:
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in
the past 2 minutes.
expr: |
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
and
@ -472,7 +506,8 @@ groups:
severity: critical
- alert: ThanosBucketReplicateErrorRate
annotations:
message: Thanos Replicate is failing to run; {{ $value | humanize }}% of attempts failed.
message: Thanos Replicate is failing to run; {{ $value | humanize }}% of attempts
failed.
expr: |
(
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
@ -484,7 +519,8 @@ groups:
severity: critical
- alert: ThanosBucketReplicateRunLatency
annotations:
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
$value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
View file
@ -59,7 +59,8 @@ labels:
{{< code lang="yaml" >}}
alert: CephMdsMissingReplicas
annotations:
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
description: Minimum required replicas for storage metadata service not available.
Might affect the working of storage cluster.
message: Insufficient replicas for storage metadata service.
severity_level: warning
storage_type: ceph
@ -93,7 +94,8 @@ labels:
{{< code lang="yaml" >}}
alert: CephMonHighNumberOfLeaderChanges
annotations:
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
message: Storage Cluster has seen many leader changes recently.
severity_level: warning
storage_type: ceph
@ -129,7 +131,9 @@ labels:
{{< code lang="yaml" >}}
alert: CephOSDCriticallyFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or
expand the storage cluster or contact support.
message: Back-end storage device is critically full.
severity_level: error
storage_type: ceph
@ -145,7 +149,9 @@ labels:
{{< code lang="yaml" >}}
alert: CephOSDNearFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage
cluster or contact support.
message: Back-end storage device is nearing full.
severity_level: warning
storage_type: ceph
@ -161,7 +167,8 @@ labels:
{{< code lang="yaml" >}}
alert: CephOSDDiskNotResponding
annotations:
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
}}.
message: Disk not responding
severity_level: error
storage_type: ceph
@ -177,7 +184,8 @@ labels:
{{< code lang="yaml" >}}
alert: CephOSDDiskUnavailable
annotations:
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
}}.
message: Disk not accessible
severity_level: error
storage_type: ceph
@ -227,8 +235,10 @@ labels:
{{< code lang="yaml" >}}
alert: PersistentVolumeUsageNearFull
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
or PVC expansion is required.
severity_level: warning
storage_type: ceph
expr: |
@ -243,8 +253,10 @@ labels:
{{< code lang="yaml" >}}
alert: PersistentVolumeUsageCritical
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
or PVC expansion is required.
severity_level: error
storage_type: ceph
expr: |
@ -327,8 +339,10 @@ labels:
{{< code lang="yaml" >}}
alert: CephClusterNearFull
annotations:
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
description: Storage cluster utilization has crossed 75% and will become read-only
at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion is
required.
severity_level: warning
storage_type: ceph
expr: |
@ -343,8 +357,10 @@ labels:
{{< code lang="yaml" >}}
alert: CephClusterCriticallyFull
annotations:
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 80% and will become read-only
at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion or
cluster expansion.
severity_level: error
storage_type: ceph
expr: |
@ -359,8 +375,10 @@ labels:
{{< code lang="yaml" >}}
alert: CephClusterReadOnly
annotations:
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 85% and will become read-only
now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion or cluster
expansion.
severity_level: error
storage_type: ceph
expr: |
View file
@ -39,7 +39,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
{{ $labels.server }} zone {{ $labels.zone }} .
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
@ -54,7 +55,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -71,7 +73,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -90,7 +93,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSForwardLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
@ -105,7 +109,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -122,7 +127,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}}
alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

File diff suppressed because it is too large
View file
@ -56,7 +56,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdNoLeader
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no
leader.'
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
@ -69,7 +70,9 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the
last 15 minutes. Frequent elections may be a sign of insufficient resources, high
network latency, or disruptions by other components and should be investigated.'
expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
@ -82,7 +85,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
}} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
@ -98,7 +102,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
}} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
@ -114,7 +119,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
@ -128,7 +134,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdMemberCommunicationSlow
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@ -142,7 +149,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
the last 30 minutes on etcd instance {{ $labels.instance }}.'
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
@ -155,7 +163,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighFsyncDurations
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{
$value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
@ -169,7 +178,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighCommitDurations
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{
$value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
@ -183,7 +193,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}'
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.01
@ -197,7 +208,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.05
@ -211,7 +223,8 @@ labels:
{{< code lang="yaml" >}}
alert: etcdHTTPRequestsSlow
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
View file
@ -96,7 +96,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterBrickUtilization
annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 80%
expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
@ -110,7 +111,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterBrickUtilization
annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 90%
expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
@ -126,7 +128,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterThinpoolDataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
80%
expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
for: 5m
@ -139,7 +142,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterThinpoolDataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
90%
expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
for: 5m
@ -152,7 +156,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterThinpoolMetadataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 80%
expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
for: 5m
@ -165,7 +170,8 @@ labels:
{{< code lang="yaml" >}}
alert: GlusterThinpoolMetadataUtilization
annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 90%
expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
for: 5m
View file
@ -38,7 +38,9 @@ alert: JaegerAgentHTTPServerErrs
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -51,7 +53,9 @@ alert: JaegerClientSpansDropped
annotations:
message: |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -64,7 +68,9 @@ alert: JaegerAgentSpansDropped
annotations:
message: |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by
(instance, job, namespace)> 1
for: 15m
labels:
severity: warning
@ -90,7 +96,9 @@ alert: JaegerCollectorDroppingSpans
annotations:
message: |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
job, namespace)> 1
for: 15m
labels:
severity: warning
@ -103,7 +111,9 @@ alert: JaegerSamplingUpdateFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job,
namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -116,7 +126,8 @@ alert: JaegerCollectorPersistenceSlow
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
> 0.5
for: 15m
labels:
severity: warning
@ -129,7 +140,9 @@ alert: JaegerThrottlingUpdateFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job,
namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -142,7 +155,9 @@ alert: JaegerQueryReqsFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
namespace)> 1
for: 15m
labels:
severity: warning
@ -155,7 +170,9 @@ alert: JaegerCassandraWritesFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
@ -168,7 +185,9 @@ alert: JaegerCassandraReadsFailing
annotations:
message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m
labels:
severity: warning
View file
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: CockroachInstanceFlapping
annotations:
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{
$value }} time(s) in 10m'
expr: |
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
for: 1m
@ -64,7 +65,8 @@ labels:
{{< code lang="yaml" >}}
alert: CockroachStoreDiskLow
annotations:
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }}
available disk fraction
expr: |
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
for: 30m
@ -116,7 +118,8 @@ labels:
{{< code lang="yaml" >}}
alert: CockroachHighOpenFDCount
annotations:
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }}
fraction used'
expr: |
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
for: 10m
View file
@ -23,7 +23,10 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: KubeStateMetricsListErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in list
operations. This is likely causing it to not be able to expose metrics about Kubernetes
objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
@ -39,7 +42,10 @@ labels:
{{< code lang="yaml" >}}
alert: KubeStateMetricsWatchErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in watch
operations. This is likely causing it to not be able to expose metrics about Kubernetes
objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
View file
@ -24,7 +24,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@ -39,7 +40,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |
sum by (namespace, pod) (
@ -60,7 +62,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
@ -77,7 +81,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: |
(
@ -100,7 +105,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: |
(
@ -123,7 +129,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@ -140,7 +148,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: |
(
@ -171,7 +180,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeDaemonSetRolloutStuck
annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: |
(
@ -208,7 +218,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@ -223,7 +234,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@ -240,7 +252,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@ -255,7 +268,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@ -285,7 +299,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeHpaReplicasMismatch
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@ -304,7 +319,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeHpaMaxedOut
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas
for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"}
@ -323,7 +339,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@ -342,7 +359,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeMemoryOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@ -399,7 +417,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeQuotaFullyUsed
annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
@ -417,7 +436,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: CPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@ -437,7 +457,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in
Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
kubelet_volume_stats_available_bytes{job="kubelet"}
@ -455,7 +476,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
(
@ -476,7 +499,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@ -493,7 +517,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components running.
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@ -508,7 +533,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}'
is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@ -606,7 +632,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@ -620,7 +647,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
@ -634,7 +662,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors have increased for it in the past five minutes. High
values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@ -648,7 +678,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only
{{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@ -709,7 +740,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
count by(node) (
@ -730,7 +762,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
message: The readiness status of node {{ $labels.node }} has changed {{ $value }}
times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@ -745,7 +778,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -760,7 +794,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on
node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
View file
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
@ -43,7 +44,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
@ -63,7 +65,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left.
expr: |
(
@ -81,7 +84,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left.
expr: |
(
@ -99,7 +103,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
@ -119,7 +124,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
@ -139,7 +145,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
@ -157,7 +164,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
@ -175,7 +183,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
@ -189,7 +198,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
@ -229,7 +239,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
@ -253,7 +264,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
View file
@ -35,13 +35,15 @@ labels:
{{< /code >}}
##### PrometheusNotificationQueueRunningFull
Prometheus alert notification queue predicted to run full in less than 30m.
Prometheus alert notification queue predicted to run full in less than
{{< code lang="yaml" >}}
alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
description: Alert notification queue of Prometheus {{$labels.instance}} is running
full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -56,14 +58,17 @@ labels:
{{< /code >}}
##### PrometheusErrorSendingAlertsToSomeAlertmanagers
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
Prometheus has encountered more than 1% errors sending alerts to a specific
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@ -78,13 +83,14 @@ labels:
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
@ -120,7 +126,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@ -134,7 +141,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@ -162,7 +170,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@ -176,7 +185,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@ -190,7 +200,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@ -214,7 +225,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@ -235,8 +247,12 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -255,7 +271,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@ -269,7 +286,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@ -283,8 +301,10 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
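# Note (illustrative, not part of the generated rule): target_limit is set per
# scrape_config in prometheus.yml, for example `target_limit: 500` under a job;
# the counter in the expression above increments whenever a scrape pool exceeds
# that limit and its targets are dropped.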
@ -296,5 +316,5 @@ labels:
The following dashboards are generated from mixins and hosted on GitHub:
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)

View file

@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: ThanosCompactMultipleRunning
annotations:
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
@ -47,7 +48,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosCompactHighCompactionFailures
annotations:
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
expr: |
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@ -65,7 +67,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosCompactBucketHighOperationFailures
annotations:
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
@ -84,7 +87,8 @@ labels:
alert: ThanosCompactHasNotRun
annotations:
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
labels:
severity: warning
{{< /code >}}
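With made-up numbers: if the newest successful upload across all compactors happened 100000 seconds ago, the expression evaluates 100000 / 60 / 60 ≈ 27.8 hours, which exceeds 24, so the alert fires. The `max_over_time(...[24h])` wrapper means a brief scrape gap cannot hide an earlier successful upload.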
@ -96,7 +100,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHttpRequestQueryErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
@ -113,7 +118,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
@ -130,7 +136,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryGrpcServerErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
@ -148,7 +155,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryGrpcClientErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
@ -165,7 +173,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHighDNSFailures
annotations:
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
expr: |
(
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
@ -182,7 +191,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryInstantLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
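  # Illustrative note: `le` carries the cumulative bucket bounds of
  # http_request_duration_seconds, so summing by (job, le) keeps one set of
  # buckets per job and histogram_quantile() estimates a per-job 99th
  # percentile latency, compared here against 40 seconds.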
@ -199,7 +209,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryRangeLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
@ -218,7 +229,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHttpRequestErrorRateHigh
annotations:
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
@ -235,7 +247,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHttpRequestLatencyHigh
annotations:
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
@ -252,7 +265,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighReplicationFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
expr: |
thanos_receive_replication_factor > 1
and
@ -279,7 +293,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighForwardRequestFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
@ -296,7 +311,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighHashringFileRefreshFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
expr: |
(
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
@ -315,7 +331,8 @@ labels:
alert: ThanosReceiveConfigReloadFailure
annotations:
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
for: 5m
labels:
severity: warning
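# Worked example with hypothetical values: three receive instances reporting
# thanos_receive_config_last_reload_successful = 1, 1, 0 give a per-job average
# of (1 + 1 + 0) / 3 ≈ 0.67, which is != 1, so one instance failing to reload
# its hashring configuration is enough to fire the alert.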
@ -326,7 +343,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveNoUpload
annotations:
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
expr: |
(up{job=~"thanos-receive.*"} - 1)
+ on (instance) # filters to only alert on current instance last 3h
@ -356,7 +374,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosSidecarUnhealthy
annotations:
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
expr: |
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
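  # i.e. the sidecar has not reported a successful heartbeat for at least
  # 600 seconds (10 minutes); the $value in the message is this age in seconds.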
labels:
@ -370,7 +389,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreGrpcErrorRate
annotations:
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
@ -388,7 +408,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreSeriesGateLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -405,7 +426,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreBucketHighOperationFailures
annotations:
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
@ -423,7 +445,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -452,12 +475,13 @@ labels:
{{< /code >}}
##### ThanosRuleSenderIsFailingAlerts
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
{{< code lang="yaml" >}}
alert: ThanosRuleSenderIsFailingAlerts
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
expr: |
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
@ -488,7 +512,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
expr: |
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m
@ -501,7 +526,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleRuleEvaluationLatencyHigh
annotations:
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
expr: |
(
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
@ -518,7 +544,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleGrpcErrorRate
annotations:
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
@ -537,7 +564,8 @@ labels:
alert: ThanosRuleConfigReloadFailure
annotations:
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
for: 5m
labels:
severity: info
@ -548,7 +576,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleQueryHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -566,7 +595,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -584,7 +614,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
expr: |
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
>
@ -599,7 +630,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosNoRuleEvaluations
annotations:
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
expr: |
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
and
@ -726,7 +758,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosBucketReplicateRunLatency
annotations:
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20