mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-15 17:50:48 +00:00

assets,site/content: regenerate

Author: paulfantom, 2020-08-13 12:50:10 +02:00
Commit: 7fd2bee5a7 (parent: df43594957)
Signature: GPG key ID 12AE0185401674E7 (no known key found for this signature in database)
25 changed files with 1134 additions and 535 deletions
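
The hunks shown on this page are pure re-wraps: the regenerated YAML folds long annotation and expression scalars at the emitter's default line width, so the text itself is unchanged. The snippet below is a minimal sketch of that folding behaviour using PyYAML; the project's actual generator is not shown on this page, so PyYAML and the sample rule are illustrative assumptions, not the toolchain behind this commit.

# Minimal sketch of the line folding visible in these hunks (assumption: PyYAML
# as a stand-in emitter; the real generator is not part of this page).
import yaml

rule = {
    "alert": "CephMdsMissingReplicas",
    "annotations": {
        "description": (
            "Minimum required replicas for storage metadata service not "
            "available. Might affect the working of storage cluster."
        ),
    },
}

# width=80 is PyYAML's default; long plain scalars are folded onto a
# continuation line indented under the key, which is the only kind of change
# in the hunks below.
print(yaml.dump(rule, default_flow_style=False, width=80))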


@@ -27,7 +27,8 @@ groups:
   rules:
   - alert: CephMdsMissingReplicas
     annotations:
-      description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
+      description: Minimum required replicas for storage metadata service not available.
+        Might affect the working of storage cluster.
       message: Insufficient replicas for storage metadata service.
       severity_level: warning
       storage_type: ceph
@@ -51,7 +52,8 @@ groups:
       severity: critical
   - alert: CephMonHighNumberOfLeaderChanges
     annotations:
-      description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
+      description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
+        }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
       message: Storage Cluster has seen many leader changes recently.
       severity_level: warning
       storage_type: ceph
@@ -64,7 +66,8 @@ groups:
   rules:
   - alert: CephNodeDown
     annotations:
-      description: Storage node {{ $labels.node }} went down. Please check the node immediately.
+      description: Storage node {{ $labels.node }} went down. Please check the node
+        immediately.
       message: Storage node {{ $labels.node }} went down
       severity_level: error
       storage_type: ceph
@@ -77,7 +80,9 @@ groups:
   rules:
   - alert: CephOSDCriticallyFull
     annotations:
-      description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
+      description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
+        has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space
+        or expand the storage cluster or contact support.
       message: Back-end storage device is critically full.
       severity_level: error
       storage_type: ceph
@@ -88,7 +93,9 @@ groups:
       severity: critical
   - alert: CephOSDNearFull
     annotations:
-      description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
+      description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
+        has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand
+        the storage cluster or contact support.
       message: Back-end storage device is nearing full.
       severity_level: warning
       storage_type: ceph
@@ -99,7 +106,8 @@ groups:
       severity: warning
   - alert: CephOSDDiskNotResponding
     annotations:
-      description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
+      description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
+        }}.
       message: Disk not responding
       severity_level: error
       storage_type: ceph
@@ -110,7 +118,8 @@ groups:
       severity: critical
   - alert: CephOSDDiskUnavailable
     annotations:
-      description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
+      description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
+        }}.
       message: Disk not accessible
       severity_level: error
       storage_type: ceph
@@ -145,8 +154,10 @@ groups:
   rules:
   - alert: PersistentVolumeUsageNearFull
     annotations:
-      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
-      message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
+      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+        75%. Free up some space or expand the PVC.
+      message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
+        or PVC expansion is required.
       severity_level: warning
       storage_type: ceph
     expr: |
@@ -156,8 +167,10 @@ groups:
       severity: warning
   - alert: PersistentVolumeUsageCritical
     annotations:
-      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
-      message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
+      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
+        85%. Free up some space or expand the PVC immediately.
+      message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
+        or PVC expansion is required.
       severity_level: error
       storage_type: ceph
     expr: |
@@ -191,7 +204,8 @@ groups:
       severity: warning
   - alert: CephOSDVersionMismatch
     annotations:
-      description: There are {{ $value }} different versions of Ceph OSD components running.
+      description: There are {{ $value }} different versions of Ceph OSD components
+        running.
       message: There are multiple versions of storage services running.
       severity_level: warning
       storage_type: ceph
@@ -202,7 +216,8 @@ groups:
       severity: warning
   - alert: CephMonVersionMismatch
     annotations:
-      description: There are {{ $value }} different versions of Ceph Mon components running.
+      description: There are {{ $value }} different versions of Ceph Mon components
+        running.
       message: There are multiple versions of storage services running.
       severity_level: warning
       storage_type: ceph
@@ -215,8 +230,10 @@ groups:
   rules:
   - alert: CephClusterNearFull
     annotations:
-      description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
-      message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
+      description: Storage cluster utilization has crossed 75% and will become read-only
+        at 85%. Free up some space or expand the storage cluster.
+      message: Storage cluster is nearing full. Data deletion or cluster expansion
+        is required.
       severity_level: warning
       storage_type: ceph
     expr: |
@@ -226,8 +243,10 @@ groups:
       severity: warning
   - alert: CephClusterCriticallyFull
     annotations:
-      description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
-      message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
+      description: Storage cluster utilization has crossed 80% and will become read-only
+        at 85%. Free up some space or expand the storage cluster immediately.
+      message: Storage cluster is critically full and needs immediate data deletion
+        or cluster expansion.
       severity_level: error
       storage_type: ceph
     expr: |
@@ -237,8 +256,10 @@ groups:
       severity: critical
   - alert: CephClusterReadOnly
     annotations:
-      description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
-      message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
+      description: Storage cluster utilization has crossed 85% and will become read-only
+        now. Free up some space or expand the storage cluster immediately.
+      message: Storage cluster is read-only now and needs immediate data deletion
+        or cluster expansion.
       severity_level: error
       storage_type: ceph
     expr: |


@@ -12,7 +12,8 @@ groups:
       severity: critical
   - alert: CoreDNSLatencyHigh
     annotations:
-      message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
+      message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
+        {{ $labels.server }} zone {{ $labels.zone }} .
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
     expr: |
       histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
@@ -21,7 +22,8 @@ groups:
       severity: critical
   - alert: CoreDNSErrorsHigh
     annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of requests.
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
     expr: |
       sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -32,7 +34,8 @@ groups:
       severity: critical
   - alert: CoreDNSErrorsHigh
     annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of requests.
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
     expr: |
       sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -45,7 +48,8 @@ groups:
   rules:
   - alert: CoreDNSForwardLatencyHigh
     annotations:
-      message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
+      message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
+        requests to {{ $labels.to }}.
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
     expr: |
       histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
@@ -54,7 +58,8 @@ groups:
       severity: critical
   - alert: CoreDNSForwardErrorsHigh
     annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of forward requests to {{ $labels.to }}.
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
     expr: |
       sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@@ -65,7 +70,8 @@ groups:
       severity: critical
   - alert: CoreDNSForwardErrorsHigh
     annotations:
-      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
+      message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
+        of forward requests to {{ $labels.to }}.
       runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
     expr: |
       sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))


@@ -107,7 +107,8 @@ groups:
       severity: warning
   - alert: CortexIngesterRestarts
     annotations:
-      message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.'
+      message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
+        $value }} times in the last 30 mins.'
     expr: |
       changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1
     labels:
@@ -278,7 +279,8 @@ groups:
   rules:
   - alert: CortexGossipMembersMismatch
     annotations:
-      message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.'
+      message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of
+        gossip members.'
     expr: |
       memberlist_client_cluster_members_count
       != on (cluster, namespace) group_left
@@ -290,7 +292,8 @@ groups:
   rules:
   - alert: CortexIngesterHasNotShippedBlocks
     annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
+        not shipped any block in the last 4 hours.
     expr: |
       (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4)
       and
@@ -302,7 +305,8 @@ groups:
       severity: critical
   - alert: CortexIngesterHasNotShippedBlocksSinceStart
     annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
+        not shipped any block in the last 4 hours.
     expr: |
       (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0)
       and
@@ -312,7 +316,8 @@ groups:
       severity: critical
   - alert: CortexIngesterTSDBHeadCompactionFailed
     annotations:
-      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.
+      message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing
+        to compact TSDB head.
     expr: |
       rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
     for: 15m
@@ -320,7 +325,8 @@ groups:
       severity: critical
   - alert: CortexQuerierHasNotScanTheBucket
     annotations:
-      message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.
+      message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not
+        successfully scanned the bucket since {{ $value | humanizeDuration }}.
     expr: |
       (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
       and
@@ -330,7 +336,9 @@ groups:
       severity: critical
   - alert: CortexQuerierHighRefetchRate
     annotations:
-      message: Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.
+      message: Cortex Queries in {{ $labels.namespace }} are refetching series from
+        different store-gateways (because of missing blocks) for the {{ printf "%.0f"
+        $value }}% of queries.
     expr: |
       100 * (
       (
@@ -347,7 +355,9 @@ groups:
       severity: warning
   - alert: CortexStoreGatewayHasNotSyncTheBucket
     annotations:
-      message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.
+      message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }}
+        has not successfully synched the bucket since {{ $value | humanizeDuration
+        }}.
     expr: |
       (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
       and
@@ -359,7 +369,8 @@ groups:
   rules:
   - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
     annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not successfully cleaned up blocks in the last 24 hours.
     expr: |
       (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24)
       and
@@ -369,7 +380,8 @@ groups:
       severity: critical
   - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart
     annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not successfully cleaned up blocks in the last 24 hours.
     expr: |
       cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0
     for: 24h
@@ -377,7 +389,8 @@ groups:
      severity: critical
   - alert: CortexCompactorHasNotUploadedBlocks
     annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not uploaded any block in the last 24 hours.
     expr: |
       (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
       and
@@ -387,7 +400,8 @@ groups:
       severity: critical
   - alert: CortexCompactorHasNotUploadedBlocksSinceStart
     annotations:
-      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
+      message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
+        not uploaded any block in the last 24 hours.
     expr: |
       thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
     for: 24h


@@ -1,11 +1,14 @@
 groups:
 - name: cortex_api
   rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_request_duration_seconds:avg
   - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
     record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
@@ -13,185 +16,279 @@ groups:
     record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
   - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
     record: cluster_job:cortex_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, route))
     record: cluster_job_route:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, route))
     record: cluster_job_route:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
+      / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
     record: cluster_job_route:cortex_request_duration_seconds:avg
-  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
+      route)
     record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
   - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
     record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
   - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
     record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, namespace, job, route))
     record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
+      by (le, cluster, namespace, job, route))
     record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
+      job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
+      namespace, job, route)
     record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
-  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
+      job, route)
     record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
+      job, route)
     record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
+  - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
+      job, route)
     record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
 - name: cortex_cache
   rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
+      job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
+      by (cluster, job, method)
     record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
     record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_cache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_cache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
     record: cluster_job:cortex_cache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job)
     record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
   - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
     record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job)
     record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
+      method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_cache_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
     record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
+      method)
     record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
 - name: cortex_storage
   rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
+      by (cluster, job, operation)
     record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, operation)
     record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation)
    record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
     record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
+      by (cluster, job, operation)
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le,
+      cluster, job, operation)
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
+      job, operation)
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
     record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by
+      (cluster, job, operation)
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, operation)
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation)
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
     record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
+      job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
+      job)
     record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le,
+      cluster, job)
     record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
+      job)
     record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
+      job)
     record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
+      by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
+      by (le, cluster, job)
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
+      by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
+      by (le, cluster, job)
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
-  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
-  - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
     record: cluster_job:cortex_chunk_store_chunks_per_query:avg
-  - expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster,
+      job)
     record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
   - expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
     record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
   - expr: sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
     record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, method))
     record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
+      job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m]))
+      by (cluster, job, method)
     record: cluster_job_method:cortex_database_request_duration_seconds:avg
-  - expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, method)
     record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
+  - expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster,
+      job, method)
     record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job, operation))
     record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster,
+      job, operation)
     record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
-  - expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
+  - expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job, operation)
     record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
-  - expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
+      operation)
     record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
-  - expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
+  - expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job,
+      operation)
     record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
-  - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_kv_request_duration_seconds:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_kv_request_duration_seconds:50quantile
-  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
+      / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
     record: cluster_job:cortex_kv_request_duration_seconds:avg
-  - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)
+  - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
+      job)
     record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
   - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
     record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
@@ -199,11 +296,14 @@ groups:
     record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
 - name: cortex_queries
   rules:
-  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_query_frontend_retries:99quantile
-  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
+  - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
+      by (le, cluster, job))
     record: cluster_job:cortex_query_frontend_retries:50quantile
-  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
+  - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
+      by (cluster, job)
     record: cluster_job:cortex_query_frontend_retries:avg
   - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
     record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
@@ -211,23 +311,33 @@ groups:
     record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
   - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_count:sum_rate record: cluster_job:cortex_query_frontend_retries_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
(cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job) - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:99quantile record: cluster_job:cortex_ingester_queried_series:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:50quantile record: cluster_job:cortex_ingester_queried_series:50quantile
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_series:avg record: cluster_job:cortex_ingester_queried_series:avg
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
@ -235,11 +345,14 @@ groups:
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_count:sum_rate record: cluster_job:cortex_ingester_queried_series_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:99quantile record: cluster_job:cortex_ingester_queried_chunks:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:50quantile record: cluster_job:cortex_ingester_queried_chunks:50quantile
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job) - expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks:avg record: cluster_job:cortex_ingester_queried_chunks:avg
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job) - expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
@ -247,11 +360,14 @@ groups:
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job) - expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:99quantile record: cluster_job:cortex_ingester_queried_samples:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:50quantile record: cluster_job:cortex_ingester_queried_samples:50quantile
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples:avg record: cluster_job:cortex_ingester_queried_samples:avg
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
View file
@ -18,7 +18,8 @@ groups:
severity: critical severity: critical
- alert: etcdInsufficientMembers - alert: etcdInsufficientMembers
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
expr: | expr: |
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m for: 3m
@ -26,7 +27,8 @@ groups:
severity: critical severity: critical
- alert: etcdNoLeader - alert: etcdNoLeader
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has
no leader.'
expr: | expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0 etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m for: 1m
@ -34,7 +36,9 @@ groups:
severity: critical severity: critical
- alert: etcdHighNumberOfLeaderChanges - alert: etcdHighNumberOfLeaderChanges
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
the last 15 minutes. Frequent elections may be a sign of insufficient resources,
high network latency, or disruptions by other components and should be investigated.'
expr: | expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m for: 5m
@ -42,7 +46,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests - alert: etcdHighNumberOfFailedGRPCRequests
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: | expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/ /
@ -53,7 +58,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests - alert: etcdHighNumberOfFailedGRPCRequests
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: | expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/ /
@ -64,7 +70,8 @@ groups:
severity: critical severity: critical
- alert: etcdGRPCRequestsSlow - alert: etcdGRPCRequestsSlow
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type)) histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15 > 0.15
@ -73,7 +80,8 @@ groups:
severity: critical severity: critical
- alert: etcdMemberCommunicationSlow - alert: etcdMemberCommunicationSlow
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15 > 0.15
@ -82,7 +90,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighNumberOfFailedProposals - alert: etcdHighNumberOfFailedProposals
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
the last 30 minutes on etcd instance {{ $labels.instance }}.'
expr: | expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m for: 15m
@ -90,7 +99,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighFsyncDurations - alert: etcdHighFsyncDurations
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are
{{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5 > 0.5
@ -99,7 +109,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighCommitDurations - alert: etcdHighCommitDurations
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25 > 0.25
@ -108,7 +119,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests - alert: etcdHighNumberOfFailedHTTPRequests
annotations: annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
expr: | expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.01 without (code) > 0.01
@ -117,7 +129,8 @@ groups:
severity: warning severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests - alert: etcdHighNumberOfFailedHTTPRequests
annotations: annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}.'
expr: | expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.05 without (code) > 0.05
@ -126,7 +139,8 @@ groups:
severity: critical severity: critical
- alert: etcdHTTPRequestsSlow - alert: etcdHTTPRequestsSlow
annotations: annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
expr: | expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15 > 0.15
View file
@ -49,7 +49,8 @@ groups:
severity: critical severity: critical
- alert: GlusterBrickUtilization - alert: GlusterBrickUtilization
annotations: annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80% message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 80%
expr: | expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"} 100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80 / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
@ -58,7 +59,8 @@ groups:
severity: warning severity: warning
- alert: GlusterBrickUtilization - alert: GlusterBrickUtilization
annotations: annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90% message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 90%
expr: | expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"} 100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90 / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
@ -69,7 +71,8 @@ groups:
rules: rules:
- alert: GlusterThinpoolDataUtilization - alert: GlusterThinpoolDataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80% message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
than 80%
expr: | expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8 gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
for: 5m for: 5m
@ -77,7 +80,8 @@ groups:
severity: warning severity: warning
- alert: GlusterThinpoolDataUtilization - alert: GlusterThinpoolDataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90% message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
than 90%
expr: | expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9 gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
for: 5m for: 5m
@ -85,7 +89,8 @@ groups:
severity: critical severity: critical
- alert: GlusterThinpoolMetadataUtilization - alert: GlusterThinpoolMetadataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80% message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 80%
expr: | expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8 gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
for: 5m for: 5m
@ -93,7 +98,8 @@ groups:
severity: warning severity: warning
- alert: GlusterThinpoolMetadataUtilization - alert: GlusterThinpoolMetadataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90% message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 90%
expr: | expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9 gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
for: 5m for: 5m
View file
@ -13,7 +13,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors. {{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance,
job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -21,7 +23,9 @@ groups:
annotations: annotations:
message: | message: |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -29,7 +33,9 @@ groups:
annotations: annotations:
message: | message: |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m]))
by (instance, job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -45,7 +51,9 @@ groups:
annotations: annotations:
message: | message: |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -53,7 +61,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies. {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -61,7 +71,8 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans. {{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5 expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
> 0.5
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -69,7 +80,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies. {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job,
namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -77,7 +90,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -85,7 +100,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -93,7 +110,9 @@ groups:
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance,
job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
View file
@ -3,7 +3,8 @@ groups:
rules: rules:
- alert: CockroachInstanceFlapping - alert: CockroachInstanceFlapping
annotations: annotations:
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m' message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
{{ $value }} time(s) in 10m'
expr: | expr: |
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5 resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
for: 1m for: 1m
@ -29,7 +30,8 @@ groups:
severity: warning severity: warning
- alert: CockroachStoreDiskLow - alert: CockroachStoreDiskLow
annotations: annotations:
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
}} available disk fraction
expr: | expr: |
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15 :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
for: 30m for: 30m
@ -61,7 +63,8 @@ groups:
severity: warning severity: warning
- alert: CockroachHighOpenFDCount - alert: CockroachHighOpenFDCount
annotations: annotations:
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used' message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
}} fraction used'
expr: | expr: |
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8 cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
for: 10m for: 10m
View file
@ -3,7 +3,10 @@ groups:
rules: rules:
- alert: KubeStateMetricsListErrors - alert: KubeStateMetricsListErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in
list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: | expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/ /
@ -14,7 +17,10 @@ groups:
severity: critical severity: critical
- alert: KubeStateMetricsWatchErrors - alert: KubeStateMetricsWatchErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in
watch operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: | expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/ /
View file
@ -3,7 +3,8 @@ groups:
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@ -12,7 +13,8 @@ groups:
severity: warning severity: warning
- alert: KubePodNotReady - alert: KubePodNotReady
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) ( sum by (namespace, pod) (
@ -27,7 +29,9 @@ groups:
severity: warning severity: warning
- alert: KubeDeploymentGenerationMismatch - alert: KubeDeploymentGenerationMismatch
annotations: annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back. message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match; this indicates that the Deployment has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: | expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"} kube_deployment_status_observed_generation{job="kube-state-metrics"}
@ -38,7 +42,8 @@ groups:
severity: warning severity: warning
- alert: KubeDeploymentReplicasMismatch - alert: KubeDeploymentReplicasMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: | expr: |
( (
@ -55,7 +60,8 @@ groups:
severity: warning severity: warning
- alert: KubeStatefulSetReplicasMismatch - alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: | expr: |
( (
@ -72,7 +78,9 @@ groups:
severity: warning severity: warning
- alert: KubeStatefulSetGenerationMismatch - alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back. message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match; this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: | expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@ -83,7 +91,8 @@ groups:
severity: warning severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut - alert: KubeStatefulSetUpdateNotRolledOut
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: | expr: |
( (
@ -108,7 +117,8 @@ groups:
severity: warning severity: warning
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: | expr: |
( (
@ -139,7 +149,8 @@ groups:
severity: warning severity: warning
- alert: KubeContainerWaiting - alert: KubeContainerWaiting
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
expr: | expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@ -148,7 +159,8 @@ groups:
severity: warning severity: warning
- alert: KubeDaemonSetNotScheduled - alert: KubeDaemonSetNotScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: | expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@ -159,7 +171,8 @@ groups:
severity: warning severity: warning
- alert: KubeDaemonSetMisScheduled - alert: KubeDaemonSetMisScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: | expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@ -168,7 +181,8 @@ groups:
severity: warning severity: warning
- alert: KubeJobCompletion - alert: KubeJobCompletion
annotations: annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: | expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@ -186,7 +200,8 @@ groups:
severity: warning severity: warning
- alert: KubeHpaReplicasMismatch - alert: KubeHpaReplicasMismatch
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
expr: | expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"} (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@ -199,7 +214,8 @@ groups:
severity: warning severity: warning
- alert: KubeHpaMaxedOut - alert: KubeHpaMaxedOut
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max
replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
expr: | expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"} kube_hpa_status_current_replicas{job="kube-state-metrics"}
@ -212,7 +228,8 @@ groups:
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@ -225,7 +242,8 @@ groups:
severity: warning severity: warning
- alert: KubeMemoryOvercommit - alert: KubeMemoryOvercommit
annotations: annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@ -264,7 +282,8 @@ groups:
severity: warning severity: warning
- alert: KubeQuotaFullyUsed - alert: KubeQuotaFullyUsed
annotations: annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
expr: | expr: |
kube_resourcequota{job="kube-state-metrics", type="used"} kube_resourcequota{job="kube-state-metrics", type="used"}
@ -276,7 +295,9 @@ groups:
severity: info severity: info
- alert: CPUThrottlingHigh - alert: CPUThrottlingHigh
annotations: annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: | expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@ -290,7 +311,9 @@ groups:
rules: rules:
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: | expr: |
kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet"}
@ -302,7 +325,9 @@ groups:
severity: critical severity: critical
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: | expr: |
( (
@ -317,7 +342,8 @@ groups:
severity: warning severity: warning
- alert: KubePersistentVolumeErrors - alert: KubePersistentVolumeErrors
annotations: annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
expr: | expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@ -328,7 +354,8 @@ groups:
rules: rules:
- alert: KubeVersionMismatch - alert: KubeVersionMismatch
annotations: annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components running. message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: | expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@ -337,7 +364,8 @@ groups:
severity: warning severity: warning
- alert: KubeClientErrors - alert: KubeClientErrors
annotations: annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: | expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@ -405,7 +433,8 @@ groups:
rules: rules:
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@ -413,7 +442,8 @@ groups:
severity: warning severity: warning
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
@ -421,7 +451,9 @@ groups:
severity: critical severity: critical
- alert: AggregatedAPIErrors - alert: AggregatedAPIErrors
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often. message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors has increased for it in the past five minutes.
High values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: | expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@ -429,7 +461,8 @@ groups:
severity: warning severity: warning
- alert: AggregatedAPIDown - alert: AggregatedAPIDown
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m. message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
only {{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: | expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90 (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@ -466,7 +499,8 @@ groups:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods
annotations: annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: | expr: |
count by(node) ( count by(node) (
@ -481,7 +515,8 @@ groups:
severity: warning severity: warning
- alert: KubeNodeReadinessFlapping - alert: KubeNodeReadinessFlapping
annotations: annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. message: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: | expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@ -490,7 +525,8 @@ groups:
severity: warning severity: warning
- alert: KubeletPlegDurationHigh - alert: KubeletPlegDurationHigh
annotations: annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: | expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -499,7 +535,8 @@ groups:
severity: warning severity: warning
- alert: KubeletPodStartUpLatencyHigh - alert: KubeletPodStartUpLatencyHigh
annotations: annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: | expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60 histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
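
The re-wrapped annotations in these hunks are plain YAML scalars, and YAML folds an indented continuation line back into the value with a single space, so the wrapped form parses to exactly the same string as the old single-line form; only the file layout changes, not the text the alert renders. A minimal sketch of the two equivalent forms, reusing the KubeClientCertificateExpiration message from above:

# single-line form (before regeneration)
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
# wrapped form (after regeneration); the indented continuation folds back to the identical string
message: A client certificate used to authenticate to the apiserver is expiring
  in less than 24.0 hours.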
View file
@ -3,7 +3,8 @@ groups:
rules: rules:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours. summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: | expr: |
( (
@ -18,7 +19,8 @@ groups:
severity: warning severity: warning
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours. summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: | expr: |
( (
@ -33,7 +35,8 @@ groups:
severity: critical severity: critical
- alert: NodeFilesystemAlmostOutOfSpace - alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left. summary: Filesystem has less than 5% space left.
expr: | expr: |
( (
@ -46,7 +49,8 @@ groups:
severity: warning severity: warning
- alert: NodeFilesystemAlmostOutOfSpace - alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left. summary: Filesystem has less than 3% space left.
expr: | expr: |
( (
@ -59,7 +63,8 @@ groups:
severity: critical severity: critical
- alert: NodeFilesystemFilesFillingUp - alert: NodeFilesystemFilesFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours. summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: | expr: |
( (
@ -74,7 +79,8 @@ groups:
severity: warning severity: warning
- alert: NodeFilesystemFilesFillingUp - alert: NodeFilesystemFilesFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours. summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: | expr: |
( (
@ -89,7 +95,8 @@ groups:
severity: critical severity: critical
- alert: NodeFilesystemAlmostOutOfFiles - alert: NodeFilesystemAlmostOutOfFiles
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left. summary: Filesystem has less than 5% inodes left.
expr: | expr: |
( (
@ -102,7 +109,8 @@ groups:
severity: warning severity: warning
- alert: NodeFilesystemAlmostOutOfFiles - alert: NodeFilesystemAlmostOutOfFiles
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left. summary: Filesystem has less than 3% inodes left.
expr: | expr: |
( (
@ -115,7 +123,8 @@ groups:
severity: critical severity: critical
- alert: NodeNetworkReceiveErrs - alert: NodeNetworkReceiveErrs
annotations: annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors. summary: Network interface is reporting many receive errors.
expr: | expr: |
increase(node_network_receive_errs_total[2m]) > 10 increase(node_network_receive_errs_total[2m]) > 10
@ -124,7 +133,8 @@ groups:
severity: warning severity: warning
- alert: NodeNetworkTransmitErrs - alert: NodeNetworkTransmitErrs
annotations: annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors. summary: Network interface is reporting many transmit errors.
expr: | expr: |
increase(node_network_transmit_errs_total[2m]) > 10 increase(node_network_transmit_errs_total[2m]) > 10
@ -149,7 +159,8 @@ groups:
severity: warning severity: warning
- alert: NodeClockSkewDetected - alert: NodeClockSkewDetected
annotations: annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
NTP is configured correctly on this host.
summary: Clock skew detected. summary: Clock skew detected.
expr: | expr: |
( (
@ -168,7 +179,8 @@ groups:
severity: warning severity: warning
- alert: NodeClockNotSynchronising - alert: NodeClockNotSynchronising
annotations: annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is
configured on this host.
summary: Clock not synchronising. summary: Clock not synchronising.
expr: | expr: |
min_over_time(node_timex_sync_status[5m]) == 0 min_over_time(node_timex_sync_status[5m]) == 0
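
The "filling up" alerts above pair a low-space condition with a prediction that the filesystem will run out within 24 or 4 hours; the hunks truncate the expressions, but the usual way to express such a prediction in PromQL is predict_linear over the recent trend of the free-space series. A hedged sketch of that idea only, not the exact rule from this file:

expr: |
  # extrapolate the last 6h of free space 24h ahead; a negative result means it is
  # predicted to run out within that horizon
  predict_linear(node_filesystem_avail_bytes{fstype!=""}[6h], 24*60*60) < 0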
View file
@ -14,8 +14,10 @@ groups:
severity: critical severity: critical
- alert: PrometheusNotificationQueueRunningFull - alert: PrometheusNotificationQueueRunningFull
annotations: annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full. description: Alert notification queue of Prometheus {{$labels.instance}} is
running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m. summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: | expr: |
# Without min_over_time, failed scrapes could create false negatives, see # Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -29,8 +31,10 @@ groups:
severity: warning severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations: annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.' description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. summary: Prometheus has encountered more than 1% errors sending alerts to a
specific Alertmanager.
expr: | expr: |
( (
rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@ -44,7 +48,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations: annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.' description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: | expr: |
min without(alertmanager) ( min without(alertmanager) (
@ -70,7 +75,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusTSDBReloadsFailing - alert: PrometheusTSDBReloadsFailing
annotations: annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h. description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk. summary: Prometheus has issues reloading blocks from disk.
expr: | expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0 increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@ -79,7 +85,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusTSDBCompactionsFailing - alert: PrometheusTSDBCompactionsFailing
annotations: annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h. description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks. summary: Prometheus has issues compacting blocks.
expr: | expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0 increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@ -97,7 +104,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusDuplicateTimestamps - alert: PrometheusDuplicateTimestamps
annotations: annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps. summary: Prometheus is dropping samples with duplicate timestamps.
expr: | expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@ -106,7 +114,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusOutOfOrderTimestamps - alert: PrometheusOutOfOrderTimestamps
annotations: annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps. summary: Prometheus drops samples with out-of-order timestamps.
expr: | expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0 rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@ -115,7 +124,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusRemoteStorageFailures - alert: PrometheusRemoteStorageFailures
annotations: annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
$value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage. summary: Prometheus fails to send samples to remote storage.
expr: | expr: |
( (
@ -134,7 +144,8 @@ groups:
severity: critical severity: critical
- alert: PrometheusRemoteWriteBehind - alert: PrometheusRemoteWriteBehind
annotations: annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
$value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind. summary: Prometheus remote write is behind.
expr: | expr: |
# Without max_over_time, failed scrapes could create false negatives, see # Without max_over_time, failed scrapes could create false negatives, see
@ -150,8 +161,12 @@ groups:
severity: critical severity: critical
- alert: PrometheusRemoteWriteDesiredShards - alert: PrometheusRemoteWriteDesiredShards
annotations: annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}. description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: | expr: |
# Without max_over_time, failed scrapes could create false negatives, see # Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -165,7 +180,8 @@ groups:
severity: warning severity: warning
- alert: PrometheusRuleFailures - alert: PrometheusRuleFailures
annotations: annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
"%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations. summary: Prometheus is failing rule evaluations.
expr: | expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0 increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@ -174,7 +190,8 @@ groups:
severity: critical severity: critical
- alert: PrometheusMissingRuleEvaluations - alert: PrometheusMissingRuleEvaluations
annotations: annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation. summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: | expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0 increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@ -183,8 +200,10 @@ groups:
severity: warning severity: warning
- alert: PrometheusTargetLimitHit - alert: PrometheusTargetLimitHit
annotations: annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: | expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0 increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m for: 15m
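
The inline comments in the queue and remote-write rules above point at a subtlety worth spelling out: when a scrape fails, the gauge goes stale and the expression briefly returns no data, so a firing alert can resolve on its own (a false negative). Wrapping the gauge in min_over_time or max_over_time over a short window keeps a recently observed value in play across such gaps. A generic sketch of the guard, with some_gauge and the threshold as placeholders rather than anything from these rules:

expr: |
  # survive a single failed scrape: compare against the highest value seen
  # over the last 5 minutes instead of only the latest sample
  max_over_time(some_gauge[5m]) > 0.7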
View file
@ -3,7 +3,8 @@ groups:
rules: rules:
- alert: ThanosCompactMultipleRunning - alert: ThanosCompactMultipleRunning
annotations: annotations:
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }} message: No more than one Thanos Compact instance should be running at once.
There are {{ $value }}
expr: sum(up{job=~"thanos-compact.*"}) > 1 expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m for: 5m
labels: labels:
@ -17,7 +18,8 @@ groups:
severity: warning severity: warning
- alert: ThanosCompactHighCompactionFailures - alert: ThanosCompactHighCompactionFailures
annotations: annotations:
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions. message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
}}% of compactions.
expr: | expr: |
( (
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@ -30,7 +32,8 @@ groups:
severity: warning severity: warning
- alert: ThanosCompactBucketHighOperationFailures - alert: ThanosCompactBucketHighOperationFailures
annotations: annotations:
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value
| humanize }}% of operations.
expr: | expr: |
( (
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
@ -44,14 +47,16 @@ groups:
- alert: ThanosCompactHasNotRun - alert: ThanosCompactHasNotRun
annotations: annotations:
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
/ 60 / 60 > 24
labels: labels:
severity: warning severity: warning
- name: thanos-query.rules - name: thanos-query.rules
rules: rules:
- alert: ThanosQueryHttpRequestQueryErrorRateHigh - alert: ThanosQueryHttpRequestQueryErrorRateHigh
annotations: annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests. message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of "query" requests.
expr: | expr: |
( (
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
@ -63,7 +68,8 @@ groups:
severity: critical severity: critical
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations: annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests. message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of "query_range" requests.
expr: | expr: |
( (
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
@ -75,7 +81,8 @@ groups:
severity: critical severity: critical
- alert: ThanosQueryGrpcServerErrorRate - alert: ThanosQueryGrpcServerErrorRate
annotations: annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
@ -88,7 +95,8 @@ groups:
severity: warning severity: warning
- alert: ThanosQueryGrpcClientErrorRate - alert: ThanosQueryGrpcClientErrorRate
annotations: annotations:
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests. message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
@ -100,7 +108,8 @@ groups:
severity: warning severity: warning
- alert: ThanosQueryHighDNSFailures - alert: ThanosQueryHighDNSFailures
annotations: annotations:
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints. message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing
DNS queries for store endpoints.
expr: | expr: |
( (
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
@ -112,7 +121,8 @@ groups:
severity: warning severity: warning
- alert: ThanosQueryInstantLatencyHigh - alert: ThanosQueryInstantLatencyHigh
annotations: annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries. message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for instant queries.
expr: | expr: |
( (
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
@ -124,7 +134,8 @@ groups:
severity: critical severity: critical
- alert: ThanosQueryRangeLatencyHigh - alert: ThanosQueryRangeLatencyHigh
annotations: annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries. message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for range queries.
expr: | expr: |
( (
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
@ -138,7 +149,8 @@ groups:
rules: rules:
- alert: ThanosReceiveHttpRequestErrorRateHigh - alert: ThanosReceiveHttpRequestErrorRateHigh
annotations: annotations:
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
@ -150,7 +162,8 @@ groups:
severity: critical severity: critical
- alert: ThanosReceiveHttpRequestLatencyHigh - alert: ThanosReceiveHttpRequestLatencyHigh
annotations: annotations:
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests. message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
$value }} seconds for requests.
expr: | expr: |
( (
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
@ -162,7 +175,8 @@ groups:
severity: critical severity: critical
- alert: ThanosReceiveHighReplicationFailures - alert: ThanosReceiveHighReplicationFailures
annotations: annotations:
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests. message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
humanize }}% of requests.
expr: | expr: |
thanos_receive_replication_factor > 1 thanos_receive_replication_factor > 1
and and
@ -184,7 +198,8 @@ groups:
severity: warning severity: warning
- alert: ThanosReceiveHighForwardRequestFailures - alert: ThanosReceiveHighForwardRequestFailures
annotations: annotations:
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests. message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
@ -196,7 +211,8 @@ groups:
severity: warning severity: warning
- alert: ThanosReceiveHighHashringFileRefreshFailures - alert: ThanosReceiveHighHashringFileRefreshFailures
annotations: annotations:
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed. message: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
{{ $value | humanize }} of attempts failed.
expr: | expr: |
( (
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
@ -209,14 +225,17 @@ groups:
severity: warning severity: warning
- alert: ThanosReceiveConfigReloadFailure - alert: ThanosReceiveConfigReloadFailure
annotations: annotations:
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. message: Thanos Receive {{$labels.job}} has not been able to reload hashring
configurations.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"})
by (job) != 1
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
- alert: ThanosReceiveNoUpload - alert: ThanosReceiveNoUpload
annotations: annotations:
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage. message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
latest data to object storage.
expr: | expr: |
(up{job=~"thanos-receive.*"} - 1) (up{job=~"thanos-receive.*"} - 1)
+ on (instance) # filters to only alert on current instance last 3h + on (instance) # filters to only alert on current instance last 3h
@ -236,7 +255,8 @@ groups:
severity: critical severity: critical
- alert: ThanosSidecarUnhealthy - alert: ThanosSidecarUnhealthy
annotations: annotations:
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{
$value }} seconds.
expr: | expr: |
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
labels: labels:
@ -245,7 +265,8 @@ groups:
rules: rules:
- alert: ThanosStoreGrpcErrorRate - alert: ThanosStoreGrpcErrorRate
annotations: annotations:
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
@ -258,7 +279,8 @@ groups:
severity: warning severity: warning
- alert: ThanosStoreSeriesGateLatencyHigh - alert: ThanosStoreSeriesGateLatencyHigh
annotations: annotations:
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests. message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
}} seconds for store series gate requests.
expr: | expr: |
( (
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -270,7 +292,8 @@ groups:
severity: warning severity: warning
- alert: ThanosStoreBucketHighOperationFailures - alert: ThanosStoreBucketHighOperationFailures
annotations: annotations:
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value
| humanize }}% of operations.
expr: | expr: |
( (
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
@ -283,7 +306,8 @@ groups:
severity: warning severity: warning
- alert: ThanosStoreObjstoreOperationLatencyHigh - alert: ThanosStoreObjstoreOperationLatencyHigh
annotations: annotations:
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations. message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of
{{ $value }} seconds for the bucket operations.
expr: | expr: |
( (
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@ -305,7 +329,8 @@ groups:
severity: critical severity: critical
- alert: ThanosRuleSenderIsFailingAlerts - alert: ThanosRuleSenderIsFailingAlerts
annotations: annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts
to alertmanager.
expr: | expr: |
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m for: 5m
@ -313,7 +338,8 @@ groups:
severity: critical severity: critical
- alert: ThanosRuleHighRuleEvaluationFailures - alert: ThanosRuleHighRuleEvaluationFailures
annotations: annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules. message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
rules.
expr: | expr: |
( (
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
@ -326,7 +352,8 @@ groups:
severity: critical severity: critical
- alert: ThanosRuleHighRuleEvaluationWarnings - alert: ThanosRuleHighRuleEvaluationWarnings
annotations: annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings. message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
warnings.
expr: | expr: |
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m for: 15m
@ -334,7 +361,8 @@ groups:
severity: info severity: info
- alert: ThanosRuleRuleEvaluationLatencyHigh - alert: ThanosRuleRuleEvaluationLatencyHigh
annotations: annotations:
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}. message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
than interval for {{$labels.rule_group}}.
expr: | expr: |
( (
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
@ -346,7 +374,8 @@ groups:
severity: warning severity: warning
- alert: ThanosRuleGrpcErrorRate - alert: ThanosRuleGrpcErrorRate
annotations: annotations:
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize
}}% of requests.
expr: | expr: |
( (
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
@ -360,13 +389,15 @@ groups:
- alert: ThanosRuleConfigReloadFailure - alert: ThanosRuleConfigReloadFailure
annotations: annotations:
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration. message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by
(job) != 1
for: 5m for: 5m
labels: labels:
severity: info severity: info
- alert: ThanosRuleQueryHighDNSFailures - alert: ThanosRuleQueryHighDNSFailures
annotations: annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints. message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for query endpoints.
expr: | expr: |
( (
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -379,7 +410,8 @@ groups:
severity: warning severity: warning
- alert: ThanosRuleAlertmanagerHighDNSFailures - alert: ThanosRuleAlertmanagerHighDNSFailures
annotations: annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints. message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for Alertmanager endpoints.
expr: | expr: |
( (
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@ -392,7 +424,8 @@ groups:
severity: warning severity: warning
- alert: ThanosRuleNoEvaluationFor10Intervals - alert: ThanosRuleNoEvaluationFor10Intervals
annotations: annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval. message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
that did not evaluate for at least 10x of their expected interval.
expr: | expr: |
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
> >
@ -402,7 +435,8 @@ groups:
severity: info severity: info
- alert: ThanosNoRuleEvaluations - alert: ThanosNoRuleEvaluations
annotations: annotations:
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes. message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in
the past 2 minutes.
expr: | expr: |
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
and and
@ -472,7 +506,8 @@ groups:
severity: critical severity: critical
- alert: ThanosBucketReplicateErrorRate - alert: ThanosBucketReplicateErrorRate
annotations: annotations:
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed. message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
failed.
expr: | expr: |
( (
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
@ -484,7 +519,8 @@ groups:
severity: critical severity: critical
- alert: ThanosBucketReplicateRunLatency - alert: ThanosBucketReplicateRunLatency
annotations: annotations:
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations. message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
$value }} seconds for the replicate operations.
expr: | expr: |
( (
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
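
The latency alerts in this file (and the CoreDNS and etcd ones further down) share one shape: take the per-second rate of the histogram buckets over a few minutes, aggregate by the interesting labels plus le, and hand the result to histogram_quantile to estimate the high percentile the message reports. A generic sketch of that shape, with a placeholder metric name and threshold rather than an expression from these files:

expr: |
  # p99 request latency per job, estimated from histogram buckets
  histogram_quantile(0.99,
    sum by (job, le) (rate(some_request_duration_seconds_bucket[5m]))
  ) > 1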
View file
@ -59,7 +59,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephMdsMissingReplicas alert: CephMdsMissingReplicas
annotations: annotations:
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster. description: Minimum required replicas for storage metadata service not available.
Might affect the working of storage cluster.
message: Insufficient replicas for storage metadata service. message: Insufficient replicas for storage metadata service.
severity_level: warning severity_level: warning
storage_type: ceph storage_type: ceph
@ -93,7 +94,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephMonHighNumberOfLeaderChanges alert: CephMonHighNumberOfLeaderChanges
annotations: annotations:
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently. description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
message: Storage Cluster has seen many leader changes recently. message: Storage Cluster has seen many leader changes recently.
severity_level: warning severity_level: warning
storage_type: ceph storage_type: ceph
@ -129,7 +131,9 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephOSDCriticallyFull alert: CephOSDCriticallyFull
annotations: annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support. description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or
expand the storage cluster or contact support.
message: Back-end storage device is critically full. message: Back-end storage device is critically full.
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
@ -145,7 +149,9 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephOSDNearFull alert: CephOSDNearFull
annotations: annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support. description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage
cluster or contact support.
message: Back-end storage device is nearing full. message: Back-end storage device is nearing full.
severity_level: warning severity_level: warning
storage_type: ceph storage_type: ceph
@ -161,7 +167,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephOSDDiskNotResponding alert: CephOSDDiskNotResponding
annotations: annotations:
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}. description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
}}.
message: Disk not responding message: Disk not responding
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
@ -177,7 +184,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephOSDDiskUnavailable alert: CephOSDDiskUnavailable
annotations: annotations:
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}. description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
}}.
message: Disk not accessible message: Disk not accessible
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
@ -227,8 +235,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: PersistentVolumeUsageNearFull alert: PersistentVolumeUsageNearFull
annotations: annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC. description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required. message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
or PVC expansion is required.
severity_level: warning severity_level: warning
storage_type: ceph storage_type: ceph
expr: | expr: |
@ -243,8 +253,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: PersistentVolumeUsageCritical alert: PersistentVolumeUsageCritical
annotations: annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately. description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required. message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
or PVC expansion is required.
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
expr: | expr: |
@ -327,8 +339,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephClusterNearFull alert: CephClusterNearFull
annotations: annotations:
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster. description: Storage cluster utilization has crossed 75% and will become read-only
at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion is required. message: Storage cluster is nearing full. Data deletion or cluster expansion is
required.
severity_level: warning severity_level: warning
storage_type: ceph storage_type: ceph
expr: | expr: |
@ -343,8 +357,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephClusterCriticallyFull alert: CephClusterCriticallyFull
annotations: annotations:
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately. description: Storage cluster utilization has crossed 80% and will become read-only
at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion. message: Storage cluster is critically full and needs immediate data deletion or
cluster expansion.
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
expr: | expr: |
@ -359,8 +375,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CephClusterReadOnly alert: CephClusterReadOnly
annotations: annotations:
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately. description: Storage cluster utilization has crossed 85% and will become read-only
now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion. message: Storage cluster is read-only now and needs immediate data deletion or cluster
expansion.
severity_level: error severity_level: error
storage_type: ceph storage_type: ceph
expr: | expr: |
View file
@ -39,7 +39,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSLatencyHigh alert: CoreDNSLatencyHigh
annotations: annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} . message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
{{ $labels.server }} zone {{ $labels.zone }} .
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
expr: | expr: |
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4 histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
@ -54,7 +55,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSErrorsHigh alert: CoreDNSErrorsHigh
annotations: annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: | expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -71,7 +73,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSErrorsHigh alert: CoreDNSErrorsHigh
annotations: annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: | expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -90,7 +93,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSForwardLatencyHigh alert: CoreDNSForwardLatencyHigh
annotations: annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}. message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
expr: | expr: |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4 histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
@ -105,7 +109,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSForwardErrorsHigh alert: CoreDNSForwardErrorsHigh
annotations: annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: | expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
@ -122,7 +127,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CoreDNSForwardErrorsHigh alert: CoreDNSForwardErrorsHigh
annotations: annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: | expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
File diff suppressed because it is too large
View file
@ -56,7 +56,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdNoLeader alert: etcdNoLeader
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no
leader.'
expr: | expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0 etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m for: 1m
@ -69,7 +70,9 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfLeaderChanges alert: etcdHighNumberOfLeaderChanges
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the
last 15 minutes. Frequent elections may be a sign of insufficient resources, high
network latency, or disruptions by other components and should be investigated.'
expr: | expr: |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m for: 5m
@ -82,7 +85,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedGRPCRequests alert: etcdHighNumberOfFailedGRPCRequests
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
}} failed on etcd instance {{ $labels.instance }}.'
expr: | expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/ /
@ -98,7 +102,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedGRPCRequests alert: etcdHighNumberOfFailedGRPCRequests
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
}} failed on etcd instance {{ $labels.instance }}.'
expr: | expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/ /
@ -114,7 +119,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdGRPCRequestsSlow alert: etcdGRPCRequestsSlow
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type)) histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15 > 0.15
@ -128,7 +134,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdMemberCommunicationSlow alert: etcdMemberCommunicationSlow
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15 > 0.15
@ -142,7 +149,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedProposals alert: etcdHighNumberOfFailedProposals
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
the last 30 minutes on etcd instance {{ $labels.instance }}.'
expr: | expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m for: 15m
@ -155,7 +163,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighFsyncDurations alert: etcdHighFsyncDurations
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{
$value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5 > 0.5
@ -169,7 +178,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighCommitDurations alert: etcdHighCommitDurations
annotations: annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{
$value }}s on etcd instance {{ $labels.instance }}.'
expr: | expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25 > 0.25
@ -183,7 +193,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedHTTPRequests alert: etcdHighNumberOfFailedHTTPRequests
annotations: annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}'
expr: | expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.01 without (code) > 0.01
@ -197,7 +208,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHighNumberOfFailedHTTPRequests alert: etcdHighNumberOfFailedHTTPRequests
annotations: annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
expr: | expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
without (code) > 0.05 without (code) > 0.05
@ -211,7 +223,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: etcdHTTPRequestsSlow alert: etcdHTTPRequestsSlow
annotations: annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
expr: | expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15 > 0.15
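The etcd rules above are loaded by Prometheus like any other rule file. A minimal sketch of the wiring, assuming the generated YAML is saved under /etc/prometheus/rules/ (the path is an assumption, not something the mixin prescribes):
{{< code lang="yaml" >}}
# prometheus.yml fragment loading the generated etcd alert rules.
rule_files:
  - /etc/prometheus/rules/etcd-mixin-alerts.yaml
{{< /code >}}
Before reloading Prometheus, running `promtool check rules /etc/prometheus/rules/etcd-mixin-alerts.yaml` will catch YAML or PromQL syntax problems in the generated file.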
View file
@ -96,7 +96,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterBrickUtilization alert: GlusterBrickUtilization
annotations: annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80% message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 80%
expr: | expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"} 100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80 / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
@ -110,7 +111,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterBrickUtilization alert: GlusterBrickUtilization
annotations: annotations:
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90% message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
than 90%
expr: | expr: |
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"} 100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90 / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
@ -126,7 +128,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterThinpoolDataUtilization alert: GlusterThinpoolDataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80% message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
80%
expr: | expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8 gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
for: 5m for: 5m
@ -139,7 +142,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterThinpoolDataUtilization alert: GlusterThinpoolDataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90% message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
90%
expr: | expr: |
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9 gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
for: 5m for: 5m
@ -152,7 +156,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterThinpoolMetadataUtilization alert: GlusterThinpoolMetadataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80% message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 80%
expr: | expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8 gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
for: 5m for: 5m
@ -165,7 +170,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: GlusterThinpoolMetadataUtilization alert: GlusterThinpoolMetadataUtilization
annotations: annotations:
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90% message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
than 90%
expr: | expr: |
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9 gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
for: 5m for: 5m
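Note the two conventions used above: the brick alerts scale usage to a percentage (`100 * ... > 80`) while the thinpool alerts compare a raw ratio (`> 0.8`). Purely as an illustration, the 90% thinpool data check could be written in the same percentage style; this is a sketch, not a change shipped by the mixin:
{{< code lang="yaml" >}}
expr: |
  100 * gluster_thinpool_data_used_bytes{job="glusterd2-client"}
    / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 90
{{< /code >}}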
View file
@ -38,7 +38,9 @@ alert: JaegerAgentHTTPServerErrs
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors. {{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -51,7 +53,9 @@ alert: JaegerClientSpansDropped
annotations: annotations:
message: | message: |
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -64,7 +68,9 @@ alert: JaegerAgentSpansDropped
annotations: annotations:
message: | message: |
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by
(instance, job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -90,7 +96,9 @@ alert: JaegerCollectorDroppingSpans
annotations: annotations:
message: | message: |
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans. collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job,
namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
job, namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -103,7 +111,9 @@ alert: JaegerSamplingUpdateFailing
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies. {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job,
namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -116,7 +126,8 @@ alert: JaegerCollectorPersistenceSlow
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans. {{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5 expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
> 0.5
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -129,7 +140,9 @@ alert: JaegerThrottlingUpdateFailing
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies. {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job,
namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -142,7 +155,9 @@ alert: JaegerQueryReqsFailing
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
namespace)> 1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -155,7 +170,9 @@ alert: JaegerCassandraWritesFailing
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -168,7 +185,9 @@ alert: JaegerCassandraReadsFailing
annotations: annotations:
message: | message: |
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}. {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1 expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
/ sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)>
1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
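All of the Jaeger alerts above follow one pattern: an error counter rate divided by the matching total rate, scaled to a percentage and held above 1% for 15 minutes. A generic sketch of that shape, with `<errors_total>` and `<attempts_total>` as placeholders rather than real metric names:
{{< code lang="yaml" >}}
expr: |
  100
  * sum(rate(<errors_total>[1m]))   by (instance, job, namespace)
  / sum(rate(<attempts_total>[1m])) by (instance, job, namespace)
  > 1
for: 15m
{{< /code >}}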
View file
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CockroachInstanceFlapping alert: CockroachInstanceFlapping
annotations: annotations:
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m' message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{
$value }} time(s) in 10m'
expr: | expr: |
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5 resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
for: 1m for: 1m
@ -64,7 +65,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CockroachStoreDiskLow alert: CockroachStoreDiskLow
annotations: annotations:
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }}
available disk fraction
expr: | expr: |
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15 :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
for: 30m for: 30m
@ -116,7 +118,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CockroachHighOpenFDCount alert: CockroachHighOpenFDCount
annotations: annotations:
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used' message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }}
fraction used'
expr: | expr: |
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8 cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
for: 10m for: 10m
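CockroachHighOpenFDCount fires once open descriptors exceed 80% of the soft limit, e.g. more than 800 open files against a limit of 1000. A quick ad-hoc query (not a rule) to see the current fraction per instance, using only the metrics from the alert above:
{{< code lang="yaml" >}}
expr: |
  cockroachdb_sys_fd_open{job="cockroachdb-public"}
    / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"}
{{< /code >}}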
View file
@ -23,7 +23,10 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeStateMetricsListErrors alert: KubeStateMetricsListErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in list
operations. This is likely causing it to not be able to expose metrics about Kubernetes
objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: | expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/ /
@ -39,7 +42,10 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeStateMetricsWatchErrors alert: KubeStateMetricsWatchErrors
annotations: annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. description: kube-state-metrics is experiencing errors at an elevated rate in watch
operations. This is likely causing it to not be able to expose metrics about Kubernetes
objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: | expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/ /
View file
@ -24,7 +24,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubePodCrashLooping alert: KubePodCrashLooping
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: | expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
@ -39,7 +40,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubePodNotReady alert: KubePodNotReady
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) ( sum by (namespace, pod) (
@ -60,7 +62,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeDeploymentGenerationMismatch alert: KubeDeploymentGenerationMismatch
annotations: annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: | expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"} kube_deployment_status_observed_generation{job="kube-state-metrics"}
@ -77,7 +81,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeDeploymentReplicasMismatch alert: KubeDeploymentReplicasMismatch
annotations: annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: | expr: |
( (
@ -100,7 +105,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeStatefulSetReplicasMismatch alert: KubeStatefulSetReplicasMismatch
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: | expr: |
( (
@ -123,7 +129,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeStatefulSetGenerationMismatch alert: KubeStatefulSetGenerationMismatch
annotations: annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
expr: | expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@ -140,7 +148,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeStatefulSetUpdateNotRolledOut alert: KubeStatefulSetUpdateNotRolledOut
annotations: annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
expr: | expr: |
( (
@ -171,7 +180,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeDaemonSetRolloutStuck alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: | expr: |
( (
@ -208,7 +218,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeContainerWaiting alert: KubeContainerWaiting
annotations: annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
expr: | expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
@ -223,7 +234,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeDaemonSetNotScheduled alert: KubeDaemonSetNotScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
expr: | expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@ -240,7 +252,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeDaemonSetMisScheduled alert: KubeDaemonSetMisScheduled
annotations: annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: | expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@ -255,7 +268,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeJobCompletion alert: KubeJobCompletion
annotations: annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: | expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@ -285,7 +299,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeHpaReplicasMismatch alert: KubeHpaReplicasMismatch
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
expr: | expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"} (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
@ -304,7 +319,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeHpaMaxedOut alert: KubeHpaMaxedOut
annotations: annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas
for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
expr: | expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"} kube_hpa_status_current_replicas{job="kube-state-metrics"}
@ -323,7 +339,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeCPUOvercommit alert: KubeCPUOvercommit
annotations: annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
@ -342,7 +359,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeMemoryOvercommit alert: KubeMemoryOvercommit
annotations: annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
@ -399,7 +417,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeQuotaFullyUsed alert: KubeQuotaFullyUsed
annotations: annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
expr: | expr: |
kube_resourcequota{job="kube-state-metrics", type="used"} kube_resourcequota{job="kube-state-metrics", type="used"}
@ -417,7 +436,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: CPUThrottlingHigh alert: CPUThrottlingHigh
annotations: annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: | expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@ -437,7 +457,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in
Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: | expr: |
kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet"}
@ -455,7 +476,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp alert: KubePersistentVolumeFillingUp
annotations: annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: | expr: |
( (
@ -476,7 +499,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubePersistentVolumeErrors alert: KubePersistentVolumeErrors
annotations: annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
expr: | expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@ -493,7 +517,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeVersionMismatch alert: KubeVersionMismatch
annotations: annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components running. message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: | expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@ -508,7 +533,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeClientErrors alert: KubeClientErrors
annotations: annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}'
is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: | expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@ -606,7 +632,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@ -620,7 +647,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
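The two KubeClientCertificateExpiration variants above differ only in the threshold applied to the 1st percentile of remaining certificate lifetime, expressed in seconds. The constants work out as follows (plain arithmetic, not configuration):
{{< code lang="yaml" >}}
#  7 days   = 7 * 24 * 3600 s = 604800 s
#  24 hours =     24 * 3600 s =  86400 s
{{< /code >}}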
@ -634,7 +662,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: AggregatedAPIErrors alert: AggregatedAPIErrors
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often. message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors has increased for it in the past five minutes. High
values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: | expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@ -648,7 +678,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: AggregatedAPIDown alert: AggregatedAPIDown
annotations: annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m. message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only
{{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: | expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90 (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@ -709,7 +740,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeletTooManyPods alert: KubeletTooManyPods
annotations: annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: | expr: |
count by(node) ( count by(node) (
@ -730,7 +762,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeNodeReadinessFlapping alert: KubeNodeReadinessFlapping
annotations: annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. message: The readiness status of node {{ $labels.node }} has changed {{ $value }}
times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: | expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@ -745,7 +778,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeletPlegDurationHigh alert: KubeletPlegDurationHigh
annotations: annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: | expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -760,7 +794,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: KubeletPodStartUpLatencyHigh alert: KubeletPodStartUpLatencyHigh
annotations: annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on
node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: | expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60 histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
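These kubernetes-mixin alerts are usually routed on the severity label they carry (warning or critical). A minimal Alertmanager sketch of that split; the receiver names are placeholders and nothing here is prescribed by the mixin:
{{< code lang="yaml" >}}
route:
  receiver: default
  group_by: ['alertname', 'namespace']
  routes:
    - match:
        severity: critical
      receiver: oncall-pager
    - match:
        severity: warning
      receiver: team-chat
receivers:
  - name: default
  - name: oncall-pager
  - name: team-chat
{{< /code >}}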
View file
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours. summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: | expr: |
( (
@ -43,7 +44,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours. summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: | expr: |
( (
@ -63,7 +65,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left. summary: Filesystem has less than 5% space left.
expr: | expr: |
( (
@ -81,7 +84,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace alert: NodeFilesystemAlmostOutOfSpace
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left. summary: Filesystem has less than 3% space left.
expr: | expr: |
( (
@ -99,7 +103,8 @@ labels:
{{< code lang="yaml" >}} {{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp alert: NodeFilesystemFilesFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
@@ -119,7 +124,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
@@ -139,7 +145,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
@@ -157,7 +164,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
@@ -175,7 +183,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
@@ -189,7 +198,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
@@ -229,7 +239,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
@@ -253,7 +264,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0

View file

@@ -35,13 +35,15 @@ labels:
{{< /code >}}
##### PrometheusNotificationQueueRunningFull
Prometheus alert notification queue predicted to run full in less than 30m.
{{< code lang="yaml" >}}
alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -56,14 +58,17 @@ labels:
{{< /code >}}
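The comment inside the PrometheusNotificationQueueRunningFull expression above refers to the usual gauge-alerting pattern: wrap the gauges in `min_over_time`/`max_over_time` so that a single failed scrape cannot spuriously fire or clear the alert. A generic, hypothetical sketch of that pattern (the metrics `my_queue_length` and `my_queue_capacity` are placeholders, not part of this mixin):
{{< code lang="yaml" >}}
# Hypothetical illustration only; my_queue_length and my_queue_capacity are placeholder metrics.
alert: ExampleQueueNearCapacity
annotations:
  summary: Queue is close to its capacity.
expr: |
  # max_over_time/min_over_time smooth over failed scrapes of the two gauges.
  max_over_time(my_queue_length[5m]) > 0.8 * min_over_time(my_queue_capacity[5m])
for: 15m
labels:
  severity: warning
{{< /code >}}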
##### PrometheusErrorSendingAlertsToSomeAlertmanagers
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@@ -78,13 +83,14 @@ labels:
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
@@ -120,7 +126,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@@ -134,7 +141,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@@ -162,7 +170,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@@ -176,7 +185,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@@ -190,7 +200,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@@ -214,7 +225,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -235,8 +247,12 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@@ -255,7 +271,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@@ -269,7 +286,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@@ -283,8 +301,10 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
@@ -296,5 +316,5 @@ labels:
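The PrometheusTargetLimitHit alert above refers to the per-scrape-config `target_limit` setting. A minimal, hypothetical scrape configuration showing where that limit lives (job name, limit value and targets are made up for illustration):
{{< code lang="yaml" >}}
# Hypothetical scrape config; job name, limit and targets are illustrative only.
scrape_configs:
  - job_name: example-app
    # If relabeling leaves more than this many targets, Prometheus drops them and
    # prometheus_target_scrape_pool_exceeded_target_limit_total increases.
    target_limit: 100
    static_configs:
      - targets: ['app-0:9100', 'app-1:9100']
{{< /code >}}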
The following dashboards are generated from mixins and hosted on GitHub (a provisioning sketch follows the list):
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
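One way to load the generated dashboard JSON above into Grafana is file-based dashboard provisioning. A minimal sketch, assuming the two JSON files have been downloaded to /var/lib/grafana/dashboards (the provider name and paths are illustrative, not part of the mixin):
{{< code lang="yaml" >}}
# Hypothetical /etc/grafana/provisioning/dashboards/mixins.yaml
apiVersion: 1
providers:
  - name: prometheus-mixin        # illustrative provider name
    type: file
    options:
      # directory expected to contain prometheus.json and prometheus-remote-write.json
      path: /var/lib/grafana/dashboards
{{< /code >}}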

View file

@@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: ThanosCompactMultipleRunning
annotations:
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
@@ -47,7 +48,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosCompactHighCompactionFailures
annotations:
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
expr: |
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@@ -65,7 +67,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosCompactBucketHighOperationFailures
annotations:
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
@@ -84,7 +87,8 @@ labels:
alert: ThanosCompactHasNotRun
annotations:
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
labels:
severity: warning
{{< /code >}}
@@ -96,7 +100,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHttpRequestQueryErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
@@ -113,7 +118,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
@@ -130,7 +136,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryGrpcServerErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
@@ -148,7 +155,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryGrpcClientErrorRate
annotations:
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
@@ -165,7 +173,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryHighDNSFailures
annotations:
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
expr: |
(
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
@@ -182,7 +191,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryInstantLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
@@ -199,7 +209,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosQueryRangeLatencyHigh
annotations:
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
@@ -218,7 +229,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHttpRequestErrorRateHigh
annotations:
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
@@ -235,7 +247,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHttpRequestLatencyHigh
annotations:
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
expr: |
(
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
@@ -252,7 +265,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighReplicationFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
expr: |
thanos_receive_replication_factor > 1
and
@@ -279,7 +293,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighForwardRequestFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
@@ -296,7 +311,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveHighHashringFileRefreshFailures
annotations:
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
expr: |
(
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
@@ -315,7 +331,8 @@ labels:
alert: ThanosReceiveConfigReloadFailure
annotations:
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
for: 5m
labels:
severity: warning
@@ -326,7 +343,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosReceiveNoUpload
annotations:
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
expr: |
(up{job=~"thanos-receive.*"} - 1)
+ on (instance) # filters to only alert on current instance last 3h
@@ -356,7 +374,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosSidecarUnhealthy
annotations:
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
expr: |
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
labels:
@@ -370,7 +389,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreGrpcErrorRate
annotations:
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
@@ -388,7 +408,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreSeriesGateLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@@ -405,7 +426,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreBucketHighOperationFailures
annotations:
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
expr: |
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
@@ -423,7 +445,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
@@ -452,12 +475,13 @@ labels:
{{< /code >}}
##### ThanosRuleSenderIsFailingAlerts
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
{{< code lang="yaml" >}}
alert: ThanosRuleSenderIsFailingAlerts
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
expr: |
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
@@ -488,7 +512,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
expr: |
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m
@@ -501,7 +526,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleRuleEvaluationLatencyHigh
annotations:
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
expr: |
(
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
@@ -518,7 +544,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleGrpcErrorRate
annotations:
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
expr: |
(
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
@@ -537,7 +564,8 @@ labels:
alert: ThanosRuleConfigReloadFailure
annotations:
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
for: 5m
labels:
severity: info
@@ -548,7 +576,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleQueryHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -566,7 +595,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -584,7 +614,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
expr: |
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
>
@@ -599,7 +630,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosNoRuleEvaluations
annotations:
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
expr: |
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
and
@@ -726,7 +758,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosBucketReplicateRunLatency
annotations:
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20