Mirror of https://github.com/monitoring-mixins/website.git, synced 2024-12-14 11:37:31 +00:00
assets,site/content: regenerate

parent df43594957
commit 7fd2bee5a7

25 changed files with 1134 additions and 535 deletions
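Every hunk below follows the same mechanical pattern: the regeneration step re-serializes the mixin YAML, so long description, message, and expr values that previously sat on one line are now wrapped across continuation lines. As a rough sketch of that pattern (text taken from the first Ceph hunk; the exact wrap column is an assumption about the serializer's defaults, not something stated on this page):

# before regeneration: one long plain scalar on a single line
annotations:
  description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.

# after regeneration: the same scalar folded onto an indented continuation line;
# YAML joins the lines with a single space, so the parsed value is unchanged
annotations:
  description: Minimum required replicas for storage metadata service not available.
    Might affect the working of storage cluster.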

@@ -27,7 +27,8 @@ groups:
rules:
- alert: CephMdsMissingReplicas
annotations:
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
description: Minimum required replicas for storage metadata service not available.
Might affect the working of storage cluster.
message: Insufficient replicas for storage metadata service.
severity_level: warning
storage_type: ceph

@@ -51,7 +52,8 @@ groups:
severity: critical
- alert: CephMonHighNumberOfLeaderChanges
annotations:
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
message: Storage Cluster has seen many leader changes recently.
severity_level: warning
storage_type: ceph

@@ -64,7 +66,8 @@ groups:
rules:
- alert: CephNodeDown
annotations:
description: Storage node {{ $labels.node }} went down. Please check the node immediately.
description: Storage node {{ $labels.node }} went down. Please check the node
immediately.
message: Storage node {{ $labels.node }} went down
severity_level: error
storage_type: ceph

@@ -77,7 +80,9 @@ groups:
rules:
- alert: CephOSDCriticallyFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space
or expand the storage cluster or contact support.
message: Back-end storage device is critically full.
severity_level: error
storage_type: ceph

@@ -88,7 +93,9 @@ groups:
severity: critical
- alert: CephOSDNearFull
annotations:
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
description: Utilization of back-end storage device {{ $labels.ceph_daemon }}
has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand
the storage cluster or contact support.
message: Back-end storage device is nearing full.
severity_level: warning
storage_type: ceph

@@ -99,7 +106,8 @@ groups:
severity: warning
- alert: CephOSDDiskNotResponding
annotations:
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
}}.
message: Disk not responding
severity_level: error
storage_type: ceph

@@ -110,7 +118,8 @@ groups:
severity: critical
- alert: CephOSDDiskUnavailable
annotations:
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
}}.
message: Disk not accessible
severity_level: error
storage_type: ceph

@@ -145,8 +154,10 @@ groups:
rules:
- alert: PersistentVolumeUsageNearFull
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
or PVC expansion is required.
severity_level: warning
storage_type: ceph
expr: |

@@ -156,8 +167,10 @@ groups:
severity: warning
- alert: PersistentVolumeUsageCritical
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed
85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
or PVC expansion is required.
severity_level: error
storage_type: ceph
expr: |

@@ -191,7 +204,8 @@ groups:
severity: warning
- alert: CephOSDVersionMismatch
annotations:
description: There are {{ $value }} different versions of Ceph OSD components running.
description: There are {{ $value }} different versions of Ceph OSD components
running.
message: There are multiple versions of storage services running.
severity_level: warning
storage_type: ceph

@@ -202,7 +216,8 @@ groups:
severity: warning
- alert: CephMonVersionMismatch
annotations:
description: There are {{ $value }} different versions of Ceph Mon components running.
description: There are {{ $value }} different versions of Ceph Mon components
running.
message: There are multiple versions of storage services running.
severity_level: warning
storage_type: ceph

@@ -215,8 +230,10 @@ groups:
rules:
- alert: CephClusterNearFull
annotations:
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
description: Storage cluster utilization has crossed 75% and will become read-only
at 85%. Free up some space or expand the storage cluster.
message: Storage cluster is nearing full. Data deletion or cluster expansion
is required.
severity_level: warning
storage_type: ceph
expr: |

@@ -226,8 +243,10 @@ groups:
severity: warning
- alert: CephClusterCriticallyFull
annotations:
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 80% and will become read-only
at 85%. Free up some space or expand the storage cluster immediately.
message: Storage cluster is critically full and needs immediate data deletion
or cluster expansion.
severity_level: error
storage_type: ceph
expr: |

@@ -237,8 +256,10 @@ groups:
severity: critical
- alert: CephClusterReadOnly
annotations:
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
description: Storage cluster utilization has crossed 85% and will become read-only
now. Free up some space or expand the storage cluster immediately.
message: Storage cluster is read-only now and needs immediate data deletion
or cluster expansion.
severity_level: error
storage_type: ceph
expr: |

@@ -12,7 +12,8 @@ groups:
severity: critical
- alert: CoreDNSLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
{{ $labels.server }} zone {{ $labels.zone }} .
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4

@@ -21,7 +22,8 @@ groups:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

@@ -32,7 +34,8 @@ groups:
severity: critical
- alert: CoreDNSErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of requests.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

@@ -45,7 +48,8 @@ groups:
rules:
- alert: CoreDNSForwardLatencyHigh
annotations:
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4

@@ -54,7 +58,8 @@ groups:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

@@ -65,7 +70,8 @@ groups:
severity: critical
- alert: CoreDNSForwardErrorsHigh
annotations:
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }}
of forward requests to {{ $labels.to }}.
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
expr: |
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))

@@ -107,7 +107,8 @@ groups:
severity: warning
- alert: CortexIngesterRestarts
annotations:
message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.'
message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
$value }} times in the last 30 mins.'
expr: |
changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1
labels:

@@ -278,7 +279,8 @@ groups:
rules:
- alert: CortexGossipMembersMismatch
annotations:
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.'
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of
gossip members.'
expr: |
memberlist_client_cluster_members_count
!= on (cluster, namespace) group_left

@@ -290,7 +292,8 @@ groups:
rules:
- alert: CortexIngesterHasNotShippedBlocks
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
not shipped any block in the last 4 hours.
expr: |
(min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4)
and

@@ -302,7 +305,8 @@ groups:
severity: critical
- alert: CortexIngesterHasNotShippedBlocksSinceStart
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has
not shipped any block in the last 4 hours.
expr: |
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0)
and

@@ -312,7 +316,8 @@ groups:
severity: critical
- alert: CortexIngesterTSDBHeadCompactionFailed
annotations:
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.
message: Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing
to compact TSDB head.
expr: |
rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
for: 15m

@@ -320,7 +325,8 @@ groups:
severity: critical
- alert: CortexQuerierHasNotScanTheBucket
annotations:
message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.
message: Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not
successfully scanned the bucket since {{ $value | humanizeDuration }}.
expr: |
(time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
and

@@ -330,7 +336,9 @@ groups:
severity: critical
- alert: CortexQuerierHighRefetchRate
annotations:
message: Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.
message: Cortex Queries in {{ $labels.namespace }} are refetching series from
different store-gateways (because of missing blocks) for the {{ printf "%.0f"
$value }}% of queries.
expr: |
100 * (
(

@@ -347,7 +355,9 @@ groups:
severity: warning
- alert: CortexStoreGatewayHasNotSyncTheBucket
annotations:
message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.
message: Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }}
has not successfully synched the bucket since {{ $value | humanizeDuration
}}.
expr: |
(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
and

@@ -359,7 +369,8 @@ groups:
rules:
- alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not successfully cleaned up blocks in the last 24 hours.
expr: |
(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24)
and

@@ -369,7 +380,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not successfully cleaned up blocks in the last 24 hours.
expr: |
cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0
for: 24h

@@ -377,7 +389,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotUploadedBlocks
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not uploaded any block in the last 24 hours.
expr: |
(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
and

@@ -387,7 +400,8 @@ groups:
severity: critical
- alert: CortexCompactorHasNotUploadedBlocksSinceStart
annotations:
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.
message: Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has
not uploaded any block in the last 24 hours.
expr: |
thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
for: 24h
@ -1,11 +1,14 @@
|
|||
groups:
|
||||
- name: cortex_api
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
|
||||
|
@ -13,185 +16,279 @@ groups:
|
|||
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, route))
|
||||
record: cluster_job_route:cortex_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, route))
|
||||
record: cluster_job_route:cortex_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||
record: cluster_job_route:cortex_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||
route)
|
||||
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, namespace, job, route))
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, namespace, job, route))
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
|
||||
namespace, job, route)
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||
job, route)
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||
job, route)
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
|
||||
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||
job, route)
|
||||
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
|
||||
- name: cortex_cache
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
|
||||
by (cluster, job, method)
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_cache_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
method)
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
|
||||
- name: cortex_storage
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
|
||||
by (cluster, job, operation)
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
|
||||
by (cluster, job, operation)
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le,
|
||||
cluster, job, operation)
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by
|
||||
(cluster, job, operation)
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
operation)
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
|
||||
job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le,
|
||||
cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
|
||||
by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
|
||||
by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
|
||||
by (le, cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job) / sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
|
||||
/ sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query:avg
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
|
||||
- expr: sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, method))
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m]))
|
||||
by (cluster, job, method)
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method)
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)
|
||||
- expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, method)
|
||||
record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job, operation))
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation)
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job, operation)
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
|
||||
operation)
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)
|
||||
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job,
|
||||
operation)
|
||||
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_kv_request_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
|
||||
|
@ -199,11 +296,14 @@ groups:
|
|||
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
|
||||
- name: cortex_queries
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_query_frontend_retries:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_query_frontend_retries:50quantile
|
||||
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_query_frontend_retries:avg
|
||||
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
|
||||
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
|
||||
|
@ -211,23 +311,33 @@ groups:
|
|||
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
|
||||
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
|
||||
(cluster, job)
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
|
||||
cluster, job)
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
|
||||
job)
|
||||
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_series:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_series:50quantile
|
||||
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_series:avg
|
||||
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
|
||||
|
@ -235,11 +345,14 @@ groups:
|
|||
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
|
||||
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_chunks:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_chunks:50quantile
|
||||
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_chunks:avg
|
||||
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
|
||||
|
@ -247,11 +360,14 @@ groups:
|
|||
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
|
||||
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_samples:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
|
||||
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
|
||||
by (le, cluster, job))
|
||||
record: cluster_job:cortex_ingester_queried_samples:50quantile
|
||||
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
|
||||
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
|
||||
by (cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_samples:avg
|
||||
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
|
||||
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
|
||||
|
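
The Cortex hunks above only re-wrap long recording-rule expressions onto several lines; the records themselves are unchanged. A minimal sketch of how one of these entries reads once the YAML folding is undone (the group name is an assumption; the record and expression are taken from the lines above):

    groups:
      - name: cortex_queries            # group name assumed, not shown in these hunks
        rules:
          - record: cluster_job:cortex_ingester_queried_series:99quantile
            # 99th percentile of series touched per query, pre-aggregated per cluster and job
            expr: |
              histogram_quantile(0.99,
                sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))

Dashboards can then read cluster_job:cortex_ingester_queried_series:99quantile directly instead of re-evaluating histogram_quantile over the raw buckets on every refresh.
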
|
|
@ -18,7 +18,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
|
||||
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
expr: |
|
||||
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
|
@ -26,7 +27,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has
|
||||
no leader.'
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
|
@ -34,7 +36,9 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
|
||||
the last 15 minutes. Frequent elections may be a sign of insufficient resources,
|
||||
high network latency, or disruptions by other components and should be investigated.'
|
||||
expr: |
|
||||
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
|
@ -42,7 +46,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
|
||||
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
|
@ -53,7 +58,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
|
||||
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
|
@ -64,7 +70,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
|
||||
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
|
@ -73,7 +80,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
|
||||
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
|
@ -82,7 +90,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
|
||||
the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
|
@ -90,7 +99,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
|
@ -99,7 +109,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
|
@ -108,7 +119,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
without (code) > 0.01
|
||||
|
@ -117,7 +129,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
without (code) > 0.05
|
||||
|
@ -126,7 +139,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
||||
}} are slow.
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
|
|
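
The etcdInsufficientMembers rule above fires when fewer than a quorum of members report in. The arithmetic: for a hypothetical 3-member cluster the threshold is (3 + 1) / 2 = 2, so the alert fires as soon as fewer than 2 members are up; for 5 members the threshold is 3. A minimal sketch of the complete rule, assembled from the hunks above (only the layout is assumed; the annotation, expression, for clause and severity are as shown):

    - alert: etcdInsufficientMembers
      annotations:
        message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
      expr: |
        sum(up{job=~".*etcd.*"} == bool 1) without (instance)
          < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
      for: 3m
      labels:
        severity: critical

The == bool 1 comparison turns each up sample into 0 or 1, so the sum counts live members while count counts all scraped members.
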
|
@ -49,7 +49,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||
than 80%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
||||
|
@ -58,7 +59,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||
than 90%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
||||
|
@ -69,7 +71,8 @@ groups:
|
|||
rules:
|
||||
- alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
|
||||
than 80%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
|
@ -77,7 +80,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more
|
||||
than 90%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
|
@ -85,7 +89,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||
than 80%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
|
@ -93,7 +98,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||
than 90%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
|
|
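
The Gluster utilization alerts above all share one shape: a used-over-total ratio compared against a threshold, with a warning tier at 80% and a higher tier at 90%. A minimal sketch of the warning-level brick rule, using the message, expression and severity shown above (the for clause is an assumption; it is not part of these hunks):

    - alert: GlusterBrickUtilization
      annotations:
        message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
      expr: |
        100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
          / gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
      for: 5m            # assumed; the thinpool rules above use 5m, the brick hunks do not show it
      labels:
        severity: warning
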
|
@ -13,7 +13,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance,
|
||||
job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -21,7 +23,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -29,7 +33,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m]))
|
||||
by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -45,7 +51,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
|
||||
job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -53,7 +61,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -61,7 +71,8 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
|
||||
> 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -69,7 +80,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job,
|
||||
namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -77,7 +90,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
|
||||
namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -85,7 +100,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
|
||||
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -93,7 +110,9 @@ groups:
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job,
|
||||
namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance,
|
||||
job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
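
Each Jaeger alert above is an error-percentage check: 100 times the rate of failures divided by the rate of totals, compared against 1%. A minimal sketch of the first one as a complete rule (the alert name is an assumption, since these hunks begin below the name line; the message, expression, for clause and severity are as shown):

    - alert: JaegerAgentHTTPServerErrs   # name assumed, not visible in the hunks above
      annotations:
        message: |
          {{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
      expr: |
        100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace)
          / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1
      for: 15m
      labels:
        severity: warning
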
|
@ -3,7 +3,8 @@ groups:
|
|||
rules:
|
||||
- alert: CockroachInstanceFlapping
|
||||
annotations:
|
||||
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
|
||||
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
|
||||
{{ $value }} time(s) in 10m'
|
||||
expr: |
|
||||
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
|
||||
for: 1m
|
||||
|
@ -29,7 +30,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: CockroachStoreDiskLow
|
||||
annotations:
|
||||
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
|
||||
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
|
||||
}} available disk fraction
|
||||
expr: |
|
||||
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
|
||||
for: 30m
|
||||
|
@ -61,7 +63,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: CockroachHighOpenFDCount
|
||||
annotations:
|
||||
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
|
||||
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
|
||||
}} fraction used'
|
||||
expr: |
|
||||
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
|
||||
for: 10m
|
||||
|
|
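
The CockroachHighOpenFDCount rule above compares open file descriptors to the soft limit. With a hypothetical soft limit of 10240, for example, the alert fires once a node has held more than 8192 descriptors open for 10 minutes. A minimal sketch assembled from the hunks above (the severity label is an assumption; it is not shown in these hunks):

    - alert: CockroachHighOpenFDCount
      annotations:
        message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
      expr: |
        cockroachdb_sys_fd_open{job="cockroachdb-public"}
          / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
      for: 10m
      labels:
        severity: warning   # assumed
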
|
@ -3,7 +3,10 @@ groups:
|
|||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in
|
||||
list operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
|
@ -14,7 +17,10 @@ groups:
|
|||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in
|
||||
watch operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
|
|
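
The two kube-state-metrics alerts above divide the rate of list or watch calls that ended with result="error" by the rate of all such calls. Only the numerator and the dividing slash appear in these hunks, so the completion below is a sketch: the denominator and the 1% threshold are assumptions, not taken from the diff.

    expr: |
      (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
        /
       sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
      > 0.01   # threshold assumed
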
|
@ -3,7 +3,8 @@ groups:
|
|||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||
expr: |
|
||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
||||
|
@ -12,7 +13,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (
|
||||
|
@ -27,7 +29,9 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has not
|
||||
been rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
||||
expr: |
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
||||
|
@ -38,7 +42,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
||||
matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
|
@ -55,7 +60,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not
|
||||
matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
|
@ -72,7 +78,9 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
||||
expr: |
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
||||
|
@ -83,7 +91,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||
has not been rolled out.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
||||
expr: |
|
||||
(
|
||||
|
@ -108,7 +117,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
|
||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
|
||||
or progressed for at least 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||
expr: |
|
||||
(
|
||||
|
@ -139,7 +149,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
|
||||
has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
||||
expr: |
|
||||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
||||
|
@ -148,7 +159,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
|
@ -159,7 +171,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||
|
@ -168,7 +181,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeJobCompletion
|
||||
annotations:
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
|
||||
12 hours to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
||||
expr: |
|
||||
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
||||
|
@ -186,7 +200,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
|
||||
number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
|
||||
expr: |
|
||||
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
|
||||
|
@ -199,7 +214,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max
|
||||
replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
|
||||
expr: |
|
||||
kube_hpa_status_current_replicas{job="kube-state-metrics"}
|
||||
|
@ -212,7 +228,8 @@ groups:
|
|||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
|
||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||
|
@ -225,7 +242,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
|
@ -264,7 +282,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
|
||||
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
|
||||
expr: |
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
|
@ -276,7 +295,9 @@ groups:
|
|||
severity: info
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
|
||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{
|
||||
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||
}}.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||
expr: |
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||
|
@ -290,7 +311,9 @@ groups:
|
|||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
|
||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
|
||||
in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
||||
}} free.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||
|
@ -302,7 +325,9 @@ groups:
|
|||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
|
||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
||||
days. Currently {{ $value | humanizePercentage }} is available.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
(
|
||||
|
@ -317,7 +342,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
|
||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
||||
$labels.phase }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
||||
expr: |
|
||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||
|
@ -328,7 +354,8 @@ groups:
|
|||
rules:
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
message: There are {{ $value }} different semantic versions of Kubernetes components running.
|
||||
message: There are {{ $value }} different semantic versions of Kubernetes components
|
||||
running.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||
expr: |
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
|
@ -337,7 +364,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
|
||||
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
||||
}}' is experiencing {{ $value | humanizePercentage }} errors.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
||||
expr: |
|
||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
||||
|
@ -405,7 +433,8 @@ groups:
|
|||
rules:
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
|
||||
message: A client certificate used to authenticate to the apiserver is expiring
|
||||
in less than 7.0 days.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
|
||||
|
@ -413,7 +442,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
|
||||
message: A client certificate used to authenticate to the apiserver is expiring
|
||||
in less than 24.0 hours.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
|
||||
|
@ -421,7 +451,9 @@ groups:
|
|||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
|
||||
errors. The number of errors has increased for it in the past five minutes.
|
||||
High values indicate that the availability of the service changes too often.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||
expr: |
|
||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||
|
@ -429,7 +461,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
|
||||
only {{ $value | humanize }}% available over the last 5m.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||
expr: |
|
||||
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
||||
|
@ -466,7 +499,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
|
||||
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
|
||||
}} of its Pod capacity.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||
expr: |
|
||||
count by(node) (
|
||||
|
@ -481,7 +515,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
|
||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value
|
||||
}} times in the last 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||
expr: |
|
||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||
|
@ -490,7 +525,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
|
||||
of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||
expr: |
|
||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
|
@ -499,7 +535,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
|
||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||
on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
|
||||
|
|
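
Several of the Kubernetes rules above rescale a per-second rate into a human-readable window; KubePodCrashLooping is the clearest case. rate(...[5m]) is restarts per second, and multiplying by 60 * 5 turns it into restarts per five minutes, so a single restart in the window yields a value of roughly one. A minimal sketch of the full rule, assembled from the hunks above (the for clause is an assumption; it is not part of these hunks):

    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
      for: 15m           # assumed
      labels:
        severity: warning
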
|
@ -3,7 +3,8 @@ groups:
|
|||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available space left and is filling up.
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
|
@ -18,7 +19,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available space left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
|
@ -33,7 +35,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |
|
||||
(
|
||||
|
@ -46,7 +49,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |
|
||||
(
|
||||
|
@ -59,7 +63,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available inodes left and is filling up.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
|
@ -74,7 +79,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
|
@ -89,7 +95,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |
|
||||
(
|
||||
|
@ -102,7 +109,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
|
||||
only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |
|
||||
(
|
||||
|
@ -115,7 +123,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: |
|
||||
increase(node_network_receive_errs_total[2m]) > 10
|
||||
|
@ -124,7 +133,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: |
|
||||
increase(node_network_transmit_errs_total[2m]) > 10
|
||||
|
@ -149,7 +159,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
|
||||
NTP is configured correctly on this host.
|
||||
summary: Clock skew detected.
|
||||
expr: |
|
||||
(
|
||||
|
@ -168,7 +179,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
|
||||
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is
|
||||
configured on this host.
|
||||
summary: Clock not synchronising.
|
||||
expr: |
|
||||
min_over_time(node_timex_sync_status[5m]) == 0
|
||||
|
|
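
The node alerts above mix two styles: threshold checks over a short increase() window and predicted-exhaustion checks on filesystems. The simplest of the first kind is NodeNetworkReceiveErrs, shown here as a complete rule assembled from the hunks above (the for clause is an assumption; it is not part of these hunks):

    - alert: NodeNetworkReceiveErrs
      annotations:
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
        summary: Network interface is reporting many receive errors.
      expr: |
        increase(node_network_receive_errs_total[2m]) > 10
      for: 1h            # assumed
      labels:
        severity: warning

Because increase() only looks at a two-minute window, brief bursts clear quickly; the for clause keeps short blips from paging.
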
|
@ -14,8 +14,10 @@ groups:
|
|||
severity: critical
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
description: Alert notification queue of Prometheus {{$labels.instance}} is
|
||||
running full.
|
||||
summary: Prometheus alert notification queue predicted to run full in less than
|
||||
30m.
|
||||
expr: |
|
||||
# Without min_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
|
@ -29,8 +31,10 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
|
||||
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a
|
||||
specific Alertmanager.
|
||||
expr: |
|
||||
(
|
||||
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
|
||||
|
@ -44,7 +48,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
|
||||
from Prometheus {{$labels.instance}} to any Alertmanager.'
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |
|
||||
min without(alertmanager) (
|
||||
|
@ -70,7 +75,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
|
||||
reload failures over the last 3h.
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
|
||||
|
@ -79,7 +85,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
|
||||
compaction failures over the last 3h.
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
|
||||
|
@ -97,7 +104,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
|
||||
samples/s with different values but duplicated timestamp.
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
|
||||
|
@ -106,7 +114,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
|
||||
samples/s with timestamps arriving out of order.
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
|
||||
|
@ -115,7 +124,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
|
||||
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
|
||||
$value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |
|
||||
(
|
||||
|
@ -134,7 +144,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
|
||||
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
|
||||
$value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
|
@ -150,8 +161,12 @@ groups:
|
|||
severity: critical
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
description: Prometheus {{$labels.instance}} remote write desired shards calculation
|
||||
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
|
||||
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
|
||||
$labels.instance | query | first | value }}.
|
||||
summary: Prometheus remote write desired shards calculation wants to run more
|
||||
than configured max shards.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
|
@ -165,7 +180,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
|
||||
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
|
||||
"%.0f" $value }} rules in the last 5m.
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: |
|
||||
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
|
||||
|
@ -174,7 +190,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
|
||||
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
|
||||
}} rule group evaluations in the last 5m.
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
expr: |
|
||||
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
|
||||
|
@ -183,8 +200,10 @@ groups:
|
|||
severity: warning
|
||||
- alert: PrometheusTargetLimitHit
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
|
||||
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
|
||||
}} targets because the number of targets exceeded the configured target_limit.
|
||||
summary: Prometheus has dropped targets because some scrape configs have exceeded
|
||||
the targets limit.
|
||||
expr: |
|
||||
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
|
||||
for: 15m
|
||||
|
|
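
The Prometheus self-monitoring alerts above are mostly error-ratio checks; PrometheusErrorSendingAlertsToSomeAlertmanagers is representative. Only the opening of its numerator appears in these hunks, so the completion below is a sketch: the denominator metric, the 1% threshold and the for clause are assumptions.

    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |
        (
          rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus"}[5m])   # denominator assumed
        )
        * 100
        > 1   # threshold assumed
      for: 15m   # assumed
      labels:
        severity: warning
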
|
@ -3,7 +3,8 @@ groups:
|
|||
rules:
|
||||
- alert: ThanosCompactMultipleRunning
|
||||
annotations:
|
||||
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}.
|
||||
message: No more than one Thanos Compact instance should be running at once.
|
||||
There are {{ $value }}.
|
||||
expr: sum(up{job=~"thanos-compact.*"}) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
|
@ -17,7 +18,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosCompactHighCompactionFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
|
||||
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
|
||||
}}% of compactions.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
|
@ -30,7 +32,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosCompactBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value
|
||||
| humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
|
@ -44,14 +47,16 @@ groups:
|
|||
- alert: ThanosCompactHasNotRun
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
|
||||
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
|
||||
/ 60 / 60 > 24
|
||||
labels:
|
||||
severity: warning
|
||||
- name: thanos-query.rules
|
||||
rules:
|
||||
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of "query" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
||||
|
@ -63,7 +68,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of "query_range" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
|
@ -75,7 +81,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosQueryGrpcServerErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
||||
|
@ -88,7 +95,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosQueryGrpcClientErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||
|
@ -100,7 +108,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing
|
||||
DNS queries for store endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
|
@ -112,7 +121,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosQueryInstantLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for instant queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||
|
@ -124,7 +134,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosQueryRangeLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for range queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||
|
@ -138,7 +149,8 @@ groups:
|
|||
rules:
|
||||
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
|
@ -150,7 +162,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosReceiveHttpRequestLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
|
||||
$value }} seconds for requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||
|
@ -162,7 +175,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosReceiveHighReplicationFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
|
||||
humanize }}% of requests.
|
||||
expr: |
|
||||
thanos_receive_replication_factor > 1
|
||||
and
|
||||
|
@ -184,7 +198,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosReceiveHighForwardRequestFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
|
@ -196,7 +211,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file,
|
||||
{{ $value | humanize }} of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
|
@ -209,14 +225,17 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosReceiveConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring
|
||||
configurations.
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"})
|
||||
by (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosReceiveNoUpload
|
||||
annotations:
|
||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
|
||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
|
||||
latest data to object storage.
|
||||
expr: |
|
||||
(up{job=~"thanos-receive.*"} - 1)
|
||||
+ on (instance) # filters to only alert on current instance last 3h
|
||||
|
@ -236,7 +255,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosSidecarUnhealthy
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{
|
||||
$value }} seconds.
|
||||
expr: |
|
||||
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
|
||||
labels:
|
||||
|
@ -245,7 +265,8 @@ groups:
|
|||
rules:
|
||||
- alert: ThanosStoreGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
||||
|
@ -258,7 +279,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosStoreSeriesGateLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for store series gate requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
|
@ -270,7 +292,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosStoreBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value
|
||||
| humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
|
@ -283,7 +306,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of
|
||||
{{ $value }} seconds for the bucket operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
|
@ -305,7 +329,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosRuleSenderIsFailingAlerts
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts
|
||||
to alertmanager.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 5m
|
||||
|
@ -313,7 +338,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosRuleHighRuleEvaluationFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
|
||||
rules.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -326,7 +352,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosRuleHighRuleEvaluationWarnings
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
|
||||
warnings.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 15m
|
||||
|
@ -334,7 +361,8 @@ groups:
|
|||
severity: info
|
||||
- alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
|
||||
than interval for {{$labels.rule_group}}.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
||||
|
@ -346,7 +374,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosRuleGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -360,13 +389,15 @@ groups:
|
|||
- alert: ThanosRuleConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by
|
||||
(job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: ThanosRuleQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
|
||||
DNS queries for query endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -379,7 +410,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
|
||||
DNS queries for Alertmanager endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -392,7 +424,8 @@ groups:
|
|||
severity: warning
|
||||
- alert: ThanosRuleNoEvaluationFor10Intervals
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
|
||||
that did not evaluate for at least 10x of their expected interval.
|
||||
expr: |
|
||||
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
|
@ -402,7 +435,8 @@ groups:
|
|||
severity: info
|
||||
- alert: ThanosNoRuleEvaluations
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in
|
||||
the past 2 minutes.
|
||||
expr: |
|
||||
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
||||
and
|
||||
|
@ -472,7 +506,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosBucketReplicateErrorRate
|
||||
annotations:
|
||||
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.
|
||||
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
|
||||
failed.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
|
||||
|
@ -484,7 +519,8 @@ groups:
|
|||
severity: critical
|
||||
- alert: ThanosBucketReplicateRunLatency
|
||||
annotations:
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
|
||||
$value }} seconds for the replicate operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|
||||
|
|
|
@ -59,7 +59,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephMdsMissingReplicas
|
||||
annotations:
|
||||
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
|
||||
description: Minimum required replicas for storage metadata service not available.
|
||||
Might affect the working of storage cluster.
|
||||
message: Insufficient replicas for storage metadata service.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
|
@ -93,7 +94,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephMonHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
||||
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname
|
||||
}} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
||||
message: Storage Cluster has seen many leader changes recently.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
|
@ -129,7 +131,9 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDCriticallyFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
|
||||
crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or
|
||||
expand the storage cluster or contact support.
|
||||
message: Back-end storage device is critically full.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
|
@ -145,7 +149,9 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDNearFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has
|
||||
crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage
|
||||
cluster or contact support.
|
||||
message: Back-end storage device is nearing full.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
|
@ -161,7 +167,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDDiskNotResponding
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
|
||||
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host
|
||||
}}.
|
||||
message: Disk not responding
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
|
@ -177,7 +184,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDDiskUnavailable
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
|
||||
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host
|
||||
}}.
|
||||
message: Disk not accessible
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
|
@ -227,8 +235,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: PersistentVolumeUsageNearFull
|
||||
annotations:
|
||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
|
||||
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
|
||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
|
||||
Free up some space or expand the PVC.
|
||||
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion
|
||||
or PVC expansion is required.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
|
@ -243,8 +253,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: PersistentVolumeUsageCritical
|
||||
annotations:
|
||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
|
||||
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
|
||||
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
|
||||
Free up some space or expand the PVC immediately.
|
||||
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion
|
||||
or PVC expansion is required.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
|
@ -327,8 +339,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterNearFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
|
||||
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
|
||||
description: Storage cluster utilization has crossed 75% and will become read-only
|
||||
at 85%. Free up some space or expand the storage cluster.
|
||||
message: Storage cluster is nearing full. Data deletion or cluster expansion is
|
||||
required.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
|
@ -343,8 +357,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterCriticallyFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
|
||||
description: Storage cluster utilization has crossed 80% and will become read-only
|
||||
at 85%. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is critically full and needs immediate data deletion or
|
||||
cluster expansion.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
|
@ -359,8 +375,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterReadOnly
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
|
||||
description: Storage cluster utilization has crossed 85% and will become read-only
|
||||
now. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is read-only now and needs immediate data deletion or cluster
|
||||
expansion.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
|
|
|
@ -39,7 +39,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSLatencyHigh
|
||||
annotations:
|
||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} .
|
||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server
|
||||
{{ $labels.server }} zone {{ $labels.zone }} .
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4
|
||||
|
@ -54,7 +55,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSErrorsHigh
|
||||
annotations:
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||
requests.
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
||||
expr: |
|
||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||
|
@ -71,7 +73,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSErrorsHigh
|
||||
annotations:
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests.
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||
requests.
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh
|
||||
expr: |
|
||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||
|
@ -90,7 +93,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSForwardLatencyHigh
|
||||
annotations:
|
||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}.
|
||||
message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding
|
||||
requests to {{ $labels.to }}.
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4
|
||||
|
@ -105,7 +109,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSForwardErrorsHigh
|
||||
annotations:
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||
forward requests to {{ $labels.to }}.
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
||||
expr: |
|
||||
sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||
|
@ -122,7 +127,8 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CoreDNSForwardErrorsHigh
|
||||
annotations:
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}.
|
||||
message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of
|
||||
forward requests to {{ $labels.to }}.
|
||||
runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh
|
||||
expr: |
|
||||
sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m]))
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -56,7 +56,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no
|
||||
leader.'
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
|
@ -69,7 +70,9 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the
|
||||
last 15 minutes. Frequent elections may be a sign of insufficient resources, high
|
||||
network latency, or disruptions by other components and should be investigated.'
|
||||
expr: |
|
||||
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
|
@ -82,7 +85,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
|
||||
}} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
|
@ -98,7 +102,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method
|
||||
}} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
|
@ -114,7 +119,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
|
||||
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
|
@ -128,7 +134,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
|
||||
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
|
@ -142,7 +149,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
|
||||
the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
|
@ -155,7 +163,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{
|
||||
$value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
|
@ -169,7 +178,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{
|
||||
$value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
|
@ -183,7 +193,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
without (code) > 0.01
|
||||
|
@ -197,7 +208,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}.'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
without (code) > 0.05
|
||||
|
@ -211,7 +223,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
||||
}} are slow.
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
|
|
|
@ -96,7 +96,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||
than 80%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
||||
|
@ -110,7 +111,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more
|
||||
than 90%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
||||
|
@ -126,7 +128,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
|
||||
80%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
|
@ -139,7 +142,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than
|
||||
90%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
|
@ -152,7 +156,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||
than 80%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
|
@ -165,7 +170,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more
|
||||
than 90%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
|
|
|
@ -38,7 +38,9 @@ alert: JaegerAgentHTTPServerErrs
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job,
|
||||
namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -51,7 +53,9 @@ alert: JaegerClientSpansDropped
|
|||
annotations:
|
||||
message: |
|
||||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -64,7 +68,9 @@ alert: JaegerAgentSpansDropped
|
|||
annotations:
|
||||
message: |
|
||||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by
|
||||
(instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -90,7 +96,9 @@ alert: JaegerCollectorDroppingSpans
|
|||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job,
|
||||
namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance,
|
||||
job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -103,7 +111,9 @@ alert: JaegerSamplingUpdateFailing
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job,
|
||||
namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -116,7 +126,8 @@ alert: JaegerCollectorPersistenceSlow
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m])))
|
||||
> 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -129,7 +140,9 @@ alert: JaegerThrottlingUpdateFailing
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job,
|
||||
namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -142,7 +155,9 @@ alert: JaegerQueryReqsFailing
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance,
|
||||
job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job,
|
||||
namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -155,7 +170,9 @@ alert: JaegerCassandraWritesFailing
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
|
||||
/ sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -168,7 +185,9 @@ alert: JaegerCassandraReadsFailing
|
|||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
|
||||
/ sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)>
|
||||
1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CockroachInstanceFlapping
|
||||
annotations:
|
||||
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m'
|
||||
message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{
|
||||
$value }} time(s) in 10m'
|
||||
expr: |
|
||||
resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
|
||||
for: 1m
|
||||
|
@ -64,7 +65,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CockroachStoreDiskLow
|
||||
annotations:
|
||||
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
|
||||
message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }}
|
||||
available disk fraction
|
||||
expr: |
|
||||
:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
|
||||
for: 30m
|
||||
|
@ -116,7 +118,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: CockroachHighOpenFDCount
|
||||
annotations:
|
||||
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used'
|
||||
message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }}
|
||||
fraction used'
|
||||
expr: |
|
||||
cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
|
||||
for: 10m
|
||||
|
|
|
@ -23,7 +23,10 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in list
|
||||
operations. This is likely causing it to not be able to expose metrics about Kubernetes
|
||||
objects correctly or at all.
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
|
@ -39,7 +42,10 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch
|
||||
operations. This is likely causing it to not be able to expose metrics about Kubernetes
|
||||
objects correctly or at all.
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
|
|
|
@ -24,7 +24,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubePodCrashLooping
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
|
||||
is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||
expr: |
|
||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
||||
|
@ -39,7 +40,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubePodNotReady
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state
|
||||
for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (
|
||||
|
@ -60,7 +62,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has not been
|
||||
rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
||||
expr: |
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
||||
|
@ -77,7 +81,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched
|
||||
the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
|
@ -100,7 +105,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched
|
||||
the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
|
@ -123,7 +129,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has not
|
||||
been rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
||||
expr: |
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
||||
|
@ -140,7 +148,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has
|
||||
not been rolled out.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
||||
expr: |
|
||||
(
|
||||
|
@ -171,7 +180,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
|
||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
|
||||
or progressed for at least 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||
expr: |
|
||||
(
|
||||
|
@ -208,7 +218,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeContainerWaiting
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
|
||||
has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
||||
expr: |
|
||||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
||||
|
@ -223,7 +234,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
|
@ -240,7 +252,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||
|
@ -255,7 +268,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeJobCompletion
|
||||
annotations:
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
|
||||
12 hours to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
||||
expr: |
|
||||
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
||||
|
@ -285,7 +299,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
|
||||
number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
|
||||
expr: |
|
||||
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
|
||||
|
@ -304,7 +319,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas
|
||||
for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
|
||||
expr: |
|
||||
kube_hpa_status_current_replicas{job="kube-state-metrics"}
|
||||
|
@ -323,7 +339,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
|
||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate
|
||||
node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||
|
@ -342,7 +359,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
|
|||
{{< code lang="yaml" >}}
|
||||
alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
||||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
|
@ -399,7 +417,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeQuotaFullyUsed
annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
@ -417,7 +436,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: CPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
@ -437,7 +457,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in
Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
kubelet_volume_stats_available_bytes{job="kubelet"}
@ -455,7 +476,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
(
@ -476,7 +499,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
@ -493,7 +517,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes components running.
message: There are {{ $value }} different semantic versions of Kubernetes components
running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
@ -508,7 +533,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}'
is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
@ -606,7 +632,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
@ -620,7 +647,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
@ -634,7 +662,9 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors have increased for it in the past five minutes. High
values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
@ -648,7 +678,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only
{{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
@ -709,7 +740,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
count by(node) (
@ -730,7 +762,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
message: The readiness status of node {{ $labels.node }} has changed {{ $value }}
times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
@ -745,7 +778,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
@ -760,7 +794,8 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md
{{< code lang="yaml" >}}
alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on
node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
@ -43,7 +44,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up fast.
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
@ -63,7 +65,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 5% space left.
expr: |
(
@ -81,7 +84,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
summary: Filesystem has less than 3% space left.
expr: |
(
@ -99,7 +103,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up.
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
@ -119,7 +124,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up fast.
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
@ -139,7 +145,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 5% inodes left.
expr: |
(
@ -157,7 +164,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
summary: Filesystem has less than 3% inodes left.
expr: |
(
@ -175,7 +183,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
@ -189,7 +198,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
@ -229,7 +239,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
(
@ -253,7 +264,8 @@ labels:
{{< code lang="yaml" >}}
alert: NodeClockNotSynchronising
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
on this host.
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
@ -35,13 +35,15 @@ labels:
{{< /code >}}

##### PrometheusNotificationQueueRunningFull
Prometheus alert notification queue predicted to run full in less than 30m.
Prometheus alert notification queue predicted to run full in less than

{{< code lang="yaml" >}}
alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
summary: Prometheus alert notification queue predicted to run full in less than 30m.
description: Alert notification queue of Prometheus {{$labels.instance}} is running
full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -56,14 +58,17 @@ labels:
{{< /code >}}

##### PrometheusErrorSendingAlertsToSomeAlertmanagers
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
'{{ printf "%.1f" $value }}% errors while sending alerts from

Prometheus has encountered more than 1% errors sending alerts to a specific

{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
@ -78,13 +83,14 @@ labels:
{{< /code >}}

##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.

{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
@ -120,7 +126,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
@ -134,7 +141,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
@ -162,7 +170,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
@ -176,7 +185,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
@ -190,7 +200,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value
}}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@ -214,7 +225,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value
}}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@ -235,8 +247,12 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than
configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
@ -255,7 +271,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f"
$value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
@ -269,7 +286,8 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
@ -283,8 +301,10 @@ labels:
{{< code lang="yaml" >}}
alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
@ -296,5 +316,5 @@ labels:
Following dashboards are generated from mixins and hosted on github:

- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
@ -23,7 +23,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
{{< code lang="yaml" >}}
alert: ThanosCompactMultipleRunning
annotations:
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
message: No more than one Thanos Compact instance should be running at once. There
are {{ $value }}
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
@ -47,7 +48,8 @@ labels:
{{< code lang="yaml" >}}
alert: ThanosCompactHighCompactionFailures
annotations:
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
}}% of compactions.
expr: |
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
@ -65,7 +67,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value |
|
||||
humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
|
@ -84,7 +87,8 @@ labels:
|
|||
alert: ThanosCompactHasNotRun
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24
|
||||
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
|
||||
/ 60 / 60 > 24
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
@ -96,7 +100,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of "query" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
||||
|
@ -113,7 +118,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of "query_range" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
|
@ -130,7 +136,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryGrpcServerErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
||||
|
@ -148,7 +155,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryGrpcClientErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}%
|
||||
of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||
|
@ -165,7 +173,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS
|
||||
queries for store endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
|
@ -182,7 +191,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryInstantLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for instant queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||
|
@ -199,7 +209,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryRangeLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for range queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||
|
@ -218,7 +229,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
|
@ -235,7 +247,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHttpRequestLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||
|
@ -252,7 +265,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHighReplicationFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
thanos_receive_replication_factor > 1
|
||||
and
|
||||
|
@ -279,7 +293,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHighForwardRequestFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
|
@ -296,7 +311,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{
|
||||
$value | humanize }} of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
|
@ -315,7 +331,8 @@ labels:
|
|||
alert: ThanosReceiveConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by
|
||||
(job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -326,7 +343,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveNoUpload
|
||||
annotations:
|
||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.
|
||||
message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
|
||||
latest data to object storage.
|
||||
expr: |
|
||||
(up{job=~"thanos-receive.*"} - 1)
|
||||
+ on (instance) # filters to only alert on current instance last 3h
|
||||
|
@ -356,7 +374,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosSidecarUnhealthy
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value
|
||||
}} seconds.
|
||||
expr: |
|
||||
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
|
||||
labels:
|
||||
|
@ -370,7 +389,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
|
||||
}}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
||||
|
@ -388,7 +408,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreSeriesGateLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for store series gate requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
|
@ -405,7 +426,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize
|
||||
}}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
|
@ -423,7 +445,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{
|
||||
$value }} seconds for the bucket operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
|
@ -452,12 +475,13 @@ labels:
{{< /code >}}

##### ThanosRuleSenderIsFailingAlerts
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to

{{< code lang="yaml" >}}
alert: ThanosRuleSenderIsFailingAlerts
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to
alertmanager.
expr: |
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
@ -488,7 +512,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleHighRuleEvaluationWarnings
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
|
||||
warnings.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 15m
|
||||
|
@ -501,7 +526,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency
|
||||
than interval for {{$labels.rule_group}}.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
||||
|
@ -518,7 +544,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}%
|
||||
of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -537,7 +564,8 @@ labels:
|
|||
alert: ThanosRuleConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job)
|
||||
!= 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
|
@ -548,7 +576,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS
|
||||
queries for query endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -566,7 +595,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS
|
||||
queries for Alertmanager endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
|
@ -584,7 +614,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleNoEvaluationFor10Intervals
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that
|
||||
did not evaluate for at least 10x of their expected interval.
|
||||
expr: |
|
||||
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
|
@ -599,7 +630,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosNoRuleEvaluations
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the
|
||||
past 2 minutes.
|
||||
expr: |
|
||||
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
||||
and
|
||||
|
@ -726,7 +758,8 @@ labels:
|
|||
{{< code lang="yaml" >}}
|
||||
alert: ThanosBucketReplicateRunLatency
|
||||
annotations:
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value
|
||||
}} seconds for the replicate operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|