diff --git a/assets/coredns/alerts.yaml b/assets/coredns/alerts.yaml index af11ab7..22048c8 100644 --- a/assets/coredns/alerts.yaml +++ b/assets/coredns/alerts.yaml @@ -3,8 +3,9 @@ groups: rules: - alert: CoreDNSDown annotations: - message: CoreDNS has disappeared from Prometheus target discovery. + description: CoreDNS has disappeared from Prometheus target discovery. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsdown + summary: CoreDNS has disappeared from Prometheus target discovery. expr: | absent(up{job="kube-dns"} == 1) for: 15m @@ -12,35 +13,38 @@ groups: severity: critical - alert: CoreDNSLatencyHigh annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server - {{ $labels.server }} zone {{ $labels.zone }} . + description: CoreDNS has 99th percentile latency of {{ $value }} seconds for + server {{ $labels.server }} zone {{ $labels.zone }} . runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh + summary: CoreDNS is experiencing high 99th percentile latency. expr: | - histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4 + histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) without (instance,pod)) > 4 for: 10m labels: severity: critical - alert: CoreDNSErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of requests. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of requests. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh + summary: CoreDNS is returning SERVFAIL. expr: | - sum(rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.03 + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.03 for: 10m labels: severity: critical - alert: CoreDNSErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of requests. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of requests. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh + summary: CoreDNS is returning SERVFAIL. expr: | - sum(rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.01 + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.01 for: 10m labels: severity: warning @@ -48,54 +52,59 @@ groups: rules: - alert: CoreDNSForwardLatencyHigh annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding + description: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh + summary: CoreDNS is experiencing high latency forwarding requests. expr: | - histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4 + histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) without (pod, instance, rcode)) > 4 for: 10m labels: severity: critical - alert: CoreDNSForwardErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of forward requests to {{ $labels.to }}. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of forward requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh + summary: CoreDNS is returning SERVFAIL for forward requests. expr: | - sum(rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.03 + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.03 for: 10m labels: severity: critical - alert: CoreDNSForwardErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of forward requests to {{ $labels.to }}. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of forward requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh + summary: CoreDNS is returning SERVFAIL for forward requests. expr: | - sum(rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.01 + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.01 for: 10m labels: severity: warning - alert: CoreDNSForwardHealthcheckFailureCount annotations: - message: CoreDNS health checks have failed to upstream server {{ $labels.to + description: CoreDNS health checks have failed to upstream server {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardhealthcheckfailurecount + summary: CoreDNS health checks have failed to upstream server. expr: | - sum(rate(coredns_forward_healthcheck_failures_total{job="kube-dns"}[5m])) by (to) > 0 + sum without (pod, instance) (rate(coredns_forward_healthcheck_failures_total{job="kube-dns"}[5m])) > 0 for: 10m labels: severity: warning - alert: CoreDNSForwardHealthcheckBrokenCount annotations: - message: CoreDNS health checks have failed for all upstream servers. + description: CoreDNS health checks have failed for all upstream servers. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardhealthcheckbrokencount + summary: CoreDNS health checks have failed for all upstream servers. expr: | - sum(rate(coredns_forward_healthcheck_broken_total{job="kube-dns"}[5m])) > 0 + sum without (pod, instance) (rate(coredns_forward_healthcheck_broken_total{job="kube-dns"}[5m])) > 0 for: 10m labels: severity: warning diff --git a/site/content/coredns/_index.md b/site/content/coredns/_index.md index f57192d..d246595 100644 --- a/site/content/coredns/_index.md +++ b/site/content/coredns/_index.md @@ -24,8 +24,9 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSDown annotations: - message: CoreDNS has disappeared from Prometheus target discovery. + description: CoreDNS has disappeared from Prometheus target discovery. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsdown + summary: CoreDNS has disappeared from Prometheus target discovery. expr: | absent(up{job="kube-dns"} == 1) for: 15m @@ -39,11 +40,12 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSLatencyHigh annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server + description: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} . runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh + summary: CoreDNS is experiencing high 99th percentile latency. expr: | - histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4 + histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) without (instance,pod)) > 4 for: 10m labels: severity: critical @@ -55,13 +57,14 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of - requests. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} + of requests. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh + summary: CoreDNS is returning SERVFAIL. expr: | - sum(rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.03 + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.03 for: 10m labels: severity: critical @@ -73,13 +76,14 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of - requests. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} + of requests. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh + summary: CoreDNS is returning SERVFAIL. expr: | - sum(rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.01 + sum without (pod, instance, server, zone, view, rcode, plugin) (rate(coredns_dns_responses_total{job="kube-dns"}[5m])) > 0.01 for: 10m labels: severity: warning @@ -93,11 +97,12 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSForwardLatencyHigh annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding + description: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh + summary: CoreDNS is experiencing high latency forwarding requests. expr: | - histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4 + histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) without (pod, instance, rcode)) > 4 for: 10m labels: severity: critical @@ -109,13 +114,14 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSForwardErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of - forward requests to {{ $labels.to }}. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} + of forward requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh + summary: CoreDNS is returning SERVFAIL for forward requests. expr: | - sum(rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.03 + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.03 for: 10m labels: severity: critical @@ -127,13 +133,14 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSForwardErrorsHigh annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of - forward requests to {{ $labels.to }}. + description: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} + of forward requests to {{ $labels.to }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh + summary: CoreDNS is returning SERVFAIL for forward requests. expr: | - sum(rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns",rcode="SERVFAIL"}[5m])) / - sum(rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.01 + sum without (pod, instance, rcode) (rate(coredns_forward_responses_total{job="kube-dns"}[5m])) > 0.01 for: 10m labels: severity: warning @@ -145,10 +152,12 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSForwardHealthcheckFailureCount annotations: - message: CoreDNS health checks have failed to upstream server {{ $labels.to }}. + description: CoreDNS health checks have failed to upstream server {{ $labels.to + }}. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardhealthcheckfailurecount + summary: CoreDNS health checks have failed to upstream server. expr: | - sum(rate(coredns_forward_healthcheck_failures_total{job="kube-dns"}[5m])) by (to) > 0 + sum without (pod, instance) (rate(coredns_forward_healthcheck_failures_total{job="kube-dns"}[5m])) > 0 for: 10m labels: severity: warning @@ -160,10 +169,11 @@ https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-core {{< code lang="yaml" >}} alert: CoreDNSForwardHealthcheckBrokenCount annotations: - message: CoreDNS health checks have failed for all upstream servers. + description: CoreDNS health checks have failed for all upstream servers. runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardhealthcheckbrokencount + summary: CoreDNS health checks have failed for all upstream servers. expr: | - sum(rate(coredns_forward_healthcheck_broken_total{job="kube-dns"}[5m])) > 0 + sum without (pod, instance) (rate(coredns_forward_healthcheck_broken_total{job="kube-dns"}[5m])) > 0 for: 10m labels: severity: warning