mirror of https://github.com/monitoring-mixins/website.git

assets,site/content: daily assets regeneration

github-actions[bot] 2020-11-13 03:17:29 +00:00
parent efa80a72eb
commit 9105d0ab1d
6 changed files with 62 additions and 62 deletions


@@ -46,22 +46,6 @@ groups:
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
@@ -217,3 +201,19 @@ groups:
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
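A minimal promtool unit-test sketch for the re-added PrometheusErrorSendingAlertsToAnyAlertmanager rule above; the rule file name and the input series are illustrative assumptions, not part of this commit. With errors growing at 6/min against 60/min sent, the error ratio is 10%, so the alert should be firing once its 15m "for" duration has elapsed.
{{< code lang="yaml" >}}
# Run with: promtool test rules alertmanager_errors_test.yaml
rule_files:
  - prometheus_alerts.yaml        # assumed path to the generated alert rules above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 6 errors/min vs 60 sent/min => a 10% error ratio, above the 3% threshold.
      - series: 'prometheus_notifications_errors_total{job="prometheus",instance="p1",alertmanager="am1"}'
        values: '0+6x25'
      - series: 'prometheus_notifications_sent_total{job="prometheus",instance="p1",alertmanager="am1"}'
        values: '0+60x25'
    alert_rule_test:
      - eval_time: 20m
        alertname: PrometheusErrorSendingAlertsToAnyAlertmanager
        exp_alerts:
          - exp_labels:
              severity: critical
              job: prometheus
              instance: p1
{{< /code >}}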


@@ -14,7 +14,7 @@ groups:
annotations:
description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
summary: Thanos Compact has failed to run and is now halted.
expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1
for: 5m
labels:
severity: warning
@@ -123,9 +123,9 @@ groups:
summary: Thanos Query is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
/
sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
) * 100 > 1
for: 15m
labels:
@@ -436,9 +436,9 @@ groups:
summary: Thanos Rule is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
/
sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
* 100 > 1
)
for: 15m
@@ -451,9 +451,9 @@ groups:
summary: Thanos Rule is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
/
sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
* 100 > 1
)
for: 15m
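The hunks above follow upstream Thanos metric renames (thanos_querier_* becomes thanos_query_*, thanos_ruler_* becomes thanos_rule_*, and thanos_compactor_halted becomes thanos_compact_halted). Where old and new Thanos versions briefly coexist during an upgrade, a compatibility recording rule along these lines can keep queries working; the group and record names below are illustrative assumptions, not part of the generated assets.
{{< code lang="yaml" >}}
groups:
  - name: thanos-rename-compat        # illustrative group name
    rules:
      # Take the renamed counter where it exists, otherwise fall back to the
      # pre-rename name still exposed by older Thanos Query components.
      - record: job:thanos_query_store_apis_dns_failures:rate5m
        expr: |
          sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
          or
          sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
      - record: job:thanos_query_store_apis_dns_lookups:rate5m
        expr: |
          sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
          or
          sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
{{< /code >}}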


@@ -1144,7 +1144,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)",
"expr": "sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "lookups {{job}}",
@@ -1223,7 +1223,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(thanos_querier_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
"expr": "sum(rate(thanos_query_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "error",


@@ -17,11 +17,11 @@ groups:
record: :grpc_client_failures_per_stream:sum_rate
- expr: |
(
sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
/
sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
)
record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
- expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
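The recording rule renamed above can be consumed directly under its new name. Purely as an illustration (the group name, alert name, and 1% threshold mirror the Thanos Query alert earlier in this commit but are assumptions, not generated rules):
{{< code lang="yaml" >}}
groups:
  - name: thanos-query-dns-example    # illustrative group name
    rules:
      - alert: ThanosQueryStoreDNSFailureRatioHigh   # assumed alert name
        expr: ':thanos_query_store_apis_dns_failures_per_lookup:sum_rate * 100 > 1'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: More than 1% of Thanos Query store API DNS lookups are failing.
{{< /code >}}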


@@ -82,29 +82,6 @@ labels:
severity: warning
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
{{< /code >}}
##### PrometheusNotConnectedToAlertmanagers
{{< code lang="yaml" >}}
@@ -320,6 +297,29 @@ labels:
severity: warning
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:


@@ -39,7 +39,7 @@ alert: ThanosCompactHalted
annotations:
description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
summary: Thanos Compact has failed to run and is now halted.
expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1
for: 5m
labels:
severity: warning
@@ -187,9 +187,9 @@ annotations:
summary: Thanos Query is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
/
sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
) * 100 > 1
for: 15m
labels:
@@ -618,9 +618,9 @@ annotations:
summary: Thanos Rule is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
/
sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
* 100 > 1
)
for: 15m
@@ -638,9 +638,9 @@ annotations:
summary: Thanos Rule is having high number of DNS failures.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
/
sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
* 100 > 1
)
for: 15m
@@ -868,16 +868,16 @@ expr: |
record: :grpc_client_failures_per_stream:sum_rate
{{< /code >}}
##### :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
##### :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
{{< code lang="yaml" >}}
expr: |
(
sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
/
sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
)
record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
{{< /code >}}
##### :query_duration_seconds:histogram_quantile