Mirror of https://github.com/monitoring-mixins/website.git (synced 2024-12-14 11:37:31 +00:00)
assets,site/content: daily assets regeneration
parent efa80a72eb
commit 9105d0ab1d
6 changed files with 62 additions and 62 deletions
@@ -46,22 +46,6 @@ groups:
     for: 15m
     labels:
       severity: warning
-  - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-    annotations:
-      description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
-        from Prometheus {{$labels.instance}} to any Alertmanager.'
-      summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-    expr: |
-      min without(alertmanager) (
-        rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
-      /
-        rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
-      )
-      * 100
-      > 3
-    for: 15m
-    labels:
-      severity: critical
   - alert: PrometheusNotConnectedToAlertmanagers
     annotations:
       description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
@@ -217,3 +201,19 @@ groups:
     for: 15m
     labels:
       severity: warning
+  - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+    annotations:
+      description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
+        from Prometheus {{$labels.instance}} to any Alertmanager.'
+      summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+    expr: |
+      min without (alertmanager) (
+        rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
+      /
+        rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
+      )
+      * 100
+      > 3
+    for: 15m
+    labels:
+      severity: critical
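Apart from being re-emitted at a different position in the file and gaining a space in `min without (alertmanager)`, the re-added alert above matches the block removed in the previous hunk. As a reading aid, here is a minimal PromQL sketch of the rule's inner ratio, written as an ad-hoc query derived from the rule (it is not part of the generated assets): it returns the per-Alertmanager notification error percentage, which `min without (alertmanager)` then collapses to the best-performing Alertmanager, so the alert only fires when even that one stays above 3% for 15 minutes.

    # Per-Alertmanager notification error percentage over the last 5 minutes
    # (the rule's expression before `min without (alertmanager)` and the > 3 threshold).
    (
      rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
    /
      rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
    ) * 100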
@@ -14,7 +14,7 @@ groups:
     annotations:
       description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
       summary: Thanos Compact has failed to run ans is now halted.
-    expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
+    expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1
     for: 5m
     labels:
       severity: warning
@@ -123,9 +123,9 @@ groups:
       summary: Thanos Query is having high number of DNS failures.
     expr: |
       (
-        sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
+        sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
       /
-        sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
+        sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
       ) * 100 > 1
     for: 15m
     labels:
@@ -436,9 +436,9 @@ groups:
       summary: Thanos Rule is having high number of DNS failures.
     expr: |
       (
-        sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
+        sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
       /
-        sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
+        sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
       * 100 > 1
       )
     for: 15m
@@ -451,9 +451,9 @@ groups:
       summary: Thanos Rule is having high number of DNS failures.
     expr: |
       (
-        sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
+        sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
       /
-        sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
+        sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
       * 100 > 1
       )
     for: 15m
@@ -1144,7 +1144,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)",
+          "expr": "sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "lookups {{job}}",
@@ -1223,7 +1223,7 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "sum(rate(thanos_querier_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
+          "expr": "sum(rate(thanos_query_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_query_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "error",
@@ -17,11 +17,11 @@ groups:
     record: :grpc_client_failures_per_stream:sum_rate
   - expr: |
       (
-        sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
+        sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
       /
-        sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
+        sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
       )
-    record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
+    record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
   - expr: |
       histogram_quantile(0.99,
         sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
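This hunk, like the other Thanos changes in this regeneration, appears to track the rename of the Thanos component metric prefixes (`thanos_querier_*`, `thanos_ruler_*`, `thanos_compactor_*` becoming `thanos_query_*`, `thanos_rule_*`, `thanos_compact_*`): both the recording rule's expression and its `record` name change, while the ratio it records (store-API DNS failures per lookup for the query component) is unchanged. A consumer that has to work while both metric names may still be present could fall back with `or`; this is a hypothetical migration query sketched from the hunk above, not part of the mixin:

    # Prefer the new metric name, fall back to the old one if only it exists.
    sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
      or
    sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))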
@@ -82,29 +82,6 @@ labels:
   severity: warning
 {{< /code >}}
 
-##### PrometheusErrorSendingAlertsToAnyAlertmanager
-'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
-Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-
-{{< code lang="yaml" >}}
-alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-annotations:
-  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
-    Prometheus {{$labels.instance}} to any Alertmanager.'
-  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-expr: |
-  min without(alertmanager) (
-    rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
-  /
-    rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
-  )
-  * 100
-  > 3
-for: 15m
-labels:
-  severity: critical
-{{< /code >}}
-
 ##### PrometheusNotConnectedToAlertmanagers
 
 {{< code lang="yaml" >}}
@@ -320,6 +297,29 @@ labels:
   severity: warning
 {{< /code >}}
 
+##### PrometheusErrorSendingAlertsToAnyAlertmanager
+'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
+Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+
+{{< code lang="yaml" >}}
+alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+annotations:
+  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
+    Prometheus {{$labels.instance}} to any Alertmanager.'
+  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
+expr: |
+  min without (alertmanager) (
+    rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
+  /
+    rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
+  )
+  * 100
+  > 3
+for: 15m
+labels:
+  severity: critical
+{{< /code >}}
+
 ## Dashboards
 Following dashboards are generated from mixins and hosted on github:
 
@@ -39,7 +39,7 @@ alert: ThanosCompactHalted
 annotations:
   description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
   summary: Thanos Compact has failed to run ans is now halted.
-expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
+expr: thanos_compact_halted{job=~"thanos-compact.*"} == 1
 for: 5m
 labels:
   severity: warning
@@ -187,9 +187,9 @@ annotations:
   summary: Thanos Query is having high number of DNS failures.
 expr: |
   (
-    sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
+    sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
   /
-    sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
+    sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
   ) * 100 > 1
 for: 15m
 labels:
@@ -618,9 +618,9 @@ annotations:
   summary: Thanos Rule is having high number of DNS failures.
 expr: |
   (
-    sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
+    sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
   /
-    sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
+    sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
   * 100 > 1
   )
 for: 15m
@@ -638,9 +638,9 @@ annotations:
   summary: Thanos Rule is having high number of DNS failures.
 expr: |
   (
-    sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
+    sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
   /
-    sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
+    sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
   * 100 > 1
   )
 for: 15m
@@ -868,16 +868,16 @@ expr: |
 record: :grpc_client_failures_per_stream:sum_rate
 {{< /code >}}
 
-##### :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
+##### :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
 
 {{< code lang="yaml" >}}
 expr: |
   (
-    sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
+    sum(rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
   /
-    sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
+    sum(rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
   )
-record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
+record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
 {{< /code >}}
 
 ##### :query_duration_seconds:histogram_quantile