1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
monitoring-mixins-website/assets/presto/alerts.yaml
Vitaly Zhuravlev b3b400137a Add jsonnet-libs mixins
Add blackbox exporter
Add mysql exporter
2024-05-04 12:01:41 +00:00

84 lines
3.8 KiB
YAML

# Prometheus-style alerting rule file for Presto: one rule group named
# "presto-alerts" whose rules evaluate presto_* metrics.
groups:
- name: presto-alerts
rules:
# Critical: the insufficient-resources failure counter has grown within the
# last 5 minutes, and the condition has held for 5m.
- alert: PrestoHighInsufficientResources
  expr: |
    increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      The amount of failures that are occurring due to insufficient
      resources are scaling, causing saturation in the system.
    description: >-
      The number of insufficient resource failures on
      {{$labels.instance}} is {{ printf "%.0f" $value }} which is greater
      than the threshold of 0.
# Warning: the failed-task counter has grown at all within the last
# 5 minutes, and the condition has held for 5m.
- alert: PrestoHighTaskFailuresWarning
  expr: |
    increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      The amount of tasks that are failing is increasing, this might
      affect query processing and could result in incomplete or incorrect
      results.
    description: >-
      The number of task failures on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is above the threshold of 0.
# Critical: failed tasks in the last 5 minutes, expressed as a percentage of
# the failed tasks in the last 10 minutes, exceed 30 for 5m.
- alert: PrestoHighTaskFailuresCritical
  # clamp_min(..., 1) keeps the denominator at 1 or more, so the division
  # never divides by zero when no failures occurred in the 10m window.
  # NOTE(review): numerator and denominator are the same failure counter, so
  # this measures how recent the failures are rather than failures relative
  # to total tasks - confirm this is the intended signal.
  expr: |
    increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > 30
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      The amount of tasks that are failing has reached a critical level.
      This might affect query processing and could result in incomplete or
      incorrect results.
    # The threshold text previously ended in "30%s.", a stray format
    # placeholder left over from template rendering; it now reads "30%.".
    description: >-
      The number of task failures on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is above the threshold of 30%.
# Warning: the executor's queued-task metric has risen by more than 5 over
# the last 5 minutes, and the condition has held for 5m.
- alert: PrestoHighQueuedTaskCount
  # NOTE(review): the metric name suggests a gauge (a current queue depth);
  # increase() is meant for counters and treats any drop as a reset - confirm
  # the metric type before relying on this threshold.
  expr: |
    increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > 5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      The amount of tasks that are being put in queue is increasing.
      A high number of queued tasks can lead to increased query latencies
      and degraded system performance.
    description: >-
      The number of queued tasks on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is greater than the threshold of 5
# Critical: the general memory pool's blocked-node metric has risen over the
# last 5 minutes, and the condition has held for 5m.
- alert: PrestoHighBlockedNodes
  # NOTE(review): the metric name suggests a gauge (current number of blocked
  # nodes); increase() is meant for counters - confirm the metric type.
  expr: |
    increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      The amount of nodes that are blocked due to memory restrictions is
      increasing. Blocked nodes can cause performance degradation and
      resource starvation.
    description: >-
      The number of blocked nodes on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is greater than the threshold of 0
# Warning: the failed-query counter has grown at all within the last
# 5 minutes, and the condition has held for 5m.
- alert: PrestoHighFailedQueriesWarning
  expr: |
    increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      The amount of queries failing is increasing. Failed queries can
      prevent users from accessing data, disrupt analytics processes, and
      might indicate underlying issues with the system or data.
    description: >-
      The number of failed queries on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is greater than the threshold of 0
# Critical: failed queries in the last 5 minutes, expressed as a percentage
# of the failed queries in the last 10 minutes, exceed 30 for 5m.
- alert: PrestoHighFailedQueriesCritical
  # clamp_min(..., 1) keeps the denominator at 1 or more, so the division
  # never divides by zero when no queries failed in the 10m window.
  # NOTE(review): numerator and denominator are the same failure counter, so
  # this measures how recent the failures are rather than failures relative
  # to total queries - confirm this is the intended signal.
  expr: |
    increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > 30
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      The amount of queries failing has increased to critical levels.
      Failed queries can prevent users from accessing data, disrupt
      analytics processes, and might indicate underlying issues with the
      system or data.
    # The threshold text previously ended in "30%s.", a stray format
    # placeholder left over from template rendering; it now reads "30%.".
    description: >-
      The number of failed queries on {{$labels.instance}} is
      {{ printf "%.0f" $value }} which is greater than the threshold of 30%.