1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
monitoring-mixins-website/assets/apache-airflow/alerts.yaml
Vitaly Zhuravlev b3b400137a Add jsonnet-libs mixins
Add blackbox exporter
Add mysql exporter
2024-05-04 12:01:41 +00:00

45 lines
2 KiB
YAML

# Alerting rules for Apache Airflow, laid out as a Prometheus rule file
# (groups -> rules). Every rule compares a metric expression against a fixed
# threshold; `for` is how long the condition must hold before the alert fires.
groups:
- name: apache-airflow
  rules:
  # Pool starvation: the gauge has been above 0 continuously for 5 minutes.
  - alert: ApacheAirflowStarvingPoolTasks
    annotations:
      description: |
        The number of starved tasks is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.pool_name }} which is above the threshold of 0.
      summary: There are starved tasks detected in the Apache Airflow pool.
    expr: |
      airflow_pool_starving_tasks > 0
    for: 5m
    labels:
      severity: critical
  # Average schedule delay per DAG run over a 5m window: increase of the
  # `_sum` series divided by increase of the `_count` series. clamp_min(..., 1)
  # keeps the divisor at 1 or more so a window with no runs yields 0, not NaN.
  # NOTE(review): the threshold of 10 is in the metric's own unit (presumably
  # seconds) - confirm against the exporter configuration.
  - alert: ApacheAirflowDAGScheduleDelayWarningLevel
    annotations:
      description: |
        The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 10.
      summary: The delay in DAG schedule time to DAG run time has reached the warning
        threshold.
    expr: |
      increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 10
    for: 1m
    labels:
      severity: warning
  # Same expression as the warning-level rule, with the threshold raised to 60
  # and the severity raised to critical.
  - alert: ApacheAirflowDAGScheduleDelayCriticalLevel
    annotations:
      description: |
        The average delay in DAG schedule to run time is {{ printf "%.0f" $value }} over the last 5m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 60.
      summary: The delay in DAG schedule time to DAG run time has reached the critical
        threshold.
    expr: |
      increase(airflow_dagrun_schedule_delay_sum[5m]) / clamp_min(increase(airflow_dagrun_schedule_delay_count[5m]),1) > 60
    for: 1m
    labels:
      severity: critical
  # DAG failures: the failed-run duration counter increased within the last
  # 5m window, and that has stayed true for 1 minute.
  - alert: ApacheAirflowDAGFailures
    annotations:
      description: |
        The number of DAG failures seen is {{ printf "%.0f" $value }} over the last 5m for {{ $labels.instance }} - {{ $labels.dag_id }} which is above the threshold of 0.
      summary: There have been DAG failures detected.
    expr: |
      increase(airflow_dagrun_duration_failed_count[5m]) > 0
    for: 1m
    labels:
      severity: critical