2021-12-28 03:25:10 +00:00
# Alerting rules for Loki, laid out in the Prometheus rule-file shape:
# one group named `loki_alerts` whose `rules` list holds the alert definitions
# that follow (each with alert / annotations / expr / for / labels keys).
groups :
- name : loki_alerts
rules :
# LokiRequestErrors
# Per (cluster, namespace, job, route): requests with a 5xx status code as a
# percentage of all requests, both measured as 2m rates of
# loki_request_duration_seconds_count. Triggers when that percentage is above
# 10 and stays there for 15m.
- alert: LokiRequestErrors
  expr: |
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
    /
    sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
    > 10
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: Loki request error rate is high.
    # $value is the percentage computed by expr, printed with two decimals.
    description: |
      {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
# LokiRequestPanics
# Per (cluster, namespace, job): the increase of the loki_panic_total counter
# over the last 10m. Any value above 0 triggers the alert immediately, since
# no `for` hold-off is configured.
- alert: LokiRequestPanics
  expr: |
    sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
  labels:
    severity: critical
  annotations:
    summary: Loki requests are causing code panics.
    # $value is a count of panics (the 10m counter increase), not a percentage,
    # so it is reported without a "%" sign. Two decimals are kept because
    # increase() extrapolates and may yield a non-integer value.
    description: |
      {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} panics over the last 10 minutes.
# LokiRequestLatency
# Triggers when the 99th-percentile request latency series
# cluster_namespace_job_route:loki_request_duration_seconds:99quantile
# (presumably a recording rule defined outside this file - confirm) is above
# 1 for 15m.
# The route matcher skips any route containing "tail" (case-insensitive) and
# the scheduler QuerierLoop route; these look like long-lived streaming
# routes whose duration is not a meaningful latency - confirm.
- alert: LokiRequestLatency
  expr: |
    cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
  for: 15m
  labels:
    severity: critical
  annotations:
    # This rule looks at latency of all requests on a route, not only failed
    # ones, so the summary does not mention errors.
    summary: Loki request latency is high.
    description: |
      {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
# LokiTooManyCompactorsRunning
# Per (cluster, namespace): sum of loki_boltdb_shipper_compactor_running.
# Triggers as a warning when that sum is greater than 1 for 5m, matching the
# "more than 5m" wording of the description.
- alert: LokiTooManyCompactorsRunning
  expr: |
    sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Loki deployment is running more than one compactor.
    # $value is the summed gauge, printed as a whole number.
    description: |
      {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
2024-10-25 03:33:09 +00:00
# LokiCompactorHasNotSuccessfullyRunCompaction (stale last run)
# Considers only series whose last-successful-run timestamp is non-zero,
# computes the age of each (time() - timestamp), and takes the smallest age
# per (cluster, namespace). Triggers when even that smallest age exceeds
# 3 hours (60 * 60 * 3 seconds) for 1h.
# NOTE(review): another rule in this group uses the same alert name and the
# same labels; this one handles the "ran before, but not recently" case.
# Tools that flag duplicate rule names will report the pair.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
  expr: |
    # The "last successful run" metric is updated even if the compactor owns no tenants,
    # so this alert correctly doesn't fire if compactor has nothing to do.
    min (
    time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
    )
    by (cluster, namespace)
    > 60 * 60 * 3
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Loki compaction has not run in the last 3 hours since the last compaction.
    description: |
      {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
# LokiCompactorHasNotSuccessfullyRunCompaction (never ran)
# Per (cluster, namespace): the highest value the last-successful-run
# timestamp reached on any series during the past 3h. Triggers when that
# highest value is exactly 0 for 1h, i.e. no series recorded a non-zero
# timestamp in the window.
# NOTE(review): another rule in this group uses the same alert name and the
# same labels; that one filters on timestamp > 0, so the two conditions
# cannot both hold for the same (cluster, namespace) at one evaluation.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
  expr: |
    # The "last successful run" metric is updated even if the compactor owns no tenants,
    # so this alert correctly doesn't fire if compactor has nothing to do.
    max(
    max_over_time(
    loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
    )
    ) by (cluster, namespace)
    == 0
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Loki compaction has not run in the last 3h since startup.
    description: |
      {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.