2020-07-01 09:17:43 +00:00
|
|
|
groups:
|
|
|
|
- name: cortex_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexIngesterUnhealthy
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{
|
|
|
|
printf "%f" $value }} unhealthy ingester(s).
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexRequestErrors
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2020-12-15 03:27:30 +00:00
|
|
|
100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
/
|
2020-12-15 03:27:30 +00:00
|
|
|
sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
> 1
|
|
|
|
for: 15m
|
|
|
|
labels:
|
2021-04-22 03:34:19 +00:00
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- alert: CortexRequestLatency
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|
|
|
|
expr: |
|
2021-01-07 03:47:04 +00:00
|
|
|
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
|
2020-07-01 09:17:43 +00:00
|
|
|
>
|
|
|
|
2.5
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexTableSyncFailure
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
|
|
|
|
expr: |
|
|
|
|
100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
|
|
|
|
/
|
|
|
|
rate(cortex_table_manager_sync_duration_seconds_count[15m])
|
|
|
|
> 10
|
|
|
|
for: 30m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexQueriesIncorrect
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
The Cortex cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m]))
|
|
|
|
/
|
|
|
|
sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2021-06-23 03:25:35 +00:00
|
|
|
- alert: CortexInconsistentRuntimeConfig
|
2020-08-12 17:33:36 +00:00
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}.
|
2020-08-12 17:33:36 +00:00
|
|
|
expr: |
|
2021-06-23 03:25:35 +00:00
|
|
|
count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
|
2020-08-12 17:33:36 +00:00
|
|
|
for: 1h
|
|
|
|
labels:
|
2021-06-23 03:25:35 +00:00
|
|
|
severity: critical
|
2020-07-20 14:43:19 +00:00
|
|
|
- alert: CortexBadRuntimeConfig
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
|
|
|
message: |
|
2020-07-20 14:43:19 +00:00
|
|
|
{{ $labels.job }} failed to reload runtime config.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-06-23 03:25:35 +00:00
|
|
|
# The metric value is reset to 0 on error while reloading the config at runtime.
|
2020-07-20 14:43:19 +00:00
|
|
|
cortex_runtime_config_last_reload_successful == 0
|
2021-01-19 04:04:52 +00:00
|
|
|
for: 5m
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
2021-06-23 03:25:35 +00:00
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- alert: CortexFrontendQueriesStuck
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-frontend.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
sum by (cluster, namespace) (cortex_query_frontend_queue_length) > 1
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-12-15 03:27:30 +00:00
|
|
|
- alert: CortexSchedulerQueriesStuck
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-scheduler.
|
2020-12-15 03:27:30 +00:00
|
|
|
expr: |
|
|
|
|
sum by (cluster, namespace) (cortex_query_scheduler_queue_length) > 1
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-07-03 03:24:59 +00:00
|
|
|
- alert: CortexMemcachedRequestErrors
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Memcached {{ $labels.name }} used by Cortex {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
(
|
|
|
|
sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) /
|
|
|
|
sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operations_total[1m]))
|
|
|
|
) * 100 > 5
|
|
|
|
for: 5m
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexIngesterRestarts
|
|
|
|
annotations:
|
2020-08-13 10:50:10 +00:00
|
|
|
message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
|
|
|
|
$value }} times in the last 30 mins.'
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-06-09 03:51:16 +00:00
|
|
|
changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
2021-06-09 03:51:16 +00:00
|
|
|
severity: warning
|
2020-07-01 09:17:43 +00:00
|
|
|
- alert: CortexTransferFailed
|
|
|
|
annotations:
|
|
|
|
message: '{{ $labels.job }}/{{ $labels.instance }} transfer failed.'
|
|
|
|
expr: |
|
|
|
|
max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexOldChunkInMemory
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
|
|
|
|
expr: |
|
|
|
|
(time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
|
|
|
|
and
|
|
|
|
(cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2020-12-15 03:27:30 +00:00
|
|
|
- alert: CortexMemoryMapAreasTooHigh
|
|
|
|
annotations:
|
|
|
|
message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas
|
|
|
|
close to the limit.'
|
|
|
|
expr: |
|
2021-01-20 04:06:35 +00:00
|
|
|
process_memory_map_areas{job=~".+(cortex|ingester.*|store-gateway)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester.*|store-gateway)"} > 0.8
|
2020-12-15 03:27:30 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-04-23 03:33:43 +00:00
|
|
|
- name: cortex_ingester_instance_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexIngesterReachingSeriesLimit
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit.
|
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"})
|
|
|
|
and ignoring (limit)
|
|
|
|
(cortex_ingester_instance_limits{limit="max_series"} > 0)
|
|
|
|
) > 0.7
|
2021-07-29 03:25:46 +00:00
|
|
|
for: 3h
|
2021-04-23 03:33:43 +00:00
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexIngesterReachingSeriesLimit
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit.
|
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"})
|
|
|
|
and ignoring (limit)
|
|
|
|
(cortex_ingester_instance_limits{limit="max_series"} > 0)
|
2021-07-29 03:25:46 +00:00
|
|
|
) > 0.85
|
2021-04-23 03:33:43 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterReachingTenantsLimit
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit.
|
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"})
|
|
|
|
and ignoring (limit)
|
|
|
|
(cortex_ingester_instance_limits{limit="max_tenants"} > 0)
|
|
|
|
) > 0.7
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexIngesterReachingTenantsLimit
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit.
|
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"})
|
|
|
|
and ignoring (limit)
|
|
|
|
(cortex_ingester_instance_limits{limit="max_tenants"} > 0)
|
|
|
|
) > 0.8
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- name: cortex_wal_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexWALCorruption
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_ingester_wal_corruptions_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexCheckpointCreationFailed
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexCheckpointCreationFailed
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexCheckpointDeletionFailed
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexCheckpointDeletionFailed
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
{{ $labels.job }}/{{ $labels.instance }} is failing to delete checkpoint.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- name: cortex-provisioning
|
|
|
|
rules:
|
|
|
|
- alert: CortexProvisioningMemcachedTooSmall
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Chunk memcached cluster in {{ $labels.cluster }}/{{ $labels.namespace }} is too small, should be at least {{ printf "%.2f" $value }}GB.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
4 *
|
|
|
|
sum by (cluster, namespace) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
|
|
|
|
/ 1e9
|
|
|
|
)
|
|
|
|
>
|
|
|
|
(
|
|
|
|
sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
|
|
|
|
)
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexProvisioningTooManyActiveSeries
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2020-08-12 17:33:36 +00:00
|
|
|
avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
|
2021-07-03 03:24:59 +00:00
|
|
|
for: 2h
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexProvisioningTooManyWrites
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
avg by (cluster, namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2020-08-12 17:33:36 +00:00
|
|
|
- alert: CortexAllocatingTooMuchMemory
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory.
|
2020-08-12 17:33:36 +00:00
|
|
|
expr: |
|
|
|
|
(
|
2020-09-23 03:20:42 +00:00
|
|
|
container_memory_working_set_bytes{container="ingester"}
|
2020-08-12 17:33:36 +00:00
|
|
|
/
|
2020-09-23 03:20:42 +00:00
|
|
|
container_spec_memory_limit_bytes{container="ingester"}
|
2021-01-28 03:32:19 +00:00
|
|
|
) > 0.65
|
2020-08-12 17:33:36 +00:00
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexAllocatingTooMuchMemory
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2020-08-12 17:33:36 +00:00
|
|
|
(
|
2020-09-23 03:20:42 +00:00
|
|
|
container_memory_working_set_bytes{container="ingester"}
|
2020-07-01 09:17:43 +00:00
|
|
|
/
|
2020-09-23 03:20:42 +00:00
|
|
|
container_spec_memory_limit_bytes{container="ingester"}
|
2020-08-12 17:33:36 +00:00
|
|
|
) > 0.8
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- name: ruler_alerts
|
|
|
|
rules:
|
2021-07-03 03:24:59 +00:00
|
|
|
- alert: CortexRulerTooManyFailedPushes
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
100 * (
|
|
|
|
sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_failed_total[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
/
|
2021-07-03 03:24:59 +00:00
|
|
|
sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_total[1m]))
|
|
|
|
) > 1
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexRulerTooManyFailedQueries
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.
|
|
|
|
expr: |
|
|
|
|
100 * (
|
|
|
|
sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_failed_total[1m]))
|
|
|
|
/
|
|
|
|
sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_total[1m]))
|
|
|
|
) > 1
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexRulerMissedEvaluations
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} missed iterations for the rule group {{ $labels.rule_group }}.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2020-07-20 14:43:19 +00:00
|
|
|
sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
/
|
2020-07-20 14:43:19 +00:00
|
|
|
sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
> 0.01
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexRulerFailedRingCheck
|
|
|
|
annotations:
|
|
|
|
message: |
|
2021-07-03 03:24:59 +00:00
|
|
|
Cortex Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-05-05 03:31:47 +00:00
|
|
|
sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
> 0
|
2021-05-05 03:31:47 +00:00
|
|
|
for: 5m
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- name: gossip_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexGossipMembersMismatch
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex instance {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} sees incorrect number of gossip members.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
memberlist_client_cluster_members_count
|
|
|
|
!= on (cluster, namespace) group_left
|
2021-08-25 03:22:56 +00:00
|
|
|
sum by (cluster, namespace) (up{job=~".+/(admin-api|compactor|store-gateway|distributor|ingester.*|querier.*|cortex|ruler)"})
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2021-02-26 03:26:18 +00:00
|
|
|
- name: etcd_alerts
|
|
|
|
rules:
|
|
|
|
- alert: EtcdAllocatingTooMuchMemory
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.
|
2021-06-23 03:25:35 +00:00
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
container_memory_working_set_bytes{container="etcd"}
|
|
|
|
/
|
|
|
|
container_spec_memory_limit_bytes{container="etcd"}
|
|
|
|
) > 0.65
|
2021-02-26 03:26:18 +00:00
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: EtcdAllocatingTooMuchMemory
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.
|
|
|
|
expr: |
|
|
|
|
(
|
|
|
|
container_memory_working_set_bytes{container="etcd"}
|
|
|
|
/
|
|
|
|
container_spec_memory_limit_bytes{container="etcd"}
|
|
|
|
) > 0.8
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-08-26 03:23:05 +00:00
|
|
|
- name: alertmanager_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexAlertmanagerSyncConfigsFailing
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage.
|
|
|
|
expr: |
|
|
|
|
rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
|
|
|
|
for: 30m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexAlertmanagerRingCheckFailing
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring.
|
|
|
|
expr: |
|
|
|
|
rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
|
|
|
|
for: 10m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexAlertmanagerPartialStateMergeFailing
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica.
|
|
|
|
expr: |
|
|
|
|
rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
|
|
|
|
for: 10m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexAlertmanagerReplicationFailing
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas.
|
|
|
|
expr: |
|
|
|
|
rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
|
|
|
|
for: 10m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexAlertmanagerPersistStateFailing
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage.
|
|
|
|
expr: |
|
|
|
|
rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
|
|
|
|
for: 1h
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexAlertmanagerInitialSyncFailed
|
|
|
|
annotations:
|
|
|
|
message: |
|
|
|
|
Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up.
|
|
|
|
expr: |
|
|
|
|
increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- name: cortex_blocks_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexIngesterHasNotShippedBlocks
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not shipped any block in the last 4 hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
(min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
|
2020-08-12 17:33:36 +00:00
|
|
|
and
|
2021-07-03 03:24:59 +00:00
|
|
|
(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
|
2020-07-01 09:17:43 +00:00
|
|
|
and
|
2021-05-28 04:45:45 +00:00
|
|
|
# Only if the ingester has ingested samples over the last 4h.
|
2021-07-03 03:24:59 +00:00
|
|
|
(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
|
2021-05-28 04:45:45 +00:00
|
|
|
and
|
|
|
|
# Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
|
|
|
|
# had ingested samples in the past, then no traffic was received for a long period and then it starts
|
|
|
|
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
|
|
|
|
# samples, while a block shipping is expected within the next 4h.
|
2021-07-03 03:24:59 +00:00
|
|
|
(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterHasNotShippedBlocksSinceStart
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not shipped any block in the last 4 hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
(max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
|
2020-08-12 17:33:36 +00:00
|
|
|
and
|
2021-07-03 03:24:59 +00:00
|
|
|
(max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 4h
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-01-27 03:31:43 +00:00
|
|
|
- alert: CortexIngesterHasUnshippedBlocks
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has compacted a block {{ $value | humanizeDuration }}
|
|
|
|
ago but it hasn't been successfully uploaded to the storage yet.
|
2021-01-27 03:31:43 +00:00
|
|
|
expr: |
|
|
|
|
(time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600)
|
|
|
|
and
|
|
|
|
(cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0)
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-08-12 17:33:36 +00:00
|
|
|
- alert: CortexIngesterTSDBHeadCompactionFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to compact TSDB head.
|
2020-08-12 17:33:36 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
|
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-11-11 03:16:57 +00:00
|
|
|
- alert: CortexIngesterTSDBHeadTruncationFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to truncate TSDB head.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterTSDBCheckpointCreationFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to create TSDB checkpoint.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterTSDBCheckpointDeletionFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to delete TSDB checkpoint.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterTSDBWALTruncationFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to truncate TSDB WAL.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexIngesterTSDBWALCorrupted
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} got a corrupted TSDB WAL.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexIngesterTSDBWALWritesFailed
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} is failing to write to TSDB WAL.
|
2020-11-11 03:16:57 +00:00
|
|
|
expr: |
|
|
|
|
rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
|
|
|
|
for: 3m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- alert: CortexQuerierHasNotScanTheBucket
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Querier {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
|
|
|
|
}} has not successfully scanned the bucket in the last {{ $value | humanizeDuration
|
|
|
|
}}.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
(time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
|
|
|
|
and
|
|
|
|
cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexQuerierHighRefetchRate
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Queries in {{ $labels.cluster }}/{{ $labels.namespace }} are
|
|
|
|
refetching series from different store-gateways (because of missing blocks)
|
|
|
|
for {{ printf "%.0f" $value }}% of queries.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
100 * (
|
|
|
|
(
|
2021-07-03 03:24:59 +00:00
|
|
|
sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
-
|
2021-07-03 03:24:59 +00:00
|
|
|
sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
)
|
|
|
|
/
|
2021-07-03 03:24:59 +00:00
|
|
|
sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
|
2020-07-01 09:17:43 +00:00
|
|
|
)
|
|
|
|
> 1
|
|
|
|
for: 10m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: CortexStoreGatewayHasNotSyncTheBucket
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Store Gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not successfully synced the bucket in the last {{ $value
|
|
|
|
| humanizeDuration }}.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
|
|
|
(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
|
|
|
|
and
|
|
|
|
cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-01-06 03:45:40 +00:00
|
|
|
- alert: CortexBucketIndexNotUpdated
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.cluster
|
|
|
|
}}/{{ $labels.namespace }} has not been updated in the last {{ $value | humanizeDuration
|
|
|
|
}}.
|
2021-01-06 03:45:40 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
|
2021-01-06 03:45:40 +00:00
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: CortexTenantHasPartialBlocks
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace
|
|
|
|
}} has {{ $value }} partial blocks.
|
2021-01-06 03:45:40 +00:00
|
|
|
expr: |
|
2021-07-03 03:24:59 +00:00
|
|
|
max by(cluster, namespace, user) (cortex_bucket_blocks_partials_count) > 0
|
2021-01-06 03:45:40 +00:00
|
|
|
for: 6h
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2020-07-01 09:17:43 +00:00
|
|
|
- name: cortex_compactor_alerts
|
|
|
|
rules:
|
|
|
|
- alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not successfully cleaned up blocks in the last 6
|
|
|
|
hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-04-21 03:34:15 +00:00
|
|
|
(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6)
|
|
|
|
for: 1h
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-04-21 03:34:15 +00:00
|
|
|
- alert: CortexCompactorHasNotSuccessfullyRunCompaction
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not run compaction in the last 24 hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-04-22 03:34:19 +00:00
|
|
|
(time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24)
|
|
|
|
and
|
|
|
|
(cortex_compactor_last_successful_run_timestamp_seconds > 0)
|
2021-04-21 03:34:15 +00:00
|
|
|
for: 1h
|
2020-07-01 09:17:43 +00:00
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-04-22 03:34:19 +00:00
|
|
|
- alert: CortexCompactorHasNotSuccessfullyRunCompaction
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not run compaction in the last 24 hours.
|
2021-04-22 03:34:19 +00:00
|
|
|
expr: |
|
|
|
|
cortex_compactor_last_successful_run_timestamp_seconds == 0
|
|
|
|
for: 24h
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-06-22 03:27:15 +00:00
|
|
|
- alert: CortexCompactorHasNotSuccessfullyRunCompaction
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} failed to run 2 consecutive compactions.
|
2021-06-22 03:27:15 +00:00
|
|
|
expr: |
|
|
|
|
increase(cortex_compactor_runs_failed_total[2h]) >= 2
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2020-07-01 09:17:43 +00:00
|
|
|
- alert: CortexCompactorHasNotUploadedBlocks
|
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not uploaded any block in the last 24 hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-04-03 03:33:49 +00:00
|
|
|
(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} > 60 * 60 * 24)
|
2020-07-01 09:17:43 +00:00
|
|
|
and
|
2021-04-03 03:33:49 +00:00
|
|
|
(thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} > 0)
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 15m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2021-06-22 03:27:15 +00:00
|
|
|
- alert: CortexCompactorHasNotUploadedBlocks
|
2020-07-01 09:17:43 +00:00
|
|
|
annotations:
|
2021-07-03 03:24:59 +00:00
|
|
|
message: Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{
|
|
|
|
$labels.namespace }} has not uploaded any block in the last 24 hours.
|
2020-07-01 09:17:43 +00:00
|
|
|
expr: |
|
2021-04-03 03:33:49 +00:00
|
|
|
thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} == 0
|
2020-07-01 09:17:43 +00:00
|
|
|
for: 24h
|
|
|
|
labels:
|
|
|
|
severity: critical
|