# Prometheus alerting rule groups for Cortex.
groups:
# Core Cortex alerts: ring health, request and cache error rates, latency,
# config reloads, query capacity and ingester lifecycle.
- name: cortex_alerts
  rules:
  # Minimum unhealthy-ingester count seen per cluster/namespace stays above 0.
  - alert: CortexIngesterUnhealthy
    annotations:
      message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).'
    expr: |
      min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0
    for: 15m
    labels:
      severity: critical
  # Share of 5xx responses per job/route, scaled to percent; threshold 1%.
  - alert: CortexRequestErrors
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
    expr: |
      100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m]))
      /
      sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count[1m]))
      > 1
    for: 15m
    labels:
      severity: warning
  # Reads a pre-computed 99th percentile series; the "metrics" and
  # frontend Process routes are excluded by the route matcher.
  - alert: CortexRequestLatency
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
    expr: |
      cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"}
      >
      2.5
    for: 15m
    labels:
      severity: warning
  # Share of non-2xx table-manager syncs, scaled to percent; threshold 10%.
  - alert: CortexTableSyncFailure
    annotations:
      message: |
        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
    expr: |
      100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
      /
      rate(cortex_table_manager_sync_duration_seconds_count[15m])
      > 10
    for: 30m
    labels:
      severity: critical
  # Share of failing test-exporter cases, scaled to percent; threshold 1%.
  - alert: CortexQueriesIncorrect
    annotations:
      message: |
        Incorrect results for {{ printf "%.2f" $value }}% of queries.
    expr: |
      100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m]))
      /
      sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1
    for: 15m
    labels:
      severity: warning
  # Counts distinct sha256 values per cluster/namespace/job; more than one
  # distinct hash triggers the alert.
  - alert: CortexInconsistentConfig
    annotations:
      message: |
        An inconsistent config file hash is used across cluster {{ $labels.job }}.
    expr: |
      count(count by(cluster, namespace, job, sha256) (cortex_config_hash)) without(sha256) > 1
    for: 1h
    labels:
      severity: warning
  # Either of the two "last reload successful" gauges reporting 0.
  - alert: CortexBadRuntimeConfig
    annotations:
      message: |
        {{ $labels.job }} failed to reload runtime config.
    expr: |
      cortex_runtime_config_last_reload_successful == 0
      or
      cortex_overrides_last_reload_successful == 0
    for: 15m
    labels:
      severity: warning
  # Concurrent-query limit minus current queries equals zero.
  - alert: CortexQuerierCapacityFull
    annotations:
      message: |
        {{ $labels.job }} is at capacity processing queries.
    expr: |
      prometheus_engine_queries_concurrent_max{job=~".+/(cortex|ruler|querier)"} - prometheus_engine_queries{job=~".+/(cortex|ruler|querier)"} == 0
    for: 5m
    labels:
      severity: critical
  # Summed query-frontend queue length per cluster/namespace above 1.
  - alert: CortexFrontendQueriesStuck
    annotations:
      message: |
        There are {{ $value }} queued up queries.
    expr: |
      sum by (cluster, namespace) (cortex_query_frontend_queue_length) > 1
    for: 5m
    labels:
      severity: critical
  # Share of 5xx cache requests per method, scaled to percent; threshold 1%.
  - alert: CortexCacheRequestErrors
    annotations:
      message: |
        Cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors.
    expr: |
      100 * sum by (cluster, namespace, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m]))
      /
      sum by (cluster, namespace, method) (rate(cortex_cache_request_duration_seconds_count[1m]))
      > 1
    for: 15m
    labels:
      severity: warning
  # No `for`: fires as soon as more than one change of the process start
  # time is observed within the 30m window.
  - alert: CortexIngesterRestarts
    annotations:
      message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.'
    expr: |
      changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1
    labels:
      severity: critical
  # No comparison operator: any series matched by the selector
  # (op="transfer" with a non-success status) makes the rule active.
  - alert: CortexTransferFailed
    annotations:
      message: '{{ $labels.job }}/{{ $labels.instance }} transfer failed.'
    expr: |
      max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
    for: 5m
    labels:
      severity: critical
  # 36000 s = 10 h. The second clause limits the alert to series whose
  # timestamp is non-zero.
  - alert: CortexOldChunkInMemory
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
    expr: |
      (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
      and
      (cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
    for: 5m
    labels:
      severity: warning
# Ingester WAL / checkpoint alerts. None of these rules has a `for` clause,
# so each fires on the first evaluation where its expression returns a sample.
# The checkpoint alerts come in pairs sharing a name: a short-window warning
# (> 0 failures) and a long-window critical (> 1 failure).
- name: cortex_wal_alerts
  rules:
  # Any increase of the corruption counter over 5m.
  - alert: CortexWALCorruption
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
    expr: |
      increase(cortex_ingester_wal_corruptions_total[5m]) > 0
    labels:
      severity: critical
  # Warning: at least one failed checkpoint creation in the last 10m.
  - alert: CortexCheckpointCreationFailed
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
    expr: |
      increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
    labels:
      severity: warning
  # Critical: more than one failed checkpoint creation in the last 1h.
  - alert: CortexCheckpointCreationFailed
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
    expr: |
      increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
    labels:
      severity: critical
  # Warning: at least one failed checkpoint deletion in the last 10m.
  - alert: CortexCheckpointDeletionFailed
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
    expr: |
      increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
    labels:
      severity: warning
  # Critical: more than one failed checkpoint deletion in the last 2h.
  # The message carries the same job/instance prefix as the other rules in
  # this group.
  - alert: CortexCheckpointDeletionFailed
    annotations:
      message: |
        {{ $labels.job }}/{{ $labels.instance }} is failing to delete checkpoint.
    expr: |
      increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
    labels:
      severity: critical
# Capacity / provisioning alerts for memcached and ingesters.
- name: cortex-provisioning
  rules:
  # Left side: 4 x (in-memory series x average chunk size), in GB.
  # Right side: total memcached limit, in GB. A PromQL comparison returns the
  # left-hand value, which is what the message prints as the required size.
  - alert: CortexProvisioningMemcachedTooSmall
    annotations:
      message: |
        Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB.
    expr: |
      (
      4 *
      sum by (cluster, namespace) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
      / 1e9
      )
      >
      (
      sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
      )
    for: 15m
    labels:
      severity: warning
  # Average series per ingester above 1.6e6, and only while the
  # received-chunks rate over the last hour is zero.
  - alert: CortexProvisioningTooManyActiveSeries
    annotations:
      message: |
        Too many active series for ingesters, add more ingesters.
    expr: |
      avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
      and
      sum by (cluster, namespace) (rate(cortex_ingester_received_chunks[1h])) == 0
    for: 1h
    labels:
      severity: warning
  # Average ingested samples per second per ingester above 80e3.
  - alert: CortexProvisioningTooManyWrites
    annotations:
      message: |
        High QPS for ingesters, add more ingesters.
    expr: |
      avg by (cluster, namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
    for: 15m
    labels:
      severity: warning
  # Warning tier: working set above 50% of the container memory limit.
  - alert: CortexAllocatingTooMuchMemory
    annotations:
      message: |
        Too much memory being used by {{ $labels.instance }} - add more ingesters.
    expr: |
      (
      container_memory_working_set_bytes{container_name="ingester"}
      /
      container_spec_memory_limit_bytes{container_name="ingester"}
      ) > 0.5
    for: 15m
    labels:
      severity: warning
  # Critical tier: same ratio, threshold raised to 80%.
  - alert: CortexAllocatingTooMuchMemory
    annotations:
      message: |
        Too much memory being used by {{ $labels.instance }} - add more ingesters.
    expr: |
      (
      container_memory_working_set_bytes{container_name="ingester"}
      /
      container_spec_memory_limit_bytes{container_name="ingester"}
      ) > 0.8
    for: 15m
    labels:
      severity: critical
# Cortex ruler alerts.
- name: ruler_alerts
  rules:
  # Failed / total rule evaluations per ruler instance and rule group.
  # The ratio is scaled by 100 so that $value is a percentage, matching the
  # "%" printed in the message; "> 1" is the same threshold as a raw ratio
  # of 0.01.
  - alert: CortexRulerFailedEvaluations
    annotations:
      message: |
        Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors for the rule group {{ $labels.rule_group }}.
    expr: |
      100 * sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m]))
      /
      sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m]))
      > 1
    for: 5m
    labels:
      severity: warning
  # Missed / total rule-group iterations per ruler instance and rule group,
  # scaled to percent for the same reason as above.
  - alert: CortexRulerMissedEvaluations
    annotations:
      message: |
        Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}.
    expr: |
      100 * sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m]))
      /
      sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m]))
      > 1
    for: 5m
    labels:
      severity: warning
  # Any ring-check errors. `job` is kept in the grouping because the message
  # template reads $labels.job. $value is a per-second error rate, not a
  # percentage, so the message reports it as errors/s.
  - alert: CortexRulerFailedRingCheck
    annotations:
      message: |
        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} errors/s when checking the ring for rule group ownership.
    expr: |
      sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[5m]))
      > 0
    for: 1m
    labels:
      severity: critical
# Memberlist (gossip) alerts.
- name: gossip_alerts
  rules:
  # Compares each member's view of the cluster size with the number of "up"
  # targets among the listed jobs in the same cluster/namespace
  # (many-to-one match via group_left).
  - alert: CortexGossipMembersMismatch
    annotations:
      message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.'
    expr: |
      memberlist_client_cluster_members_count
      != on (cluster, namespace) group_left
      sum by (cluster, namespace) (up{job=~".+/(distributor|ingester|querier|cortex|ruler)"})
    for: 5m
    labels:
      severity: warning
# Alerts for block shipping, TSDB head compaction, querier bucket scans and
# store-gateway bucket syncs.
- name: cortex_blocks_alerts
  rules:
  # Last successful upload is more than 4h old (60 * 60 * 4 s), the upload
  # timestamp is non-zero, and the ingester ingested samples in the last 4h.
  - alert: CortexIngesterHasNotShippedBlocks
    annotations:
      message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.'
    expr: |
      (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4)
      and
      (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 0)
      and
      (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
    for: 15m
    labels:
      severity: critical
  # Upload timestamp is still 0 while samples are being ingested; the 4h
  # window comes from the `for` clause.
  - alert: CortexIngesterHasNotShippedBlocksSinceStart
    annotations:
      message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.'
    expr: |
      (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0)
      and
      (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
    for: 4h
    labels:
      severity: critical
  # Non-zero rate of failed TSDB compactions.
  - alert: CortexIngesterTSDBHeadCompactionFailed
    annotations:
      message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.'
    expr: |
      rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
    for: 15m
    labels:
      severity: critical
  # Last successful scan is more than 30 min old (60 * 30 s) and the
  # timestamp is non-zero. $value is the elapsed time in seconds, which the
  # message renders through humanizeDuration.
  - alert: CortexQuerierHasNotScanTheBucket
    annotations:
      message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.'
    expr: |
      (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
      and
      cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0
    for: 5m
    labels:
      severity: critical
  # Percentage of queries with at least one refetch: (all observations minus
  # those in the le="0" bucket) divided by all observations, times 100.
  - alert: CortexQuerierHighRefetchRate
    annotations:
      message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.'
    expr: |
      100 * (
      (
      sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
      -
      sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0"}[5m]))
      )
      /
      sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
      )
      > 1
    for: 10m
    labels:
      severity: warning
  # Last successful sync is more than 30 min old (60 * 30 s) and the
  # timestamp is non-zero; restricted to component="store-gateway".
  - alert: CortexStoreGatewayHasNotSyncTheBucket
    annotations:
      message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.'
    expr: |
      (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
      and
      cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
    for: 5m
    labels:
      severity: critical
# Compactor alerts. Each condition has two variants: one for a stale but
# non-zero timestamp (older than 60 * 60 * 24 s = 24h), and a "SinceStart"
# one for a timestamp that has stayed at 0 for 24h.
- name: cortex_compactor_alerts
  rules:
  # Last successful cleanup run is more than 24h old and non-zero.
  - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
    annotations:
      message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.'
    expr: |
      (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24)
      and
      (cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0)
    for: 15m
    labels:
      severity: critical
  # Cleanup timestamp still 0; the 24h window comes from the `for` clause.
  - alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart
    annotations:
      message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.'
    expr: |
      cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0
    for: 24h
    labels:
      severity: critical
  # Last successful upload by a compactor job is more than 24h old and
  # non-zero.
  - alert: CortexCompactorHasNotUploadedBlocks
    annotations:
      message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.'
    expr: |
      (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24)
      and
      (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 0)
    for: 15m
    labels:
      severity: critical
  # Upload timestamp still 0; the 24h window comes from the `for` clause.
  - alert: CortexCompactorHasNotUploadedBlocksSinceStart
    annotations:
      message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.'
    expr: |
      thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0
    for: 24h
    labels:
      severity: critical