1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
monitoring-mixins-website/assets/cortex/rules.yaml

698 lines
39 KiB
YAML
Raw Permalink Normal View History

groups:
- name: cortex_api_1
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m]))
by (cluster, job)
record: cluster_job:cortex_request_duration_seconds:avg
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
- name: cortex_api_2
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
- name: cortex_api_3
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate
- name: cortex_querier_api
rules:
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by
(cluster, job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
job, route)
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m]))
by (cluster, namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
- expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
- name: cortex_cache
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
by (cluster, job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job,
method)
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
- name: cortex_storage
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
job, operation) / sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
by (cluster, job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
job, operation) / sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
by (cluster, job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le,
cluster, job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
operation) / sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by
(cluster, job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
job) / sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster,
job)
record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
by (cluster, job) / sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
by (le, cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
by (cluster, job) / sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
by (le, cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
by (cluster, job)
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
/ sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
- expr: sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
- expr: sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)
record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
by (le, cluster, job, method))
record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
job, method) / sum(rate(cortex_database_request_duration_seconds_count[1m]))
by (cluster, job, method)
record: cluster_job_method:cortex_database_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster,
job, method)
record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
by (le, cluster, job, operation))
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
operation) / sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster,
job, operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster,
job, operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job,
operation)
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
/ sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster,
job)
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
- expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
- name: cortex_queries
rules:
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_retries:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m]))
by (cluster, job)
record: cluster_job:cortex_query_frontend_retries:avg
- expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
- expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
- expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by
(cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le,
cluster, job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster,
job)
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_series:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_series:avg
- expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_chunks:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_chunks_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks:avg
- expr: sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:99quantile
2020-08-13 10:50:10 +00:00
- expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m]))
by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:50quantile
2020-08-13 10:50:10 +00:00
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m]))
by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples:avg
- expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate
- expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate
- expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate
- name: cortex_received_samples
rules:
- expr: |
sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m
- name: cortex_scaling_rules
rules:
- expr: |
sum by (cluster, namespace, deployment) (
label_replace(
kube_deployment_spec_replicas,
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
or
sum by (cluster, namespace, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
)
record: cluster_namespace_deployment:actual_replicas:count
- expr: |
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
/ 240000
)
labels:
deployment: distributor
reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
* 0.59999999999999998 / 240000
)
labels:
deployment: distributor
reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
* 3 / 80000
)
labels:
deployment: ingester
reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
quantile_over_time(0.99,
sum by(cluster, namespace) (
cortex_ingester_memory_series
)[24h:]
)
/ 3000000
)
labels:
deployment: ingester
reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
* 3 * 0.59999999999999998 / 3000000
)
labels:
deployment: ingester
reason: active_series_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
* 0.59999999999999998 / 80000
)
labels:
deployment: ingester
reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
ceil(
(sum by (cluster, namespace) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
) / 4)
/
avg by (cluster, namespace) (
memcached_limit_bytes{job=~".+/memcached"}
)
)
labels:
deployment: memcached
reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate
- expr: |
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
#
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_cpu_cores was removed:
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_cpu_cores,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
or
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="cpu"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
- expr: |
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
)
labels:
reason: cpu_usage
record: cluster_namespace_deployment_reason:required_replicas:count
- expr: |
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
container_memory_usage_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
record: cluster_namespace_deployment:container_memory_usage_bytes:sum
- expr: |
# This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
# that remove resource metrics, ref:
# - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
# - https://github.com/kubernetes/kube-state-metrics/pull/1004
#
# This is the old expression, compatible with kube-state-metrics < v2.0.0,
# where kube_pod_container_resource_requests_memory_bytes was removed:
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests_memory_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
or
# This expression is compatible with kube-state-metrics >= v1.4.0,
# where kube_pod_container_resource_requests was introduced.
(
sum by (cluster, namespace, deployment) (
label_replace(
label_replace(
kube_pod_container_resource_requests{resource="memory"},
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
),
# The question mark in "(.*?)" is used to make it non-greedy, otherwise it
# always matches everything and the (optional) zone is not removed.
"deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
)
)
)
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
/
cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
)
labels:
reason: memory_usage
record: cluster_namespace_deployment_reason:required_replicas:count
- name: cortex_alertmanager_rules
rules:
- expr: |
sum by (cluster, job, pod) (cortex_alertmanager_alerts)
record: cluster_job_pod:cortex_alertmanager_alerts:sum
- expr: |
sum by (cluster, job, pod) (cortex_alertmanager_silences)
record: cluster_job_pod:cortex_alertmanager_silences:sum
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m
- expr: |
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m
- expr: |
sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m
- expr: |
sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m