2020-06-08 17:34:52 +00:00
groups :
2021-07-09 03:25:28 +00:00
- name : kube-apiserver-burnrate.rules
2020-06-08 17:34:52 +00:00
rules :
# Read (LIST|GET) burn rate over 1d per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate1d
# Read (LIST|GET) burn rate over 1h per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate1h
# Read (LIST|GET) burn rate over 2h per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate2h
# Read (LIST|GET) burn rate over 30m per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate30m
# Read (LIST|GET) burn rate over 3d per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate3d
# Read (LIST|GET) burn rate over 5m per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate5m
# Read (LIST|GET) burn rate over 6h per cluster:
# (requests slower than their scope threshold + 5xx responses) / all requests.
# Thresholds used below: le="1" for scope "resource" or empty, le="5" for "namespace", le="30" for "cluster".
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
2020-06-08 17:34:52 +00:00
)
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:burnrate6h
# Write (POST|PUT|PATCH|DELETE) burn rate over 1d per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate1d
# Write (POST|PUT|PATCH|DELETE) burn rate over 1h per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate1h
# Write (POST|PUT|PATCH|DELETE) burn rate over 2h per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate2h
# Write (POST|PUT|PATCH|DELETE) burn rate over 30m per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate30m
# Write (POST|PUT|PATCH|DELETE) burn rate over 3d per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate3d
# Write (POST|PUT|PATCH|DELETE) burn rate over 5m per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate5m
# Write (POST|PUT|PATCH|DELETE) burn rate over 6h per cluster:
# (requests not counted in the le="1" bucket + 5xx responses) / all requests.
- expr : |
(
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
2020-06-08 17:34:52 +00:00
)
+
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:burnrate6h
2021-07-09 03:25:28 +00:00
- name : kube-apiserver-histogram.rules
rules :
2020-06-08 17:34:52 +00:00
# 0.99 quantile of LIST|GET request duration per (cluster, resource), from the 5m bucket rate.
# The trailing "> 0" drops series whose quantile is not positive.
- expr : |
2022-04-05 03:34:06 +00:00
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
2020-06-08 17:34:52 +00:00
labels :
quantile : "0.99"
verb : read
2022-04-05 03:34:06 +00:00
record : cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
2020-06-08 17:34:52 +00:00
# 0.99 quantile of POST|PUT|PATCH|DELETE request duration per (cluster, resource), from the 5m bucket rate.
# The trailing "> 0" drops series whose quantile is not positive.
- expr : |
2022-04-05 03:34:06 +00:00
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
2020-06-08 17:34:52 +00:00
labels :
quantile : "0.99"
verb : write
2022-04-05 03:34:06 +00:00
record : cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile
2020-06-08 17:34:52 +00:00
- interval : 3m
name : kube-apiserver-availability.rules
rules :
2021-07-09 03:25:28 +00:00
# 30d request increase, extrapolated from the hourly increase: average 1h increase over 30d, times 24 * 30.
- expr : |
avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record : code_verb:apiserver_request_total:increase30d
# 30d increase of LIST|GET requests per (cluster, code), labelled verb="read".
- expr : |
sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels :
verb : read
record : code:apiserver_request_total:increase30d
# 30d increase of POST|PUT|PATCH|DELETE requests per (cluster, code), labelled verb="write".
- expr : |
sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels :
verb : write
record : code:apiserver_request_total:increase30d
2021-08-20 03:23:50 +00:00
# 1h increase of the SLO duration request count per (cluster, verb, scope).
- expr : |
2022-04-05 03:34:06 +00:00
sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h]))
record : cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h
2021-08-20 03:23:50 +00:00
# 30d increase of the same count: average of the 1h increase over 30d, times 24 * 30.
- expr : |
2022-04-05 03:34:06 +00:00
sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30)
record : cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d
2021-08-20 03:23:50 +00:00
# 1h increase of the SLO duration buckets per (cluster, verb, scope, le).
- expr : |
2022-04-05 03:34:06 +00:00
sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))
record : cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h
2021-08-20 03:23:50 +00:00
# 30d increase of the same buckets: average of the 1h increase over 30d, times 24 * 30.
- expr : |
2022-04-05 03:34:06 +00:00
sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
record : cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d
2020-06-08 17:34:52 +00:00
# 30d availability over all verbs, per cluster:
# 1 - (slow writes + slow reads + 5xx responses) / all requests, using the 30d increase series.
# Slow writes are those outside the le="1" bucket; slow reads use le="1" / "5" / "30" by scope.
- expr : |
1 - (
(
# write too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
2020-06-08 17:34:52 +00:00
) +
(
# read too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"})
2020-06-08 17:34:52 +00:00
-
(
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
2020-06-08 17:34:52 +00:00
)
) +
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d)
2020-06-08 17:34:52 +00:00
labels :
verb : all
record : apiserver_request:availability30d
# 30d availability of read (LIST|GET) requests, per cluster:
# 1 - (slow reads + read 5xx responses) / all read requests, using the 30d increase series.
# Slow reads are those outside the per-scope bucket: le="1" resource/empty, le="5" namespace, le="30" cluster.
- expr : |
1 - (
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"})
2020-06-08 17:34:52 +00:00
-
(
# too slow
2020-07-03 12:42:21 +00:00
(
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
2020-07-03 12:42:21 +00:00
or
vector(0)
)
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
2020-07-03 12:42:21 +00:00
+
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
2020-06-08 17:34:52 +00:00
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
2020-06-08 17:34:52 +00:00
labels :
verb : read
record : apiserver_request:availability30d
# 30d availability of write (POST|PUT|PATCH|DELETE) requests, per cluster:
# 1 - (writes outside the le="1" bucket + write 5xx responses) / all write requests, using the 30d increase series.
- expr : |
1 - (
(
# too slow
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
2020-06-08 17:34:52 +00:00
-
2022-04-05 03:34:06 +00:00
sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
2020-06-08 17:34:52 +00:00
)
+
# errors
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
2020-06-08 17:34:52 +00:00
)
/
2021-04-22 03:34:19 +00:00
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
2020-06-08 17:34:52 +00:00
labels :
verb : write
record : apiserver_request:availability30d
# 5m request rate of LIST|GET requests per (cluster, code, resource), labelled verb="read".
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels :
verb : read
record : code_resource:apiserver_request_total:rate5m
2020-06-08 17:34:52 +00:00
# 5m request rate of POST|PUT|PATCH|DELETE requests per (cluster, code, resource), labelled verb="write".
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster,code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels :
verb : write
record : code_resource:apiserver_request_total:rate5m
2020-06-08 17:34:52 +00:00
# 1h request increase per (cluster, code, verb), recorded separately for each status-code class
# (2xx, 3xx, 4xx, 5xx) under the same record name.
# 2xx responses.
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
2021-01-12 03:55:11 +00:00
record : code_verb:apiserver_request_total:increase1h
2020-06-08 17:34:52 +00:00
# 3xx responses.
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
2021-01-12 03:55:11 +00:00
record : code_verb:apiserver_request_total:increase1h
2020-06-08 17:34:52 +00:00
# 4xx responses.
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
2021-01-12 03:55:11 +00:00
record : code_verb:apiserver_request_total:increase1h
# 5xx responses.
- expr : |
2021-07-09 03:25:28 +00:00
sum by (cluster, code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
2021-01-12 03:55:11 +00:00
record : code_verb:apiserver_request_total:increase1h
2020-06-08 17:34:52 +00:00
- name : k8s.rules
rules :
# Per-container CPU usage (irate over 5m) summed by (cluster, namespace, pod, container),
# with the node label attached from kube_pod_info via a (cluster, namespace, pod) join.
# topk(1, ...) keeps a single kube_pod_info series per pod for the many-to-one match.
- expr : |
sum by (cluster, namespace, pod, container) (
2021-06-11 03:32:12 +00:00
irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])
2020-06-08 17:34:52 +00:00
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1 , max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
2021-06-11 03:32:12 +00:00
record : node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
2020-06-08 17:34:52 +00:00
# container_memory_working_set_bytes per container, with the node label
# attached from kube_pod_info.
# The join is keyed on (cluster, namespace, pod) so that pods sharing a
# namespace/name in different clusters are not matched against each other;
# topk(1, ...) keeps a single kube_pod_info series per pod so the many-to-one
# match cannot fail on duplicates.
- expr: |
    container_memory_working_set_bytes{job="cadvisor", image!=""}
    * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
      max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
    )
  record: node_namespace_pod_container:container_memory_working_set_bytes
# container_memory_rss per container, with the node label attached from
# kube_pod_info.
# The join is keyed on (cluster, namespace, pod) so that pods sharing a
# namespace/name in different clusters are not matched against each other;
# topk(1, ...) keeps a single kube_pod_info series per pod so the many-to-one
# match cannot fail on duplicates.
- expr: |
    container_memory_rss{job="cadvisor", image!=""}
    * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
      max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
    )
  record: node_namespace_pod_container:container_memory_rss
# container_memory_cache per container, with the node label attached from
# kube_pod_info.
# The join is keyed on (cluster, namespace, pod) so that pods sharing a
# namespace/name in different clusters are not matched against each other;
# topk(1, ...) keeps a single kube_pod_info series per pod so the many-to-one
# match cannot fail on duplicates.
- expr: |
    container_memory_cache{job="cadvisor", image!=""}
    * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
      max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
    )
  record: node_namespace_pod_container:container_memory_cache
# container_memory_swap per container, with the node label attached from
# kube_pod_info.
# The join is keyed on (cluster, namespace, pod) so that pods sharing a
# namespace/name in different clusters are not matched against each other;
# topk(1, ...) keeps a single kube_pod_info series per pod so the many-to-one
# match cannot fail on duplicates.
- expr: |
    container_memory_swap{job="cadvisor", image!=""}
    * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
      max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
    )
  record: node_namespace_pod_container:container_memory_swap
2021-07-17 03:25:17 +00:00
# Memory requests per container, kept only for pods whose phase is Pending or Running
# (the right-hand side is 1 for such pods, so the multiplication acts as a filter).
- expr : |
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
2021-11-20 03:23:44 +00:00
group_left() max by (namespace, pod, cluster) (
2021-07-17 03:25:17 +00:00
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record : cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
2020-06-08 17:34:52 +00:00
# Memory requests summed per (namespace, cluster) over Pending/Running pods.
# The inner max by (..., container, ...) collapses duplicate series for the same container before summing.
- expr : |
2021-03-05 03:27:06 +00:00
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
2021-03-26 04:18:04 +00:00
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
2021-07-29 03:25:46 +00:00
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
2021-03-26 04:18:04 +00:00
kube_pod_status_phase{phase=~"Pending|Running"} == 1
2020-06-08 17:34:52 +00:00
)
)
)
2021-03-26 04:18:04 +00:00
record : namespace_memory:kube_pod_container_resource_requests:sum
2021-07-17 03:25:17 +00:00
# CPU requests per container, kept only for pods whose phase is Pending or Running
# (the right-hand side is 1 for such pods, so the multiplication acts as a filter).
- expr : |
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
2021-11-20 03:23:44 +00:00
group_left() max by (namespace, pod, cluster) (
2021-07-17 03:25:17 +00:00
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record : cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
2020-06-08 17:34:52 +00:00
# CPU requests summed per (namespace, cluster) over Pending/Running pods.
# The inner max by (..., container, ...) collapses duplicate series for the same container before summing.
- expr : |
2021-03-05 03:27:06 +00:00
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
2021-03-26 04:18:04 +00:00
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
2021-07-29 03:25:46 +00:00
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
2020-06-08 17:34:52 +00:00
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
2021-03-26 04:18:04 +00:00
record : namespace_cpu:kube_pod_container_resource_requests:sum
2021-07-17 03:25:17 +00:00
# Memory limits per container, kept only for pods whose phase is Pending or Running
# (the right-hand side is 1 for such pods, so the multiplication acts as a filter).
- expr : |
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
2021-11-20 03:23:44 +00:00
group_left() max by (namespace, pod, cluster) (
2021-07-17 03:25:17 +00:00
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record : cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
2021-06-23 03:25:35 +00:00
# Memory limits summed per (namespace, cluster) over Pending/Running pods.
# The inner max by (..., container, ...) collapses duplicate series for the same container before summing.
- expr : |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
2021-07-29 03:25:46 +00:00
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
2021-06-23 03:25:35 +00:00
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record : namespace_memory:kube_pod_container_resource_limits:sum
2021-07-17 03:25:17 +00:00
# CPU limits per container, kept only for pods whose phase is Pending or Running
# (the right-hand side is 1 for such pods, so the multiplication acts as a filter).
- expr : |
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
2021-11-20 03:23:44 +00:00
group_left() max by (namespace, pod, cluster) (
2021-07-17 03:25:17 +00:00
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record : cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
2021-06-23 03:25:35 +00:00
# CPU limits summed per (namespace, cluster) over Pending/Running pods.
# The inner max by (..., container, ...) collapses duplicate series for the same container before summing.
- expr : |
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
2021-07-29 03:25:46 +00:00
) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
2021-06-23 03:25:35 +00:00
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record : namespace_cpu:kube_pod_container_resource_limits:sum
2020-06-08 17:34:52 +00:00
# Maps pods owned by a ReplicaSet to the owner of that ReplicaSet, exposed as
# the `workload` label with workload_type="deployment".
# Steps: copy the pod's owner_name into `replicaset`, join against
# kube_replicaset_owner to replace owner_name with the ReplicaSet's owner, then
# copy that into `workload`.
# The join is keyed on cluster as well as (replicaset, namespace) so that
# identically named ReplicaSets in different clusters are kept apart, matching
# the outer aggregation, which is already per cluster.
- expr: |
    max by (cluster, namespace, workload, pod) (
      label_replace(
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
          "replicaset", "$1", "owner_name", "(.*)"
        ) * on(cluster, replicaset, namespace) group_left(owner_name) topk by(cluster, replicaset, namespace) (
          1, max by (cluster, replicaset, namespace, owner_name) (
            kube_replicaset_owner{job="kube-state-metrics"}
          )
        ),
        "workload", "$1", "owner_name", "(.*)"
      )
    )
  labels:
    workload_type: deployment
  record: namespace_workload_pod:kube_pod_owner:relabel
2020-06-08 17:34:52 +00:00
# Pods owned by a DaemonSet: owner_name is copied into the `workload` label, workload_type="daemonset".
- expr : |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload" , "$1" , "owner_name" , "(.*)"
)
)
labels :
workload_type : daemonset
2020-07-20 14:43:19 +00:00
record : namespace_workload_pod:kube_pod_owner:relabel
2020-06-08 17:34:52 +00:00
# Pods owned by a StatefulSet: owner_name is copied into the `workload` label, workload_type="statefulset".
- expr : |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload" , "$1" , "owner_name" , "(.*)"
)
)
labels :
workload_type : statefulset
2020-07-20 14:43:19 +00:00
record : namespace_workload_pod:kube_pod_owner:relabel
2022-01-08 03:25:55 +00:00
# Pods owned by a Job: owner_name is copied into the `workload` label, workload_type="job".
- expr : |
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload" , "$1" , "owner_name" , "(.*)"
)
)
labels :
workload_type : job
record : namespace_workload_pod:kube_pod_owner:relabel
2020-06-08 17:34:52 +00:00
- name : kube-scheduler.rules
rules :
# 0.99 quantiles of kube-scheduler latency histograms, from 5m bucket rates
# summed across everything except the instance and pod labels.
# End-to-end scheduling duration.
- expr : |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.99"
record : cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
# Scheduling algorithm duration.
- expr : |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.99"
record : cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
# Binding duration.
- expr : |
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.99"
record : cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
# 0.9 quantiles of kube-scheduler latency histograms, from 5m bucket rates
# summed across everything except the instance and pod labels.
# End-to-end scheduling duration.
- expr : |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.9"
record : cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
# Scheduling algorithm duration.
- expr : |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.9"
record : cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
# Binding duration.
- expr : |
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.9"
record : cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
# 0.5 quantiles (medians) of kube-scheduler latency histograms, from 5m bucket rates
# summed across everything except the instance and pod labels.
# End-to-end scheduling duration.
- expr : |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.5"
record : cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
# Scheduling algorithm duration.
- expr : |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.5"
record : cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
# Binding duration.
- expr : |
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels :
quantile : "0.5"
record : cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name : node.rules
rules :
# One kube_pod_info-derived series per (cluster, namespace, pod), carrying the node label.
# max by (...) reduces kube_pod_info to the four listed labels; topk(1, ...) then keeps a
# single series per pod if more than one node value is present.
# The record name ends with a colon and is therefore quoted.
- expr : |
2022-03-31 03:34:08 +00:00
topk by(cluster, namespace, pod) (1,
max by (cluster, node, namespace, pod) (
2020-06-08 17:34:52 +00:00
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record : 'node_namespace_pod:kube_pod_info:'
# Number of CPUs per node: counts the distinct (node, cpu) pairs found in
# node_cpu_seconds_total after the node label has been attached from
# node_namespace_pod:kube_pod_info: through a pod-level join.
# `cluster` is carried through both the join and the inner aggregation; if the
# inner sum dropped it, the outer `count by (cluster, node)` would never see a
# cluster label and nodes with the same name in different clusters would be
# merged.
- expr: |
    count by (cluster, node) (sum by (cluster, node, cpu) (
      node_cpu_seconds_total{job="node-exporter"}
      * on (cluster, namespace, pod) group_left(node)
      topk by(cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
    ))
  record: node:node_num_cpu:sum
# Available memory summed per cluster. Uses node_memory_MemAvailable_bytes
# where that series exists and otherwise falls back to
# Buffers + Cached + MemFree + Slab for the same label set.
# The record name begins with a colon, so it is quoted to keep it a single
# YAML scalar (an unquoted value starting with ": " is not a valid plain scalar,
# and a metric name cannot contain a space).
- expr: |
    sum(
      node_memory_MemAvailable_bytes{job="node-exporter"} or
      (
        node_memory_Buffers_bytes{job="node-exporter"} +
        node_memory_Cached_bytes{job="node-exporter"} +
        node_memory_MemFree_bytes{job="node-exporter"} +
        node_memory_Slab_bytes{job="node-exporter"}
      )
    ) by (cluster)
  record: ':node_memory_MemAvailable_bytes:sum'
2022-02-24 03:22:40 +00:00
# CPU utilisation ratio per cluster over 5m: the rate of CPU seconds spent in
# any mode other than idle, iowait or steal, divided by the number of distinct
# (instance, cpu) pairs.
# Both numerator and denominator aggregate `by (cluster)` so that the ratio is
# computed per cluster instead of being collapsed into one value across all
# clusters; with no cluster label present the result is the same single series
# as before.
- expr: |
    sum(rate(node_cpu_seconds_total{job="node-exporter",mode!="idle",mode!="iowait",mode!="steal"}[5m])) by (cluster) /
    count(sum(node_cpu_seconds_total{job="node-exporter"}) by (cluster, instance, cpu)) by (cluster)
  record: cluster:node_cpu:ratio_rate5m
2020-06-08 17:34:52 +00:00
- name : kubelet.rules
rules :
# Quantiles of kubelet_pleg_relist_duration_seconds per (cluster, instance), from the 5m bucket rate.
# Each is joined with kubelet_node_name on (cluster, instance) to attach the node label.
# 0.99 quantile.
- expr : |
2022-02-04 03:19:35 +00:00
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
2020-06-08 17:34:52 +00:00
labels :
quantile : "0.99"
record : node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
# 0.9 quantile.
- expr : |
2022-02-04 03:19:35 +00:00
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
2020-06-08 17:34:52 +00:00
labels :
quantile : "0.9"
record : node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
# 0.5 quantile.
- expr : |
2022-02-04 03:19:35 +00:00
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
2020-06-08 17:34:52 +00:00
labels :
quantile : "0.5"
record : node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile