mirror of
https://github.com/prometheus-operator/prometheus-operator.git
synced 2025-04-21 11:48:53 +00:00
kube-prometheus: generate manifests without kubectl
For `--dry-run` to work, kubectl still contacts a Kubernetes cluster's apiserver, which is unnecessary for generating these manifests. Generating the manifests with plain shell scripts instead also allows further customization, such as adding labels to the generated manifests.
This commit is contained in:
parent
6c891c9fe3
commit
21ebb87254
11 changed files with 242 additions and 115 deletions
Documentation/user-guides
contrib/kube-prometheus
assets/prometheus/rules
hack
cluster-monitoring
scripts
manifests
alertmanager
grafana
prometheus
|
@ -281,6 +281,10 @@ spec:
|
|||
# Select every ServiceMonitor that carries a `k8s-apps` label, whatever its value.
serviceMonitorSelector:
  # The set-based label selector field is spelled `matchExpressions` (plural).
  matchExpressions:
  - {key: k8s-apps, operator: Exists}
|
||||
ruleSelector:
|
||||
matchLabels:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
resources:
|
||||
requests:
|
||||
# 2Gi is default, but won't schedule if you don't have a node with >2Gi
|
||||
|
|
|
@ -29,7 +29,7 @@ ALERT HighNumberOfFailedHTTPRequests
|
|||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
|
|
|
@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml
|
|||
kctl apply -f manifests/alertmanager/alertmanager-config.yaml
|
||||
kctl apply -f manifests/alertmanager/alertmanager-service.yaml
|
||||
|
||||
# unfortunately statefulsets cannot be changed except for their replica count
|
||||
# so we need to make sure that the rule files are created before we create the
|
||||
# prometheus resource so it can properly discover the rule files when creating
|
||||
# the statefulset
|
||||
sleep 5
|
||||
|
||||
# `kubectl apply` is currently not working for third party resources so we are
|
||||
# using `kubectl create` here for the time being.
|
||||
# (https://github.com/kubernetes/kubernetes/issues/29542)
|
||||
|
|
11
contrib/kube-prometheus/hack/scripts/generate-alertmanager-config-secret.sh
Executable file
11
contrib/kube-prometheus/hack/scripts/generate-alertmanager-config-secret.sh
Executable file
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
#
# Prints a Kubernetes Secret manifest named "alertmanager-main" to stdout.
# Its single data key, alertmanager.yaml, holds the base64-encoded contents of
# assets/alertmanager/alertmanager.yaml. The path is relative, so run this
# script from the directory that contains assets/.

config_file=assets/alertmanager/alertmanager.yaml

# Abort instead of emitting a Secret with an empty value when the
# configuration file cannot be read.
if [ ! -r "$config_file" ]; then
  echo "cannot read $config_file" >&2
  exit 1
fi

# `--wrap=0` only exists in GNU base64; deleting the newlines with tr gives the
# same single-line output with any base64 implementation.
encoded_config=$(base64 < "$config_file" | tr -d '\n')

# The heredoc body is indented with spaces, which `<<-` (it strips tabs only)
# leaves untouched, so the YAML nesting is emitted exactly as written.
cat <<-EOF
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: $encoded_config
EOF
|
||||
|
15
contrib/kube-prometheus/hack/scripts/generate-dashboards-configmap.sh
Executable file
15
contrib/kube-prometheus/hack/scripts/generate-dashboards-configmap.sh
Executable file
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
#
# Prints a ConfigMap manifest named "grafana-dashboards" to stdout. Every file
# in assets/grafana/ becomes one data entry: the key is the file's base name
# and the value is the file content as a YAML block scalar. The path is
# relative, so run this script from the directory that contains assets/.

# The heredoc body is indented with spaces, which `<<-` (it strips tabs only)
# leaves untouched.
cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
EOF

for f in assets/grafana/*
do
  # An unmatched glob stays as the literal pattern; skip anything that is not
  # a regular file instead of letting the read below fail.
  [ -f "$f" ] || continue

  # Keys sit two spaces under `data:`; `|+` keeps the trailing newlines.
  echo "  $(basename "$f"): |+"

  # Indent the content four spaces so it nests below its key. awk ends every
  # record with a newline, so a file without a final newline can no longer
  # glue the next key onto its last line.
  awk '{ print "    " $0 }' < "$f"
done
|
|
@ -1,11 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Generate Alert Rules ConfigMap
|
||||
kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml
|
||||
hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml
|
||||
|
||||
# Generate Dashboard ConfigMap
|
||||
kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml
|
||||
hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml
|
||||
|
||||
# Generate Secret for Alertmanager config
|
||||
kubectl create secret generic alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml
|
||||
hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml
|
||||
|
||||
|
|
18
contrib/kube-prometheus/hack/scripts/generate-rules-configmap.sh
Executable file
18
contrib/kube-prometheus/hack/scripts/generate-rules-configmap.sh
Executable file
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
#
# Prints a ConfigMap manifest named "prometheus-k8s-rules" to stdout, labelled
# role=prometheus-rulefiles and prometheus=k8s. Every *.rules file in
# assets/prometheus/rules/ becomes one data entry: the key is the file's base
# name and the value is the file content as a YAML block scalar. The path is
# relative, so run this script from the directory that contains assets/.

# The heredoc body is indented with spaces, which `<<-` (it strips tabs only)
# leaves untouched.
cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
EOF

for f in assets/prometheus/rules/*.rules
do
  # An unmatched glob stays as the literal pattern; skip anything that is not
  # a regular file instead of letting the read below fail.
  [ -f "$f" ] || continue

  # Keys sit two spaces under `data:`; `|+` keeps the trailing newlines.
  echo "  $(basename "$f"): |+"

  # Indent the content four spaces so it nests below its key. awk ends every
  # record with a newline, so a rule file without a final newline can no
  # longer glue the next key onto its last line.
  awk '{ print "    " $0 }' < "$f"
done
|
|
@ -1,7 +1,6 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
|
||||
kind: Secret
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: alertmanager-main
|
||||
data:
|
||||
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboards
|
||||
data:
|
||||
all-nodes-dashboard.json: |
|
||||
all-nodes-dashboard.json: |+
|
||||
{
|
||||
"dashboard":
|
||||
{
|
||||
|
@ -861,7 +864,7 @@ data:
|
|||
],
|
||||
"overwrite": true
|
||||
}
|
||||
deployment-dashboard.json: |-
|
||||
deployment-dashboard.json: |+
|
||||
{
|
||||
"dashboard": {
|
||||
"__inputs": [
|
||||
|
@ -1678,8 +1681,7 @@ data:
|
|||
}
|
||||
],
|
||||
"overwrite": true
|
||||
}
|
||||
kubernetes-pods-dashboard.json: |
|
||||
} kubernetes-pods-dashboard.json: |+
|
||||
{
|
||||
"dashboard": {
|
||||
"__inputs": [
|
||||
|
@ -2089,7 +2091,7 @@ data:
|
|||
],
|
||||
"overwrite": true
|
||||
}
|
||||
node-dashboard.json: |
|
||||
node-dashboard.json: |+
|
||||
{
|
||||
"dashboard":
|
||||
{
|
||||
|
@ -2970,7 +2972,7 @@ data:
|
|||
],
|
||||
"overwrite": true
|
||||
}
|
||||
prometheus-datasource.json: |
|
||||
prometheus-datasource.json: |+
|
||||
{
|
||||
"access": "proxy",
|
||||
"basicAuth": false,
|
||||
|
@ -2978,7 +2980,7 @@ data:
|
|||
"type": "prometheus",
|
||||
"url": "http://prometheus-k8s.monitoring.svc:9090"
|
||||
}
|
||||
resource-requests-dashboard.json: |-
|
||||
resource-requests-dashboard.json: |+
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
|
@ -3402,8 +3404,4 @@ data:
|
|||
"timezone": "browser",
|
||||
"title": "Resource Requests",
|
||||
"version": 1
|
||||
}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: grafana-dashboards
|
||||
}
|
|
@ -1,62 +1,138 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: prometheus-k8s-rules
|
||||
labels:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
data:
|
||||
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
|
||||
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
|
||||
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
|
||||
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
|
||||
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
|
||||
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
|
||||
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
|
||||
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
|
||||
0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||
\ summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
|
||||
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
|
||||
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
|
||||
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||
> 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
|
||||
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
|
||||
HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
|
||||
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
|
||||
> 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"a high number of HTTP requests are failing\",\n description
|
||||
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
|
||||
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
|
||||
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
|
||||
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
|
||||
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
|
||||
requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP
|
||||
requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts
|
||||
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
|
||||
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
|
||||
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
|
||||
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
|
||||
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
|
||||
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
|
||||
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
|
||||
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
|
||||
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
|
||||
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
|
||||
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
|
||||
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h])
|
||||
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
|
||||
= \"a high number of failed proposals within the etcd cluster are happening\",\n
|
||||
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
|
||||
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
|
||||
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
|
||||
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
|
||||
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
|
||||
\ summary = \"high fsync durations\",\n description = \"ectd instance {{
|
||||
$labels.instance }} fync durations are high\",\n }\n"
|
||||
etcd2.rules: |+
|
||||
### General cluster availability ###
|
||||
|
||||
# alert if another failed peer will result in an unavailable cluster
|
||||
ALERT InsufficientPeers
|
||||
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Etcd cluster small",
|
||||
description = "If one more etcd peer goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
### HTTP requests alerts ###
|
||||
|
||||
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if 50% of requests get a 4xx response
|
||||
ALERT HighNumberOfFailedHTTPRequests
|
||||
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
|
||||
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of HTTP requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of HTTP requests take more than 150ms
# (the histogram is recorded in seconds, hence the 0.15 threshold; the bucket
# series of etcd's HTTP duration histogram is ..._duration_seconds_bucket)
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
|
||||
|
||||
### File descriptor alerts ###
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
### etcd proposal alerts ###
|
||||
|
||||
# alert if there are several failed proposals within an hour
# (selects job "etcd-k8s", the job name the other job-scoped etcd rules in
# this file use)
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd-k8s"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of failed proposals within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
|
||||
|
||||
### etcd disk io latency alerts ###
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
# (the histogram is recorded in seconds, hence the 0.5 threshold)
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
|
||||
kubernetes.rules: |+
|
||||
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
||||
|
||||
|
||||
### Container resources ###
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -65,7 +141,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:spec_cpu_shares =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -74,7 +150,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:cpu_usage:rate =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -85,7 +161,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_usage:bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -94,7 +170,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_working_set:bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -103,7 +179,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_rss:bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -112,7 +188,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_cache:bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -121,7 +197,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:disk_usage:bytes =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name) (
|
||||
label_replace(
|
||||
|
@ -130,7 +206,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_pagefaults:rate =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
|
||||
label_replace(
|
||||
|
@ -141,7 +217,7 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cluster_namespace_controller_pod_container:memory_oom:rate =
|
||||
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
|
||||
label_replace(
|
||||
|
@ -152,39 +228,39 @@ data:
|
|||
"pod_name", "^(.*)-[a-z0-9]+"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
### Cluster resources ###
|
||||
|
||||
|
||||
cluster:memory_allocation:percent =
|
||||
100 * sum by (cluster) (
|
||||
container_spec_memory_limit_bytes{pod_name!=""}
|
||||
) / sum by (cluster) (
|
||||
machine_memory_bytes
|
||||
)
|
||||
|
||||
|
||||
cluster:memory_used:percent =
|
||||
100 * sum by (cluster) (
|
||||
container_memory_usage_bytes{pod_name!=""}
|
||||
) / sum by (cluster) (
|
||||
machine_memory_bytes
|
||||
)
|
||||
|
||||
|
||||
cluster:cpu_allocation:percent =
|
||||
100 * sum by (cluster) (
|
||||
container_spec_cpu_shares{pod_name!=""}
|
||||
) / sum by (cluster) (
|
||||
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
|
||||
)
|
||||
|
||||
|
||||
cluster:node_cpu_use:percent =
|
||||
100 * sum by (cluster) (
|
||||
rate(node_cpu{mode!="idle"}[5m])
|
||||
) / sum by (cluster) (
|
||||
machine_cpu_cores
|
||||
)
|
||||
|
||||
|
||||
### API latency ###
|
||||
|
||||
|
||||
# Raw metrics are in microseconds. Convert to seconds.
|
||||
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
|
||||
histogram_quantile(
|
||||
|
@ -201,30 +277,30 @@ data:
|
|||
0.5,
|
||||
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
|
||||
) / 1e6
|
||||
|
||||
|
||||
### Scheduling latency ###
|
||||
|
||||
|
||||
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
|
||||
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
|
||||
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
|
||||
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
|
||||
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
|
||||
|
||||
|
||||
ALERT K8SNodeDown
|
||||
IF up{job="kubelet"} == 0
|
||||
FOR 1h
|
||||
|
@ -236,7 +312,7 @@ data:
|
|||
summary = "Kubelet cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SNodeNotReady
|
||||
IF kube_node_status_ready{condition="true"} == 0
|
||||
FOR 1h
|
||||
|
@ -248,7 +324,7 @@ data:
|
|||
summary = "Node status is NotReady",
|
||||
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SManyNodesNotReady
|
||||
IF
|
||||
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
|
||||
|
@ -267,7 +343,7 @@ data:
|
|||
summary = "Many K8s nodes are Not Ready",
|
||||
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SKubeletNodeExporterDown
|
||||
IF up{job="node-exporter"} == 0
|
||||
FOR 15m
|
||||
|
@ -279,7 +355,7 @@ data:
|
|||
summary = "Kubelet node_exporter cannot be scraped",
|
||||
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SKubeletDown
|
||||
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
|
||||
FOR 1h
|
||||
|
@ -291,7 +367,7 @@ data:
|
|||
summary = "Many Kubelets cannot be scraped",
|
||||
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SApiserverDown
|
||||
IF up{job="kubernetes"} == 0
|
||||
FOR 15m
|
||||
|
@ -303,7 +379,7 @@ data:
|
|||
summary = "API server unreachable",
|
||||
description = "An API server could not be scraped.",
|
||||
}
|
||||
|
||||
|
||||
# Disable for non HA kubernetes setups.
|
||||
ALERT K8SApiserverDown
|
||||
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
|
||||
|
@ -316,7 +392,7 @@ data:
|
|||
summary = "API server unreachable",
|
||||
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SSchedulerDown
|
||||
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
|
||||
FOR 5m
|
||||
|
@ -328,7 +404,7 @@ data:
|
|||
summary = "Scheduler is down",
|
||||
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SControllerManagerDown
|
||||
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
|
||||
FOR 5m
|
||||
|
@ -340,7 +416,7 @@ data:
|
|||
summary = "Controller manager is down",
|
||||
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
|
||||
FOR 10m
|
||||
|
@ -352,7 +428,7 @@ data:
|
|||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SConntrackTableFull
|
||||
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
|
||||
LABELS {
|
||||
|
@ -363,7 +439,7 @@ data:
|
|||
summary = "Number of tracked connections is near the limit",
|
||||
description = "The nf_conntrack table is {{ $value }}% full.",
|
||||
}
|
||||
|
||||
|
||||
# To catch the conntrack sysctl de-tuning when it happens
|
||||
ALERT K8SConntrackTuningMissing
|
||||
IF node_nf_conntrack_udp_timeout > 10
|
||||
|
@ -376,7 +452,7 @@ data:
|
|||
summary = "Node does not have the correct conntrack tunings",
|
||||
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
|
@ -388,7 +464,7 @@ data:
|
|||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
|
@ -400,7 +476,7 @@ data:
|
|||
summary = "{{ $labels.job }} has too many open file descriptors",
|
||||
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
|
||||
# Some verbs excluded because they are expected to be long-lasting:
|
||||
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
|
||||
ALERT K8SApiServerLatency
|
||||
|
@ -417,7 +493,7 @@ data:
|
|||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SApiServerEtcdAccessLatency
|
||||
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
|
||||
FOR 15m
|
||||
|
@ -429,7 +505,7 @@ data:
|
|||
summary = "Access to etcd is slow",
|
||||
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
|
||||
}
|
||||
|
||||
|
||||
ALERT K8SKubeletTooManyPods
|
||||
IF kubelet_running_pod_count > 100
|
||||
LABELS {
|
||||
|
@ -440,8 +516,4 @@ data:
|
|||
summary = "Kubelet is close to pod limit",
|
||||
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
|
||||
}
|
||||
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: prometheus-k8s-rules
|
||||
|
||||
|
|
|
@ -10,6 +10,10 @@ spec:
|
|||
# Select every ServiceMonitor that carries a `k8s-apps` label, whatever its value.
serviceMonitorSelector:
  # The set-based label selector field is spelled `matchExpressions` (plural).
  matchExpressions:
  - {key: k8s-apps, operator: Exists}
|
||||
ruleSelector:
|
||||
matchLabels:
|
||||
role: prometheus-rulefiles
|
||||
prometheus: k8s
|
||||
resources:
|
||||
requests:
|
||||
# 2Gi is default, but won't schedule if you don't have a node with >2Gi
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue