
kube-prometheus: generate manifests without kubectl

kubectl's `--dry-run` still requires a reachable Kubernetes apiserver, which is
unnecessary just to generate these manifests. Generating them with plain shell
scripts also allows further customization, such as adding labels to the
generated manifests.
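
The replacement pattern, sketched below with a hypothetical assets/example directory, is what the scripts in this commit do: print the manifest scaffolding from a heredoc, then splice each asset file in as an indented block scalar. No apiserver is involved, and extra fields such as labels fit straight into the template.

#!/bin/bash
# Was: kubectl create configmap example --dry-run --from-file=assets/example/ -oyaml
# Now: render the same ConfigMap locally, with room for extra fields.
cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: example
  labels:
    role: example-files   # the kind of customization --dry-run could not easily add
data:
EOF
for f in assets/example/*
do
  echo "  $(basename "$f"): |+"
  sed "s/^/    /" "$f"
done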
Frederic Branczyk 2017-03-10 14:15:10 +01:00
parent 6c891c9fe3
commit 21ebb87254
No known key found for this signature in database
GPG key ID: CA14788B1E48B256
11 changed files with 242 additions and 115 deletions

View file

@@ -281,6 +281,10 @@ spec:
  serviceMonitorSelector:
    matchExpression:
    - {key: k8s-apps, operator: Exists}
+  ruleSelector:
+    matchLabels:
+      role: prometheus-rulefiles
+      prometheus: k8s
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi

View file

@@ -29,7 +29,7 @@ ALERT HighNumberOfFailedHTTPRequests
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
-  IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
+  IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
    / sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
  FOR 5m
  LABELS {

View file

@@ -32,6 +32,12 @@ kctl apply -f manifests/prometheus/prometheus-k8s-service.yaml
kctl apply -f manifests/alertmanager/alertmanager-config.yaml
kctl apply -f manifests/alertmanager/alertmanager-service.yaml
+# unfortunately statefulsets cannot be changed except for their replica count
+# so we need to make sure that the rule files are created before we create the
+# prometheus resource so it can properly discover the rule files when creating
+# the statefulset
+sleep 5
# `kubectl apply` is currently not working for third party resources so we are
# using `kubectl create` here for the time being.
# (https://github.com/kubernetes/kubernetes/issues/29542)
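
A side note on the `kubectl create` workaround: `create` is not idempotent, so re-running the setup script errors once the third party resources exist. A tolerant sketch, with an assumed manifest path, and assuming `kubectl replace` handles the resource (the three-way patch of `apply` is what issue 29542 breaks):

#!/bin/bash
# Create the TPR instance; if it already exists, fall back to replacing it.
kubectl create -f manifests/prometheus/prometheus-k8s.yaml 2>/dev/null \
  || kubectl replace -f manifests/prometheus/prometheus-k8s.yaml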

View file

@@ -0,0 +1,11 @@
#!/bin/bash
cat <<-EOF
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: $(cat assets/alertmanager/alertmanager.yaml | base64 --wrap=0)
EOF
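
One portability caveat: `--wrap=0` is GNU coreutils syntax, and the BSD/macOS `base64` rejects it. A portable equivalent (a sketch, not what the script uses) strips the newlines instead:

# Portable stand-in for `base64 --wrap=0`: emit base64, then delete newlines.
base64 < assets/alertmanager/alertmanager.yaml | tr -d '\n'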

View file

@@ -0,0 +1,15 @@
#!/bin/bash
cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
EOF

for f in assets/grafana/*
do
  echo "  $(basename $f): |+"
  cat $f | sed "s/^/    /g"
done
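
The glob-and-indent loop is the core trick: each file becomes a `|+` block scalar keyed by its basename, with the content indented four spaces. If a dashboard file name ever contained spaces the unquoted expansions would break; a defensive variant (same output for the current assets) quotes them:

for f in assets/grafana/*
do
  # Quote "$f" so paths with spaces survive word splitting.
  echo "  $(basename "$f"): |+"
  sed "s/^/    /" "$f"
done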

View file

@@ -1,11 +1,11 @@
#!/bin/bash
# Generate Alert Rules ConfigMap
-kubectl create configmap --dry-run=true prometheus-k8s-rules --from-file=assets/prometheus/rules/ -oyaml > manifests/prometheus/prometheus-k8s-rules.yaml
+hack/scripts/generate-rules-configmap.sh > manifests/prometheus/prometheus-k8s-rules.yaml
# Generate Dashboard ConfigMap
-kubectl create configmap --dry-run=true grafana-dashboards --from-file=assets/grafana/ -oyaml > manifests/grafana/grafana-dashboards.yaml
+hack/scripts/generate-dashboards-configmap.sh > manifests/grafana/grafana-dashboards.yaml
# Generate Secret for Alertmanager config
-kubectl create secret generic alertmanager-main --dry-run --from-file=assets/alertmanager/alertmanager.yaml -oyaml > manifests/alertmanager/alertmanager-config.yaml
+hack/scripts/generate-alertmanager-config-secret.sh > manifests/alertmanager/alertmanager-config.yaml
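
Since kubectl no longer validates the output, a cheap sanity check before committing is to confirm each generated file still parses as YAML; a sketch assuming a Python interpreter with PyYAML is available:

# Fail if the generated manifest is not valid YAML (requires PyYAML).
python -c 'import sys, yaml; yaml.safe_load(sys.stdin)' < manifests/prometheus/prometheus-k8s-rules.yaml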

View file

@@ -0,0 +1,18 @@
#!/bin/bash
cat <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
EOF

for f in assets/prometheus/rules/*.rules
do
  echo "  $(basename $f): |+"
  cat $f | sed "s/^/    /g"
done
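
The labels written here are the contract with the Prometheus resource: its new `ruleSelector` (see the prometheus-k8s manifests in this commit) matches `role: prometheus-rulefiles` and `prometheus: k8s`, so the generated ConfigMap is discovered automatically. A quick check that the output carries them:

# The labels: block should list role: prometheus-rulefiles and prometheus: k8s.
hack/scripts/generate-rules-configmap.sh | grep -A 2 'labels:'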

View file

@@ -1,7 +1,6 @@
apiVersion: v1
-data:
-  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==
kind: Secret
metadata:
-  creationTimestamp: null
  name: alertmanager-main
+data:
+  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnd2ViaG9vaycKcmVjZWl2ZXJzOgotIG5hbWU6ICd3ZWJob29rJwogIHdlYmhvb2tfY29uZmlnczoKICAtIHVybDogJ2h0dHA6Ly9hbGVydG1hbmFnZXJ3aDozMDUwMC8nCg==

View file

@@ -1,6 +1,9 @@
apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
data:
-  all-nodes-dashboard.json: |
+  all-nodes-dashboard.json: |+
    {
      "dashboard":
      {
@@ -861,7 +864,7 @@ data:
      ],
      "overwrite": true
    }
-  deployment-dashboard.json: |-
+  deployment-dashboard.json: |+
    {
      "dashboard": {
        "__inputs": [
@@ -1678,8 +1681,7 @@ data:
        }
      ],
      "overwrite": true
    }
-  kubernetes-pods-dashboard.json: |
+  kubernetes-pods-dashboard.json: |+
    {
      "dashboard": {
        "__inputs": [
@@ -2089,7 +2091,7 @@ data:
      ],
      "overwrite": true
    }
-  node-dashboard.json: |
+  node-dashboard.json: |+
    {
      "dashboard":
      {
@@ -2970,7 +2972,7 @@ data:
      ],
      "overwrite": true
    }
-  prometheus-datasource.json: |
+  prometheus-datasource.json: |+
    {
      "access": "proxy",
      "basicAuth": false,
@@ -2978,7 +2980,7 @@ data:
      "type": "prometheus",
      "url": "http://prometheus-k8s.monitoring.svc:9090"
    }
-  resource-requests-dashboard.json: |-
+  resource-requests-dashboard.json: |+
    {
      "__inputs": [
        {
@@ -3402,8 +3404,4 @@ data:
      "timezone": "browser",
      "title": "Resource Requests",
      "version": 1
-    }
-kind: ConfigMap
-metadata:
-  creationTimestamp: null
-  name: grafana-dashboards
+    }

View file

@@ -1,62 +1,138 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
etcd2.rules: "### General cluster availability ###\n\n# alert if another failed
peer will result in an unavailable cluster\nALERT InsufficientPeers\n IF count(up{job=\"etcd-k8s\"}
== 0) > (count(up{job=\"etcd-k8s\"}) / 2 - 1)\n FOR 3m\n LABELS {\n severity
= \"critical\"\n }\n ANNOTATIONS {\n summary = \"Etcd cluster small\",\n
\ description = \"If one more etcd peer goes down the cluster will be unavailable\",\n
\ }\n\n### HTTP requests alerts ###\n\n# alert if more than 1% of requests to
an HTTP endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))\n
\ / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m])) >
0.01\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if more than 5% of requests to an HTTP
endpoint have failed with a non 4xx response\nALERT HighNumberOfFailedHTTPRequests\n
\ IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\", code!~\"4[0-9]{2}\"}[5m]))
\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 0.05\n FOR 5m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}\",\n }\n\n# alert if 50% of requests get a 4xx response\nALERT
HighNumberOfFailedHTTPRequests\n IF sum by(method) (rate(etcd_http_failed_total{job=\"etcd-k8s\",
code=~\"4[0-9]{2}\"}[5m]))\n / sum by(method) (rate(etcd_http_received_total{job=\"etcd-k8s\"}[5m]))
> 0.5\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"a high number of HTTP requests are failing\",\n description
= \"{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses
on etcd instance {{ $labels.instance }}\",\n }\n\n# alert if the 99th percentile
of HTTP requests take more than 150ms\nALERT HTTPRequestsSlow\n IF histogram_quantile(0.99,
rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"slow HTTP
requests\",\n description = \"on ectd instance {{ $labels.instance }} HTTP
requests to {{ $label.method }} are slow\",\n }\n\n### File descriptor alerts
###\n\ninstance:fd_utilization = process_open_fds / process_max_fds\n\n# alert
if file descriptors are likely to exhaust within the next 4 hours\nALERT FdExhaustionClose\n
\ IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1\n FOR 10m\n LABELS
{\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary = \"file descriptors
soon exhausted\",\n description = \"{{ $labels.job }} instance {{ $labels.instance
}} will exhaust in file descriptors soon\",\n }\n\n# alert if file descriptors
are likely to exhaust within the next hour\nALERT FdExhaustionClose\n IF predict_linear(instance:fd_utilization[10m],
3600) > 1\n FOR 10m\n LABELS {\n severity = \"critical\"\n }\n ANNOTATIONS
{\n summary = \"file descriptors soon exhausted\",\n description = \"{{
$labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors
soon\",\n }\n\n### etcd proposal alerts ###\n\n# alert if there are several failed
proposals within an hour\nALERT HighNumberOfFailedProposals\n IF increase(etcd_server_proposal_failed_total{job=\"etcd\"}[1h])
> 5\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n summary
= \"a high number of failed proposals within the etcd cluster are happening\",\n
\ description = \"etcd instance {{ $labels.instance }} has seen {{ $value }}
proposal failures within the last hour\",\n }\n\n### etcd disk io latency alerts
###\n\n# alert if 99th percentile of fsync durations is higher than 500ms\nALERT
HighFsyncDurations\n IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
> 0.5\n FOR 10m\n LABELS {\n severity = \"warning\"\n }\n ANNOTATIONS {\n
\ summary = \"high fsync durations\",\n description = \"ectd instance {{
$labels.instance }} fync durations are high\",\n }\n"
etcd2.rules: |+
### General cluster availability ###
# alert if another failed peer will result in an unavailable cluster
ALERT InsufficientPeers
IF count(up{job="etcd-k8s"} == 0) > (count(up{job="etcd-k8s"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Etcd cluster small",
description = "If one more etcd peer goes down the cluster will be unavailable",
}
### HTTP requests alerts ###
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code!~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
}
# alert if 50% of requests get a 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="etcd-k8s", code=~"4[0-9]{2}"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job="etcd-k8s"}[5m])) > 0.5
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
}
# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on ectd instance {{ $labels.instance }} HTTP requests to {{ $label.method }} are slow",
}
### File descriptor alerts ###
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust in file descriptors soon",
}
### etcd proposal alerts ###
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of failed proposals within the etcd cluster are happening",
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
}
### etcd disk io latency alerts ###
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "ectd instance {{ $labels.instance }} fync durations are high",
}
kubernetes.rules: |+
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### Container resources ###
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -65,7 +141,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:spec_cpu_shares =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -74,7 +150,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:cpu_usage:rate =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -85,7 +161,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_usage:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -94,7 +170,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_working_set:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -103,7 +179,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_rss:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -112,7 +188,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_cache:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -121,7 +197,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:disk_usage:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
@@ -130,7 +206,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_pagefaults:rate =
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
@@ -141,7 +217,7 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_oom:rate =
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
@@ -152,39 +228,39 @@ data:
"pod_name", "^(.*)-[a-z0-9]+"
)
)
### Cluster resources ###
cluster:memory_allocation:percent =
100 * sum by (cluster) (
container_spec_memory_limit_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
cluster:memory_used:percent =
100 * sum by (cluster) (
container_memory_usage_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
cluster:cpu_allocation:percent =
100 * sum by (cluster) (
container_spec_cpu_shares{pod_name!=""}
) / sum by (cluster) (
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
)
cluster:node_cpu_use:percent =
100 * sum by (cluster) (
rate(node_cpu{mode!="idle"}[5m])
) / sum by (cluster) (
machine_cpu_cores
)
### API latency ###
# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(
@@ -201,30 +277,30 @@ data:
0.5,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
### Scheduling latency ###
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
ALERT K8SNodeDown
IF up{job="kubelet"} == 0
FOR 1h
@@ -236,7 +312,7 @@ data:
summary = "Kubelet cannot be scraped",
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour",
}
ALERT K8SNodeNotReady
IF kube_node_status_ready{condition="true"} == 0
FOR 1h
@@ -248,7 +324,7 @@ data:
summary = "Node status is NotReady",
description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
}
ALERT K8SManyNodesNotReady
IF
count by (cluster) (kube_node_status_ready{condition="true"} == 0) > 1
@@ -267,7 +343,7 @@ data:
summary = "Many K8s nodes are Not Ready",
description = "{{ $value }} K8s nodes (more than 10% of cluster {{ $labels.cluster }}) are in the NotReady state.",
}
ALERT K8SKubeletNodeExporterDown
IF up{job="node-exporter"} == 0
FOR 15m
@@ -279,7 +355,7 @@ data:
summary = "Kubelet node_exporter cannot be scraped",
description = "Prometheus could not scrape a {{ $labels.job }} for more than one hour.",
}
ALERT K8SKubeletDown
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
FOR 1h
@@ -291,7 +367,7 @@ data:
summary = "Many Kubelets cannot be scraped",
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
}
ALERT K8SApiserverDown
IF up{job="kubernetes"} == 0
FOR 15m
@@ -303,7 +379,7 @@ data:
summary = "API server unreachable",
description = "An API server could not be scraped.",
}
# Disable for non HA kubernetes setups.
ALERT K8SApiserverDown
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
@@ -316,7 +392,7 @@ data:
summary = "API server unreachable",
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
}
ALERT K8SSchedulerDown
IF absent(up{job="kube-scheduler"}) or (count by(cluster) (up{job="kube-scheduler"} == 1) == 0)
FOR 5m
@@ -328,7 +404,7 @@ data:
summary = "Scheduler is down",
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
}
ALERT K8SControllerManagerDown
IF absent(up{job="kube-controller-manager"}) or (count by(cluster) (up{job="kube-controller-manager"} == 1) == 0)
FOR 5m
@@ -340,7 +416,7 @@ data:
summary = "Controller manager is down",
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
FOR 10m
@@ -352,7 +428,7 @@ data:
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
LABELS {
@@ -363,7 +439,7 @@ data:
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{ $value }}% full.",
}
# To catch the conntrack sysctl de-tuning when it happens
ALERT K8SConntrackTuningMissing
IF node_nf_conntrack_udp_timeout > 10
@@ -376,7 +452,7 @@ data:
summary = "Node does not have the correct conntrack tunings",
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 50
FOR 10m
@@ -388,7 +464,7 @@ data:
summary = "{{ $labels.job }} has too many open file descriptors",
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelet|kubernetes"} / process_max_fds > 80
FOR 10m
@@ -400,7 +476,7 @@ data:
summary = "{{ $labels.job }} has too many open file descriptors",
description = "{{ $labels.node }} is using {{ $value }}% of the available file/socket descriptors.",
}
# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SApiServerLatency
@@ -417,7 +493,7 @@ data:
summary = "Kubernetes apiserver latency is high",
description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
}
ALERT K8SApiServerEtcdAccessLatency
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
FOR 15m
@@ -429,7 +505,7 @@ data:
summary = "Access to etcd is slow",
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
}
ALERT K8SKubeletTooManyPods
IF kubelet_running_pod_count > 100
LABELS {
@@ -440,8 +516,4 @@ data:
summary = "Kubelet is close to pod limit",
description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
}
kind: ConfigMap
metadata:
creationTimestamp: null
name: prometheus-k8s-rules

View file

@@ -10,6 +10,10 @@ spec:
  serviceMonitorSelector:
    matchExpression:
    - {key: k8s-apps, operator: Exists}
+  ruleSelector:
+    matchLabels:
+      role: prometheus-rulefiles
+      prometheus: k8s
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi