Mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-16 09:16:38 +00:00)
This patch introduces a new Custom Resource Definition to the Prometheus Operator: the Rule CRD. It addresses two main needs:

1. Validation of Prometheus (alerting and recording) rules at creation time, via Kubernetes Custom Resource Definition validation.
2. Life-cycle management of an application's Prometheus rules alongside the application itself, in the application's Kubernetes namespace, which is not necessarily the namespace of the scraping Prometheus instance.

A user defines Prometheus alerting and recording rules via a Kubernetes Custom Resource Definition. These custom resources can be fully validated by the Kubernetes API server at creation time via automatically generated OpenAPI specifications. Instead of restricting a Prometheus instance to selecting Rule definitions only from its own namespace, the Prometheus specification is extended so that additional namespaces can be specified in which to look for Rule custom resources; an illustrative sketch follows the change list below.

---

Dependent technical changes:

- prometheus: Use github.com/jimmidyson/configmap-reload to reload rules.
- prometheus: Remove the Prometheus StatefulSet deletion function. Starting with K8s >= 1.8 this is handled via OwnerReferences.
- prometheus: Do not add the rule files checksum to the Prometheus configuration secret.
- prometheus: Update the StatefulSet only on relevant changes. Instead of updating the Prometheus StatefulSet on every `sync()` run, only update it if the input parameters to `makeStatefulSet` change. Enforce this via a checksum of the parameters saved in the StatefulSet's annotations.
- e2e/prometheus: Check how often resources (Secret, ConfigMap, Prometheus CRD, Service) are updated, to enforce that the Prometheus Operator only updates the resources it created when necessary.
- contrib/prometheus-config-reloader: Remove the logic for retrieving K8s ConfigMaps; these are now mounted into the pod directly.
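For illustration, a minimal sketch of the intended workflow, assuming the new CRD kind is named `PrometheusRule` and the Prometheus spec fields are `ruleSelector`/`ruleNamespaceSelector` (these names are assumptions based on later operator releases and may not match this patch exactly):

```yaml
# Hypothetical rule object, living in the application's namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule            # assumed kind name; this patch refers to it as the "Rule" CRD
metadata:
  name: example-app-rules
  namespace: example-app        # next to the application, not next to Prometheus
  labels:
    role: alert-rules
spec:
  groups:
  - name: example-app.rules
    rules:
    - alert: ExampleAppDown
      expr: 'up{job="example-app"} == 0'
      for: 5m
      labels:
        severity: warning
---
# Hypothetical Prometheus spec extension selecting rule objects across namespaces.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring
spec:
  ruleSelector:                 # assumed field name: match rule objects by label
    matchLabels:
      role: alert-rules
  ruleNamespaceSelector: {}     # assumed field name: empty selector = look in all namespaces
```

With the rule objects validated by the API server at creation time, they can be deployed and removed together with the application they monitor.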
YAML · 167 lines · 15 KiB
apiVersion: v1
data:
  all.rules.yaml: |
    "groups":
    - "name": "k8s.rules"
      "rules":
      - "expr": |
          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace)
        "record": "namespace:container_cpu_usage_seconds_total:sum_rate"
      - "expr": |
          sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace)
        "record": "namespace:container_memory_usage_bytes:sum"
      - "expr": |
          sum by (namespace, label_name) (
            sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
          )
        "record": "namespace_name:container_cpu_usage_seconds_total:sum_rate"
      - "expr": |
          sum by (namespace, label_name) (
            sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
          )
        "record": "namespace_name:container_memory_usage_bytes:sum"
      - "expr": |
          sum by (namespace, label_name) (
            sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
          )
        "record": "namespace_name:kube_pod_container_resource_requests_memory_bytes:sum"
      - "expr": |
          sum by (namespace, label_name) (
            sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
          )
        "record": "namespace_name:kube_pod_container_resource_requests_cpu_cores:sum"
    - "name": "node.rules"
      "rules":
      - "expr": "sum(min(kube_pod_info) by (node))"
        "record": ":kube_pod_info_node_count:"
      - "expr": |
          max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
        "record": "node_namespace_pod:kube_pod_info:"
      - "expr": |
          count by (node) (sum by (node, cpu) (
            node_cpu{job="node-exporter"}
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          ))
        "record": "node:node_num_cpu:sum"
      - "expr": |
          1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m]))
        "record": ":node_cpu_utilisation:avg1m"
      - "expr": |
          1 - avg by (node) (
            rate(node_cpu{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:)
        "record": "node:node_cpu_utilisation:avg1m"
      - "expr": |
          sum(node_load1{job="node-exporter"})
          /
          sum(node:node_num_cpu:sum)
        "record": ":node_cpu_saturation_load1:"
      - "expr": |
          sum by (node) (
            node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
          /
          node:node_num_cpu:sum
        "record": "node:node_cpu_saturation_load1:"
      - "expr": |
          1 -
          sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          /
          sum(node_memory_MemTotal{job="node-exporter"})
        "record": ":node_memory_utilisation:"
      - "expr": |
          sum by (node) (
            (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_memory_bytes_available:sum"
      - "expr": |
          sum by (node) (
            node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_memory_bytes_total:sum"
      - "expr": |
          (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
          /
          scalar(sum(node:node_memory_bytes_total:sum))
        "record": "node:node_memory_utilisation:ratio"
      - "expr": |
          1e3 * sum(
            (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          )
        "record": ":node_memory_swap_io_bytes:sum_rate"
      - "expr": |
          1 -
          sum by (node) (
            (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
          /
          sum by (node) (
            node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_memory_utilisation:"
      - "expr": |
          1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
        "record": "node:node_memory_utilisation_2:"
      - "expr": |
          1e3 * sum by (node) (
            (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_memory_swap_io_bytes:sum_rate"
      - "expr": |
          avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3)
        "record": ":node_disk_utilisation:avg_irate"
      - "expr": |
          avg by (node) (
            irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_disk_utilisation:avg_irate"
      - "expr": |
          avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3)
        "record": ":node_disk_saturation:avg_irate"
      - "expr": |
          avg by (node) (
            irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_disk_saturation:avg_irate"
      - "expr": |
          sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) +
          sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
        "record": ":node_net_utilisation:sum_irate"
      - "expr": |
          sum by (node) (
            (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) +
            irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_net_utilisation:sum_irate"
      - "expr": |
          sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) +
          sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
        "record": ":node_net_saturation:sum_irate"
      - "expr": |
          sum by (node) (
            (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) +
            irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
            node_namespace_pod:kube_pod_info:
          )
        "record": "node:node_net_saturation:sum_irate"
    - "name": "kubernetes-apps"
      "rules":
      - "alert": "KubePodCrashLooping"
        "annotations":
          "message": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} / second"
        "expr": |
          rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
        "for": "1h"
        "labels":
          "severity": "critical"
      - "alert": "KubePodNotReady"
        "annotations":
          "message": "{{ $labels.namespace }}/{{ $labels.pod }} is not ready."
        "expr": |
          sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0
        "for": "1h"
        "labels":
          "severity": "critical"
      - "alert": "KubeDeploymentGenerationMismatch"
        "annotations":
          "message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation mismatch"
        "expr": |
          kube_deployment_status_observed_generation{job="kube-state-metrics"}
          !=
          kube_deployment_metadata_generation{job="kube-state-metrics"}
        "for": "15m"
        "labels":
          "severity": "critical"
      - "alert": "KubeDeploymentReplicasMismatch"
        "annotations":
          "message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica mismatch"
        "expr": |
          kube_deployment_spec_replicas{job="kube-state-metrics"}
          !=
          kube_deployment_status_replicas_available{job="kube-state-metrics"}
        "for": "15m"
        "labels":
          "severity": "critical"
    - "name": "kubernetes-resources"
      "rules":
      - "alert": "KubeCPUOvercommit"
        "annotations":
          "message": "Overcommited CPU resource requests on Pods, cannot tolerate node failure."
        "expr": |
          sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
          /
          sum(node:node_num_cpu:sum)
          >
          (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
        "for": "5m"
        "labels":
          "severity": "warning"
      - "alert": "KubeMemOvercommit"
        "annotations":
          "message": "Overcommited Memory resource requests on Pods, cannot tolerate node failure."
        "expr": |
          sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
          /
          sum(node_memory_MemTotal)
          >
          (count(node:node_num_cpu:sum)-1)
          /
          count(node:node_num_cpu:sum)
        "for": "5m"
        "labels":
          "severity": "warning"
      - "alert": "KubeCPUOvercommit"
        "annotations":
          "message": "Overcommited CPU resource request quota on Namespaces."
        "expr": |
          sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
          /
          sum(node:node_num_cpu:sum)
          > 1.5
        "for": "5m"
        "labels":
          "severity": "warning"
      - "alert": "KubeMemOvercommit"
        "annotations":
          "message": "Overcommited Memory resource request quota on Namespaces."
        "expr": |
          sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
          /
          sum(node_memory_MemTotal{job="node-exporter"})
          > 1.5
        "for": "5m"
        "labels":
          "severity": "warning"
      - "alert": "KubeQuotaExceeded"
        "annotations":
          "message": "{{ printf \"%0.0f\" $value }}% usage of {{ $labels.resource }} in namespace {{ $labels.namespace }}."
        "expr": |
          100 * kube_resourcequota{job="kube-state-metrics", type="used"}
          / ignoring(instance, job, type)
          kube_resourcequota{job="kube-state-metrics", type="hard"}
          > 90
        "for": "15m"
        "labels":
          "severity": "warning"
    - "name": "kubernetes-storage"
      "rules":
      - "alert": "KubePersistentVolumeUsageCritical"
        "annotations":
          "message": "The persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} has {{ printf \"%0.0f\" $value }}% free."
        "expr": |
          100 * kubelet_volume_stats_available_bytes{job="kubelet"}
          /
          kubelet_volume_stats_capacity_bytes{job="kubelet"}
          < 3
        "for": "1m"
        "labels":
          "severity": "critical"
      - "alert": "KubePersistentVolumeFullInFourDays"
        "annotations":
          "message": "Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is expected to fill up within four days."
        "expr": |
          predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0
        "for": "5m"
        "labels":
          "severity": "critical"
    - "name": "kubernetes-system"
      "rules":
      - "alert": "KubeNodeNotReady"
        "annotations":
          "message": "{{ $labels.node }} has been unready for more than an hour"
        "expr": |
          max(kube_node_status_ready{job="kube-state-metrics", condition="false"} == 1) BY (node)
        "for": "1h"
        "labels":
          "severity": "warning"
      - "alert": "KubeVersionMismatch"
        "annotations":
          "message": "There are {{ $value }} different versions of Kubernetes components running."
        "expr": |
          count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
        "for": "1h"
        "labels":
          "severity": "warning"
      - "alert": "KubeClientErrors"
        "annotations":
          "message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }}% errors.'"
        "expr": |
          sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100
          /
          sum(rate(rest_client_requests_total[5m])) by (instance, job)
          > 1
        "for": "15m"
        "labels":
          "severity": "warning"
      - "alert": "KubeClientErrors"
        "annotations":
          "message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }} errors / sec.'"
        "expr": |
          sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
        "for": "15m"
        "labels":
          "severity": "warning"
kind: ConfigMap
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring