diff --git a/assets/kubernetes/alerts.yaml b/assets/kubernetes/alerts.yaml
index 289e964..9765bc3 100644
--- a/assets/kubernetes/alerts.yaml
+++ b/assets/kubernetes/alerts.yaml
@@ -438,9 +438,9 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       summary: Kubernetes API server client is experiencing errors.
     expr: |
-      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)
+      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
         /
-      sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))
+      sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
       > 0.01
     for: 15m
     labels:
@@ -598,11 +598,11 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       summary: Kubelet is running at capacity.
     expr: |
-      count by(node) (
+      count by(cluster, node) (
         (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
       )
       /
-      max by(node) (
+      max by(cluster, node) (
         kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
       ) > 0.95
     for: 15m
@@ -615,7 +615,7 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
       summary: Node readiness status is flapping.
     expr: |
-      sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+      sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
     for: 15m
     labels:
       severity: warning
diff --git a/assets/kubernetes/rules.yaml b/assets/kubernetes/rules.yaml
index 0aa7b05..136bed3 100644
--- a/assets/kubernetes/rules.yaml
+++ b/assets/kubernetes/rules.yaml
@@ -638,8 +638,8 @@ groups:
 - name: node.rules
   rules:
   - expr: |
-      topk by(namespace, pod) (1,
-        max by (node, namespace, pod) (
+      topk by(cluster, namespace, pod) (1,
+        max by (cluster, node, namespace, pod) (
           label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
       ))
     record: 'node_namespace_pod:kube_pod_info:'
diff --git a/assets/prometheus/alerts.yaml b/assets/prometheus/alerts.yaml
index 2bcb432..f9b3471 100644
--- a/assets/prometheus/alerts.yaml
+++ b/assets/prometheus/alerts.yaml
@@ -215,14 +215,25 @@ groups:
       severity: warning
   - alert: PrometheusScrapeBodySizeLimitHit
     annotations:
-      description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
-        }} targets because some targets exceeded the configured body_size_limit.
+      description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+        }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
       summary: Prometheus has dropped some targets that exceeded body size limit.
     expr: |
       increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus"}[5m]) > 0
     for: 15m
     labels:
       severity: warning
+  - alert: PrometheusScrapeSampleLimitHit
+    annotations:
+      description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+        }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
+      summary: Prometheus has failed scrapes that have exceeded the configured sample
+        limit.
+    expr: |
+      increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus"}[5m]) > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: PrometheusTargetSyncFailure
     annotations:
       description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
diff --git a/site/content/kubernetes/_index.md b/site/content/kubernetes/_index.md
index e9e5a42..59d975e 100644
--- a/site/content/kubernetes/_index.md
+++ b/site/content/kubernetes/_index.md
@@ -622,9 +622,9 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
   summary: Kubernetes API server client is experiencing errors.
 expr: |
-  (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)
+  (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
     /
-  sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))
+  sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
   > 0.01
 for: 15m
 labels:
@@ -860,11 +860,11 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
   summary: Kubelet is running at capacity.
 expr: |
-  count by(node) (
+  count by(cluster, node) (
     (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
   )
   /
-  max by(node) (
+  max by(cluster, node) (
     kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
   ) > 0.95
 for: 15m
@@ -883,7 +883,7 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
   summary: Node readiness status is flapping.
 expr: |
-  sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+  sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
 for: 15m
 labels:
   severity: warning
@@ -2028,8 +2028,8 @@ record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
 
 {{< code lang="yaml" >}}
 expr: |
-  topk by(namespace, pod) (1,
-    max by (node, namespace, pod) (
+  topk by(cluster, namespace, pod) (1,
+    max by (cluster, node, namespace, pod) (
       label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
   ))
 record: 'node_namespace_pod:kube_pod_info:'
diff --git a/site/content/prometheus/_index.md b/site/content/prometheus/_index.md
index df3a91a..b0250a5 100644
--- a/site/content/prometheus/_index.md
+++ b/site/content/prometheus/_index.md
@@ -319,8 +319,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusScrapeBodySizeLimitHit
 annotations:
-  description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
-    }} targets because some targets exceeded the configured body_size_limit.
+  description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+    }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
   summary: Prometheus has dropped some targets that exceeded body size limit.
 expr: |
   increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus"}[5m]) > 0
@@ -329,6 +329,22 @@ labels:
   severity: warning
 {{< /code >}}
 
+##### PrometheusScrapeSampleLimitHit
+
+{{< code lang="yaml" >}}
+alert: PrometheusScrapeSampleLimitHit
+annotations:
+  description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+    }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
+  summary: Prometheus has failed scrapes that have exceeded the configured sample
+    limit.
+expr: |
+  increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus"}[5m]) > 0
+for: 15m
+labels:
+  severity: warning
+{{< /code >}}
+
 ##### PrometheusTargetSyncFailure
 
 {{< code lang="yaml" >}}
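
Not part of the patch itself, but the new PrometheusScrapeSampleLimitHit alert can be exercised with promtool's rule unit tests (`promtool test rules <file>`). A minimal sketch, assuming the patched assets/prometheus/alerts.yaml is available as `alerts.yaml` next to the test file; the instance label, file names, and synthetic counter series are illustrative only:

```yaml
# prometheus_alerts_test.yaml -- hypothetical unit test for the new alert.
rule_files:
  - alerts.yaml          # the patched assets/prometheus/alerts.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The counter gains one failed scrape per minute, so
      # increase(...[5m]) stays > 0 and the alert fires after its 15m "for".
      - series: 'prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus",instance="prometheus-0:9090"}'
        values: '0+1x30'
    alert_rule_test:
      - eval_time: 20m
        alertname: PrometheusScrapeSampleLimitHit
        exp_alerts:
          - exp_labels:
              severity: warning
              job: prometheus
              instance: prometheus-0:9090
            exp_annotations:
              description: 'Prometheus prometheus-0:9090 has failed 5 scrapes in the last 5m because some targets exceeded the configured sample_limit.'
              summary: 'Prometheus has failed scrapes that have exceeded the configured sample limit.'
```

Running `promtool test rules prometheus_alerts_test.yaml` should then report the test as passing once the alert from this patch is in place.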