mirror of https://github.com/monitoring-mixins/website.git (synced 2024-12-14 11:37:31 +00:00)

assets,site/content: daily assets regeneration

parent 3df33bef38
commit 809156abf1
5 changed files with 45 additions and 18 deletions
@@ -438,9 +438,9 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       summary: Kubernetes API server client is experiencing errors.
     expr: |
-      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)
+      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
         /
-      sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))
+      sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
       > 0.01
     for: 15m
     labels:
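Note: the recurring change in this regeneration is adding the cluster label to the by() aggregation clauses, so that ratios and counts are computed per cluster instead of being merged when one Prometheus instance scrapes several clusters. A minimal sketch of the pattern, using a hypothetical metric name that is not part of this diff:

    # merged across clusters: one series per instance/job/namespace
    sum(rate(example_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)

    # kept apart per cluster: one series per cluster/instance/job/namespace
    sum(rate(example_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)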
@@ -598,11 +598,11 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       summary: Kubelet is running at capacity.
     expr: |
-      count by(node) (
+      count by(cluster, node) (
         (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
       )
       /
-      max by(node) (
+      max by(cluster, node) (
         kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
       ) > 0.95
     for: 15m
@@ -615,7 +615,7 @@ groups:
       runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
       summary: Node readiness status is flapping.
     expr: |
-      sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+      sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
     for: 15m
     labels:
       severity: warning
@@ -638,8 +638,8 @@ groups:
 - name: node.rules
   rules:
   - expr: |
-      topk by(namespace, pod) (1,
-        max by (node, namespace, pod) (
+      topk by(cluster, namespace, pod) (1,
+        max by (cluster, node, namespace, pod) (
           label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
       ))
     record: 'node_namespace_pod:kube_pod_info:'
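Note: node_namespace_pod:kube_pod_info: records, for every pod assigned to a node, a series carrying that pod's node label, so other expressions can join pod-level metrics to their node. An illustrative use of the rule, assuming a hypothetical pod-level metric and a cluster label on both sides (not taken from this diff):

    # attach the node label to a pod-level metric, then aggregate per cluster and node
    sum by (cluster, node) (
      some_pod_level_metric
      * on (cluster, namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
    )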
@@ -215,14 +215,25 @@ groups:
       severity: warning
   - alert: PrometheusScrapeBodySizeLimitHit
     annotations:
-      description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
-        }} targets because some targets exceeded the configured body_size_limit.
+      description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+        }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
       summary: Prometheus has dropped some targets that exceeded body size limit.
     expr: |
       increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus"}[5m]) > 0
     for: 15m
     labels:
       severity: warning
+  - alert: PrometheusScrapeSampleLimitHit
+    annotations:
+      description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+        }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
+      summary: Prometheus has failed scrapes that have exceeded the configured sample
+        limit.
+    expr: |
+      increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus"}[5m]) > 0
+    for: 15m
+    labels:
+      severity: warning
   - alert: PrometheusTargetSyncFailure
     annotations:
       description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
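Note: body_size_limit and sample_limit are per-scrape-config settings in prometheus.yml; when a target exceeds either limit the whole scrape fails and the corresponding prometheus_target_scrapes_exceeded_*_total counter increases, which is what these alerts watch. A minimal sketch with illustrative values, not taken from this repository:

    scrape_configs:
      - job_name: example          # hypothetical job name
        sample_limit: 10000        # fail the scrape if a target exposes more than this many samples
        body_size_limit: 10MB      # fail the scrape if the uncompressed response body is larger
        static_configs:
          - targets: ["localhost:9090"]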
@@ -622,9 +622,9 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
   summary: Kubernetes API server client is experiencing errors.
 expr: |
-  (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, namespace)
+  (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
     /
-  sum(rate(rest_client_requests_total[5m])) by (instance, job, namespace))
+  sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
   > 0.01
 for: 15m
 labels:
@@ -860,11 +860,11 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
   summary: Kubelet is running at capacity.
 expr: |
-  count by(node) (
+  count by(cluster, node) (
     (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
   )
   /
-  max by(node) (
+  max by(cluster, node) (
     kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
   ) > 0.95
 for: 15m
@@ -883,7 +883,7 @@ annotations:
   runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
   summary: Node readiness status is flapping.
 expr: |
-  sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+  sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
 for: 15m
 labels:
   severity: warning
@@ -2028,8 +2028,8 @@ record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
 
 {{< code lang="yaml" >}}
 expr: |
-  topk by(namespace, pod) (1,
-    max by (node, namespace, pod) (
+  topk by(cluster, namespace, pod) (1,
+    max by (cluster, node, namespace, pod) (
       label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
   ))
 record: 'node_namespace_pod:kube_pod_info:'
@@ -319,8 +319,8 @@ labels:
 {{< code lang="yaml" >}}
 alert: PrometheusScrapeBodySizeLimitHit
 annotations:
-  description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
-    }} targets because some targets exceeded the configured body_size_limit.
+  description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+    }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
   summary: Prometheus has dropped some targets that exceeded body size limit.
 expr: |
   increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus"}[5m]) > 0
@@ -329,6 +329,22 @@ labels:
   severity: warning
 {{< /code >}}
+
+##### PrometheusScrapeSampleLimitHit
+
+{{< code lang="yaml" >}}
+alert: PrometheusScrapeSampleLimitHit
+annotations:
+  description: Prometheus {{$labels.instance}} has failed {{ printf "%.0f" $value
+    }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
+  summary: Prometheus has failed scrapes that have exceeded the configured sample
+    limit.
+expr: |
+  increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus"}[5m]) > 0
+for: 15m
+labels:
+  severity: warning
+{{< /code >}}
 
 ##### PrometheusTargetSyncFailure
 
 {{< code lang="yaml" >}}