Mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-21 03:38:43 +00:00)
Delete chart exporter-kube-api because it has been replaced by kube-controller-manager alerts
parent 3923139e40
commit 59421f7018
64 changed files with 7171 additions and 7201 deletions
Changed file tree:
.gitignore
contrib/kube-prometheus
  assets/prometheus/rules
  hack/grafana-dashboards-configmap-generator/bin
helm
  alertmanager
  exporter-kube-api
  exporter-kube-controller-manager
  exporter-kube-dns
  exporter-kube-etcd
  exporter-kube-scheduler
  exporter-kube-state
  exporter-kubelets
  exporter-kubernetes
  exporter-node
  grafana
  hack
  kube-prometheus
  prometheus-operator
  prometheus
.gitignore (1 change, vendored)
@@ -8,3 +8,4 @@ requirements.lock
.DS_Store
__pycache__
.env/
.history/
@@ -10,4 +10,4 @@ groups:
      description: There is no running K8S controller manager. Deployments and replication
        controllers are not making progress.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
      summary: Controller manager is down
      summary: Controller manager is down
@@ -203,7 +203,7 @@ addArrayToConfigMap() {
    # Dashboard foot
    test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE
  done
  #echo "---"
  echo "---"

  IFS=$OLDIFS
  return 0
@@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
  email: mgoodness@gmail.com
- name: Giancarlo Rubio
  email: gianrubio@gmail.com
name: alertmanager
sources:
- https://github.com/coreos/prometheus-operator
@@ -46,13 +46,14 @@ Parameter | Description | Default
`config` | Alertmanager configuration directives | `{}`
`externalUrl` | External URL at which Alertmanager will be reachable | `""`
`image.repository` | Image | `quay.io/prometheus/alertmanager`
`image.tag` | Image tag | `v0.9.1`
`image.tag` | Image tag | `v0.12.0`
`ingress.enabled` | If true, Alertmanager Ingress will be created | `false`
`ingress.annotations` | Annotations for Alertmanager Ingress | `{}`
`ingress.fqdn` | Alertmanager Ingress fully-qualified domain name | `""`
`ingress.tls` | TLS configuration for Alertmanager Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Alertmanager configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/alertmanager.rules.yaml](templates/alertmanager.rules.yaml)`
`replicaCount` | Number of Alertmanager replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`service.annotations` | Annotations to be added to the Alertmanager Service | `{}`
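The parameters above are overridden through the chart's values. A minimal override sketch for a few of them (the hostname, replica count, and resource request are illustrative, not chart defaults):

image:
  repository: quay.io/prometheus/alertmanager
  tag: v0.12.0
ingress:
  enabled: true
  fqdn: alertmanager.example.com   # hypothetical hostname
replicaCount: 2
resources:
  requests:
    memory: 64Mi                   # illustrative request, not a chart default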
helm/alertmanager/templates/alertmanager.rules.yaml (new file, 32 lines)
@@ -0,0 +1,32 @@
{{ define "alertmanager.rules.yaml.tpl" }}
groups:
- name: alertmanager.rules
  rules:
  - alert: AlertmanagerConfigInconsistent
    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
      "alertmanager-$1", "alertmanager", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: The configuration of the instances of the Alertmanager cluster
        `{{`{{$labels.service}}`}}` are out of sync.
  - alert: AlertmanagerDownOrMissing
    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
        disappeared from discovery.
  - alert: AlertmanagerFailedReload
    expr: alertmanager_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace
        }}`}}/{{`{{ $labels.pod}}`}}.
{{ end }}
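Note the escaping idiom used throughout these templates: Helm renders the backtick-quoted raw string {{`{{ $labels.service }}`}} to a literal {{ $labels.service }}, so Prometheus/Alertmanager template markers survive chart rendering. A minimal sketch of the same idiom, with a hypothetical template name and alert:

{{ define "example.rules.yaml.tpl" }}
groups:
- name: example.rules
  rules:
  - alert: ExampleInstanceDown
    expr: up == 0
    for: 5m
    annotations:
      # Helm emits the backtick-quoted block verbatim; Prometheus evaluates
      # {{ $labels.instance }} later, when the alert fires.
      description: Instance {{`{{ $labels.instance }}`}} is down.
{{ end }}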
helm/alertmanager/templates/configmap.yaml (new file, 21 lines)
@@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: "alertmanager"
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    heritage: {{ .Release.Service }}
    prometheus: {{ .Release.Name }}
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
  {{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  alertmanager.rules: |-
{{- include "alertmanager.rules.yaml.tpl" . | indent 4}}
{{ end }}
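The ConfigMap falls back to the templated default rules above whenever `prometheusRules` is unset. A minimal values override sketch (the relaxed `for` window is illustrative, not part of this commit):

prometheusRules:
  alertmanager.rules: |-
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 30m        # example: wait longer than the default 10m before firing
        labels:
          severity: warning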
@@ -11,7 +11,7 @@ metadata:
    prometheus: {{ .Release.Name }}
  name: {{ template "fullname" . }}
spec:
  jobLabel: {{ template "name" . }}
  jobLabel: {{ template "fullname" . }}
  selector:
    matchLabels:
      alertmanager: {{ .Release.Name }}
@@ -49,7 +49,7 @@ selfServiceMonitor: true
##
image:
  repository: quay.io/prometheus/alertmanager
  tag: v0.9.1
  tag: v0.12.0

## Labels to be added to the Alertmanager
##
@@ -150,36 +150,6 @@ storageSpec: {}
#   requests:
#     storage: 50Gi
#   selector: {}
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
  alertmanager.rules: |-
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` are out of sync.
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.

# default rules are in templates/alertmanager.rules.yaml
# prometheusRules: {}
@ -1,21 +0,0 @@
|
|||
# Patterns to ignore when building packages.
|
||||
# This supports shell glob matching, relative path matching, and
|
||||
# negation (prefixed with !). Only one pattern per line.
|
||||
.DS_Store
|
||||
# Common VCS dirs
|
||||
.git/
|
||||
.gitignore
|
||||
.bzr/
|
||||
.bzrignore
|
||||
.hg/
|
||||
.hgignore
|
||||
.svn/
|
||||
# Common backup files
|
||||
*.swp
|
||||
*.bak
|
||||
*.tmp
|
||||
*~
|
||||
# Various IDEs
|
||||
.project
|
||||
.idea/
|
||||
*.tmproj
|
|
@@ -1,7 +0,0 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-api
version: 0.1.1
maintainers:
- name: Cloud Posse LLC
  email: hello@cloudposse.com
@@ -1,16 +0,0 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "fullname" -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
@@ -1,21 +0,0 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: {{ template "name" . }}
    component: kube-api
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
  name: {{ template "fullname" . }}
  namespace: kube-system
spec:
  clusterIP: None
  ports:
  - name: https-metrics
    port: 443
    protocol: TCP
    targetPort: 443
  selector:
    k8s-app: kube-apiserver
  type: ClusterIP
@@ -1,29 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
    component: kube-api
    heritage: "{{ .Release.Service }}"
    release: "{{ .Release.Name }}"
    prometheus: {{ .Release.Name }}
  name: {{ template "fullname" . }}
spec:
  jobLabel: {{ template "fullname" . }}
  selector:
    matchLabels:
      app: {{ template "name" . }}
      component: kube-api
  namespaceSelector:
    matchNames:
    - "kube-system"
  endpoints:
  - port: https-metrics
    interval: 15s
    scheme: https
    tlsConfig:
      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      # Skip verification until we have resolved why the certificate validation
      # for the kubelet on API server nodes fail.
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -1,93 +0,0 @@
|
|||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
ruleFiles:
|
||||
kube-api.rules: |-
|
||||
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
|
||||
|
||||
ALERT K8SApiServerLatency
|
||||
IF histogram_quantile(
|
||||
0.99,
|
||||
sum without (instance,node,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH"})
|
||||
) / 1e6 > 1.0
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "Kubernetes apiserver latency is high",
|
||||
description = "99th percentile Latency for {{`{{ $labels.verb }}`}} requests to the kube-apiserver is higher than 1s.",
|
||||
}
|
||||
|
||||
### API latency ###
|
||||
|
||||
# Raw metrics are in microseconds. Convert to seconds.
|
||||
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
|
||||
histogram_quantile(
|
||||
0.99,
|
||||
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
|
||||
) / 1e6
|
||||
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
|
||||
histogram_quantile(
|
||||
0.9,
|
||||
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
|
||||
) / 1e6
|
||||
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
|
||||
histogram_quantile(
|
||||
0.5,
|
||||
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
|
||||
) / 1e6
|
||||
|
||||
### File descriptor alerts
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS
|
||||
{
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS
|
||||
{
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
|
||||
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
|
||||
}
|
||||
|
||||
ALERT K8STooManyOpenFiles
|
||||
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
|
||||
FOR 10m
|
||||
LABELS {
|
||||
service = "k8s",
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
|
||||
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
|
||||
}
|
|
@@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-controller-manager
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
  email: hello@cloudposse.com
- name: Michael Goodness
  email: mgoodness@gmail.com
- name: Giancarlo Rubio
  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
  {{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  kube-controller-manager.rules: |-
{{- include "kube-controller-manager.rules.yaml.tpl" . | indent 4}}
{{ end }}
@@ -0,0 +1,15 @@
{{ define "kube-controller-manager.rules.yaml.tpl" }}
groups:
- name: kube-controller-manager.rules
  rules:
  - alert: K8SControllerManagerDown
    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There is no running K8S controller manager. Deployments and replication
        controllers are not making progress.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
      summary: Controller manager is down
{{ end }}
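Because the alert's job selector now uses the chart's `fullname` helper (`<release name>-<chart name>`, truncated to 63 characters, as in the `_helpers.tpl` shown in the deleted chart above), the rendered rule is scoped to the release's own job. A rough sketch of the rendered output, assuming a hypothetical release named `monitoring`:

kube-controller-manager.rules: |-
  groups:
  - name: kube-controller-manager.rules
    rules:
    - alert: K8SControllerManagerDown
      # {{ template "fullname" . }} -> "monitoring-exporter-kube-controller-manager"
      # for the hypothetical release name "monitoring"
      expr: absent(up{job="monitoring-exporter-kube-controller-manager"} == 1)
      for: 5m
      labels:
        severity: critical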
@@ -1,16 +1,2 @@
# WARNING: Don't touch this file, it's automatically generated by helm/hack/sync-kube-prometheus.sh
ruleFiles:
  kube-controller-manager.rules: |-
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
# default rules are in templates/kube-controller-manager.rules.yaml
# prometheusRules: {}
@ -1,7 +1,9 @@
|
|||
apiVersion: v1
|
||||
description: A Helm chart singleton for kube-state-metrics
|
||||
name: exporter-kube-dns
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
|
@ -9,7 +9,7 @@ metadata:
|
|||
prometheus: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
spec:
|
||||
jobLabel: {{ template "name" . }}
|
||||
jobLabel: {{ template "fullname" . }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {{ template "name" . }}
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
apiVersion: v1
|
||||
description: A Helm chart for Kubernetes
|
||||
name: exporter-kube-etcd
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
|
@ -9,8 +9,13 @@ metadata:
|
|||
release: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
data:
|
||||
{{- if .Values.prometheusRules }}
|
||||
{{- $root := . }}
|
||||
{{- range $key, $val := .Values.ruleFiles }}
|
||||
{{- range $key, $val := .Values.prometheusRules }}
|
||||
{{ $key }}: |-
|
||||
{{ $val | indent 4}}
|
||||
{{ tpl $val $root | indent 4}}
|
||||
{{- end }}
|
||||
{{ else }}
|
||||
etcd3.rules: |-
|
||||
{{- include "etcd3.rules.yaml.tpl" . | indent 4}}
|
||||
{{ end }}
|
helm/exporter-kube-etcd/templates/etcd3.rules.yaml (new file, 125 lines)
|
@ -0,0 +1,125 @@
|
|||
{{ define "etcd3.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: ./etcd3.rules
|
||||
rules:
|
||||
- alert: InsufficientMembers
|
||||
expr: count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: If one more etcd member goes down the cluster will be unavailable
|
||||
summary: etcd cluster insufficient members
|
||||
- alert: NoLeader
|
||||
expr: etcd_server_has_leader{job="{{ template "fullname" . }}"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: etcd member {{`{{ $labels.instance }}`}} has no leader
|
||||
summary: etcd member has no leader
|
||||
- alert: HighNumberOfLeaderChanges
|
||||
expr: increase(etcd_server_leader_changes_seen_total{job="{{ template "fullname" . }}"}[1h]) > 3
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader
|
||||
changes within the last hour
|
||||
summary: a high number of leader changes within the etcd cluster are happening
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
|
||||
on etcd instance {{`{{ $labels.instance }}`}}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
|
||||
on etcd instance {{`{{ $labels.instance }}`}}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: GRPCRequestsSlow
|
||||
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="{{ template "fullname" . }}",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: on etcd instance {{`{{ $labels.instance }}`}} gRPC requests to {{`{{ $labels.grpc_method
|
||||
}}`}} are slow
|
||||
summary: slow gRPC requests
|
||||
- alert: HighNumberOfFailedHTTPRequests
|
||||
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
|
||||
instance {{`{{ $labels.instance }}`}}'
|
||||
summary: a high number of HTTP requests are failing
|
||||
- alert: HighNumberOfFailedHTTPRequests
|
||||
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
|
||||
instance {{`{{ $labels.instance }}`}}'
|
||||
summary: a high number of HTTP requests are failing
|
||||
- alert: HTTPRequestsSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: on etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method
|
||||
}}`}} are slow
|
||||
summary: slow HTTP requests
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{`{{ $labels.instance }}`}} member communication with
|
||||
{{`{{ $labels.To }}`}} is slow
|
||||
summary: etcd member communication is slow
|
||||
- alert: HighNumberOfFailedProposals
|
||||
expr: increase(etcd_server_proposals_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal
|
||||
failures within the last hour
|
||||
summary: a high number of proposals within the etcd cluster are failing
|
||||
- alert: HighFsyncDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{`{{ $labels.instance }}`}} fync durations are high
|
||||
summary: high fsync durations
|
||||
- alert: HighCommitDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{`{{ $labels.instance }}`}} commit durations are high
|
||||
summary: high commit durations
|
||||
{{ end }}
|
|
@ -1,128 +1,2 @@
|
|||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
|
||||
ruleFiles:
|
||||
etcd3.rules: |-
|
||||
groups:
|
||||
- name: ./etcd3.rules
|
||||
rules:
|
||||
- alert: InsufficientMembers
|
||||
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: If one more etcd member goes down the cluster will be unavailable
|
||||
summary: etcd cluster insufficient members
|
||||
- alert: NoLeader
|
||||
expr: etcd_server_has_leader{job="etcd"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: etcd member {{ $labels.instance }} has no leader
|
||||
summary: etcd member has no leader
|
||||
- alert: HighNumberOfLeaderChanges
|
||||
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
|
||||
changes within the last hour
|
||||
summary: a high number of leader changes within the etcd cluster are happening
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
||||
on etcd instance {{ $labels.instance }}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: HighNumberOfFailedGRPCRequests
|
||||
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
|
||||
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
||||
on etcd instance {{ $labels.instance }}'
|
||||
summary: a high number of gRPC requests are failing
|
||||
- alert: GRPCRequestsSlow
|
||||
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
|
||||
}} are slow
|
||||
summary: slow gRPC requests
|
||||
- alert: HighNumberOfFailedHTTPRequests
|
||||
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}'
|
||||
summary: a high number of HTTP requests are failing
|
||||
- alert: HighNumberOfFailedHTTPRequests
|
||||
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}'
|
||||
summary: a high number of HTTP requests are failing
|
||||
- alert: HTTPRequestsSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
||||
}} are slow
|
||||
summary: slow HTTP requests
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} member communication with
|
||||
{{ $labels.To }} is slow
|
||||
summary: etcd member communication is slow
|
||||
- alert: HighNumberOfFailedProposals
|
||||
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
|
||||
failures within the last hour
|
||||
summary: a high number of proposals within the etcd cluster are failing
|
||||
- alert: HighFsyncDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} fync durations are high
|
||||
summary: high fsync durations
|
||||
- alert: HighCommitDurations
|
||||
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: etcd instance {{ $labels.instance }} commit durations are high
|
||||
summary: high commit durations
|
||||
# default rules are in templates/etcd3.rules.yaml
|
||||
# prometheusRules: {}
|
|
@ -1,7 +1,9 @@
|
|||
apiVersion: v1
|
||||
description: A Helm chart singleton for kube-state-metrics
|
||||
name: exporter-kube-scheduler
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
|
@ -9,8 +9,13 @@ metadata:
|
|||
release: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
data:
|
||||
{{- if .Values.prometheusRules }}
|
||||
{{- $root := . }}
|
||||
{{- range $key, $val := .Values.ruleFiles }}
|
||||
{{- range $key, $val := .Values.prometheusRules }}
|
||||
{{ $key }}: |-
|
||||
{{ $val | indent 4}}
|
||||
{{ tpl $val $root | indent 4}}
|
||||
{{- end }}
|
||||
{{ else }}
|
||||
kube-scheduler.rules: |-
|
||||
{{- include "kube-scheduler.rules.yaml.tpl" . | indent 4}}
|
||||
{{ end }}
|
|
@ -0,0 +1,60 @@
|
|||
{{ define "kube-scheduler.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- alert: K8SSchedulerDown
|
||||
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: There is no running K8S scheduler. New pods are not being assigned
|
||||
to nodes.
|
||||
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
|
||||
summary: Scheduler is down
|
||||
{{ end }}
|
|
@ -1,63 +1,2 @@
|
|||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
|
||||
ruleFiles:
|
||||
kube-scheduler.rules: |-
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: cluster:scheduler_binding_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
|
||||
BY (le, cluster)) / 1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- alert: K8SSchedulerDown
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: There is no running K8S scheduler. New pods are not being assigned
|
||||
to nodes.
|
||||
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
|
||||
summary: Scheduler is down
|
||||
# default rules are in templates/kube-scheduler.rules.yaml
|
||||
# prometheusRules: {}
|
|
@ -3,5 +3,7 @@ description: A Helm chart singleton for kube-state-metrics
|
|||
name: exporter-kube-state
|
||||
version: 0.1.3
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
||||
|
|
|
@ -9,8 +9,13 @@ metadata:
|
|||
release: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
data:
|
||||
{{- if .Values.prometheusRules }}
|
||||
{{- $root := . }}
|
||||
{{- range $key, $val := .Values.ruleFiles }}
|
||||
{{- range $key, $val := .Values.prometheusRules }}
|
||||
{{ $key }}: |-
|
||||
{{ $val | indent 4}}
|
||||
{{- end }}
|
||||
{{ else }}
|
||||
kube-state-metrics.rules: |-
|
||||
{{- include "kube-state-metrics.rules.yaml.tpl" . | indent 4}}
|
||||
{{ end }}
|
|
@ -0,0 +1,57 @@
|
|||
{{ define "kube-state-metrics.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: kube-state-metrics.rules
|
||||
rules:
|
||||
- alert: DeploymentGenerationMismatch
|
||||
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Observed deployment generation does not match expected one for
|
||||
deployment {{`{{$labels.namespaces}}`}}{{`{{$labels.deployment}}`}}
|
||||
- alert: DeploymentReplicasNotUpdated
|
||||
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
|
||||
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
|
||||
unless (kube_deployment_spec_paused == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Replicas are not updated and available for deployment {{`{{$labels.namespaces}}`}}/{{`{{$labels.deployment}}`}}
|
||||
- alert: DaemonSetRolloutStuck
|
||||
expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
|
||||
* 100 < 100
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Only {{`{{$value}}`}}% of desired pods scheduled and ready for daemon
|
||||
set {{`{{$labels.namespaces}}`}}/{{`{{$labels.daemonset}}`}}
|
||||
- alert: K8SDaemonSetsNotScheduled
|
||||
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: A number of daemonsets are not scheduled.
|
||||
summary: Daemonsets are not scheduled correctly
|
||||
- alert: DaemonSetsMissScheduled
|
||||
expr: kube_daemonset_status_number_misscheduled > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: A number of daemonsets are running where they are not supposed
|
||||
to run.
|
||||
summary: Daemonsets are not scheduled correctly
|
||||
- alert: PodFrequentlyRestarting
|
||||
expr: increase(kube_pod_container_status_restarts[1h]) > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Pod {{`{{$labels.namespaces}}`}}/{{`{{$labels.pod}}`}} is was restarted {{`{{$value}}`}}
|
||||
times within the last hour
|
||||
{{ end }}
|
|
@ -9,7 +9,7 @@ metadata:
|
|||
prometheus: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
spec:
|
||||
jobLabel: {{ template "name" . }}
|
||||
jobLabel: {{ template "fullname" . }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {{ template "name" . }}
|
||||
|
|
|
@ -25,61 +25,6 @@ addon_resizer:
|
|||
requests:
|
||||
cpu: 100m
|
||||
memory: 30Mi
|
||||
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
|
||||
ruleFiles:
|
||||
kube-state-metrics.rules: |-
|
||||
groups:
|
||||
- name: kube-state-metrics.rules
|
||||
rules:
|
||||
- alert: DeploymentGenerationMismatch
|
||||
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Observed deployment generation does not match expected one for
|
||||
deployment {{$labels.namespaces}}{{$labels.deployment}}
|
||||
- alert: DeploymentReplicasNotUpdated
|
||||
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
|
||||
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
|
||||
unless (kube_deployment_spec_paused == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
|
||||
- alert: DaemonSetRolloutStuck
|
||||
expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
|
||||
* 100 < 100
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Only {{$value}}% of desired pods scheduled and ready for daemon
|
||||
set {{$labels.namespaces}}/{{$labels.daemonset}}
|
||||
- alert: K8SDaemonSetsNotScheduled
|
||||
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: A number of daemonsets are not scheduled.
|
||||
summary: Daemonsets are not scheduled correctly
|
||||
- alert: DaemonSetsMissScheduled
|
||||
expr: kube_daemonset_status_number_misscheduled > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: A number of daemonsets are running where they are not supposed
|
||||
to run.
|
||||
summary: Daemonsets are not scheduled correctly
|
||||
- alert: PodFrequentlyRestarting
|
||||
expr: increase(kube_pod_container_status_restarts[1h]) > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}}
|
||||
times within the last hour
|
||||
|
||||
# default rules are in templates/kube-state-metrics.rules.yaml
|
||||
# prometheusRules: {}
|
|
@ -1,7 +1,9 @@
|
|||
apiVersion: v1
|
||||
description: A Helm chart for Kubernetes
|
||||
name: exporter-kubelets
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
|
@ -9,8 +9,13 @@ metadata:
|
|||
release: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
data:
|
||||
{{- if .Values.prometheusRules }}
|
||||
{{- $root := . }}
|
||||
{{- range $key, $val := .Values.ruleFiles }}
|
||||
{{- range $key, $val := .Values.prometheusRules }}
|
||||
{{ $key }}: |-
|
||||
{{ $val | indent 4}}
|
||||
{{- end }}
|
||||
{{ else }}
|
||||
kubelet.rules: |-
|
||||
{{- include "kubelet.rules.yaml.tpl" . | indent 4}}
|
||||
{{ end }}
|
helm/exporter-kubelets/templates/kubelet.rules.yaml (new file, 49 lines)
|
@ -0,0 +1,49 @@
|
|||
{{ define "kubelet.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- alert: K8SNodeNotReady
|
||||
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API,
|
||||
or has set itself to NotReady, for more than an hour
|
||||
summary: Node status is NotReady
|
||||
- alert: K8SManyNodesNotReady
|
||||
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
|
||||
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
|
||||
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{`{{ $value }}`}}% of Kubernetes nodes are not ready'
|
||||
- alert: K8SKubeletDown
|
||||
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets.
|
||||
- alert: K8SKubeletDown
|
||||
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
|
||||
* 100 > 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets, or all Kubelets
|
||||
have disappeared from service discovery.
|
||||
summary: Many Kubelets cannot be scraped
|
||||
- alert: K8SKubeletTooManyPods
|
||||
expr: kubelet_running_pod_count > 100
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close
|
||||
to the limit of 110
|
||||
summary: Kubelet is close to pod limit
|
||||
{{ end }}
|
|
@ -1,52 +1,2 @@
|
|||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
|
||||
ruleFiles:
|
||||
kubelet.rules: |-
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- alert: K8SNodeNotReady
|
||||
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
|
||||
or has set itself to NotReady, for more than an hour
|
||||
summary: Node status is NotReady
|
||||
- alert: K8SManyNodesNotReady
|
||||
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
|
||||
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
|
||||
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: '{{ $value }}% of Kubernetes nodes are not ready'
|
||||
- alert: K8SKubeletDown
|
||||
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Prometheus failed to scrape {{ $value }}% of kubelets.
|
||||
- alert: K8SKubeletDown
|
||||
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
|
||||
* 100 > 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
|
||||
have disappeared from service discovery.
|
||||
summary: Many Kubelets cannot be scraped
|
||||
- alert: K8SKubeletTooManyPods
|
||||
expr: kubelet_running_pod_count > 100
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
|
||||
to the limit of 110
|
||||
summary: Kubelet is close to pod limit
|
||||
# default rules are in templates/kubelet.rules.yaml
|
||||
# prometheusRules: {}
|
|
@ -1,7 +1,9 @@
|
|||
apiVersion: v1
|
||||
description: A Helm chart for Kubernetes
|
||||
name: exporter-kubernetes
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
maintainers:
|
||||
- name: Cloud Posse LLC
|
||||
email: hello@cloudposse.com
|
||||
- name: Michael Goodness
|
||||
email: mgoodness@gmail.com
|
||||
- name: Giancarlo Rubio
|
||||
email: gianrubio@gmail.com
|
|
@ -9,8 +9,13 @@ metadata:
|
|||
release: {{ .Release.Name }}
|
||||
name: {{ template "fullname" . }}
|
||||
data:
|
||||
{{- if .Values.prometheusRules }}
|
||||
{{- $root := . }}
|
||||
{{- range $key, $val := .Values.ruleFiles }}
|
||||
{{- range $key, $val := .Values.prometheusRules }}
|
||||
{{ $key }}: |-
|
||||
{{ $val | indent 4}}
|
||||
{{- end }}
|
||||
{{ else }}
|
||||
kubernetes.rules: |-
|
||||
{{- include "kubernetes.rules.yaml.tpl" . | indent 4}}
|
||||
{{ end }}
|
helm/exporter-kubernetes/templates/kubernetes.rules.yaml (new file, 88 lines)
|
@ -0,0 +1,88 @@
|
|||
{{ define "kubernetes.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: kubernetes.rules
|
||||
rules:
|
||||
- record: pod_name:container_memory_usage_bytes:sum
|
||||
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
|
||||
(pod_name)
|
||||
- record: pod_name:container_spec_cpu_shares:sum
|
||||
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
|
||||
- record: pod_name:container_cpu_usage:sum
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
|
||||
BY (pod_name)
|
||||
- record: pod_name:container_fs_usage_bytes:sum
|
||||
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
|
||||
- record: namespace:container_memory_usage_bytes:sum
|
||||
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
|
||||
- record: namespace:container_spec_cpu_shares:sum
|
||||
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
|
||||
- record: namespace:container_cpu_usage:sum
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
|
||||
BY (namespace)
|
||||
- record: cluster:memory_usage:ratio
|
||||
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
|
||||
(cluster) / sum(machine_memory_bytes) BY (cluster)
|
||||
- record: cluster:container_spec_cpu_shares:ratio
|
||||
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
|
||||
/ sum(machine_cpu_cores)
|
||||
- record: cluster:container_cpu_usage:ratio
|
||||
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
|
||||
/ sum(machine_cpu_cores)
|
||||
- record: apiserver_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
|
||||
1e+06
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
- record: apiserver_latency:quantile_seconds
|
||||
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
|
||||
1e+06
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
- record: apiserver_latency_seconds:quantile
|
||||
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
|
||||
1e+06
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
- alert: APIServerLatencyHigh
|
||||
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
|
||||
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
|
||||
- alert: APIServerLatencyHigh
|
||||
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
|
||||
> 4
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
|
||||
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
|
||||
- alert: APIServerErrorsHigh
|
||||
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
|
||||
* 100 > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: API server returns errors for {{`{{ $value }}`}}% of requests
|
||||
- alert: APIServerErrorsHigh
|
||||
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
|
||||
* 100 > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: API server returns errors for {{`{{ $value }}`}}% of requests
|
||||
- alert: K8SApiserverDown
|
||||
expr: absent(up{job="kubernetes"} == 1)
|
||||
for: 20m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: No API servers are reachable or all have disappeared from service
|
||||
discovery
|
||||
{{ end }}
|
|
@ -1,91 +1,2 @@
|
|||
# This is a YAML-formatted file.
|
||||
# Declare variables to be passed into your templates.
|
||||
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
|
||||
ruleFiles:
|
||||
kubernetes.rules: |-
|
||||
groups:
|
||||
- name: kubernetes.rules
|
||||
rules:
|
||||
- record: pod_name:container_memory_usage_bytes:sum
|
||||
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
|
||||
(pod_name)
|
||||
- record: pod_name:container_spec_cpu_shares:sum
|
||||
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
|
||||
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency:quantile_seconds
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / 1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service discovery

# default rules are in templates/kubernetes.rules.yaml
# prometheusRules: {}
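As the comment above notes, the built-in rules now ship as a chart template, and setting `prometheusRules` in values.yaml swaps them out: the ConfigMap template renders whatever rule files are supplied there instead of the default include. A minimal sketch of such an override (the rule name and expression are illustrative placeholders, not rules shipped with the chart):

    # values.yaml override sketch: each key becomes a rule file in the generated ConfigMap
    prometheusRules:
      custom.rules: |-
        groups:
        - name: custom.rules
          rules:
          - alert: ExampleAlwaysFiring   # illustrative alert, not part of the chart
            expr: vector(1)
            labels:
              severity: none

Anything supplied this way replaces the chart's default rules entirely, since the template only falls back to the bundled file when `prometheusRules` is unset.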
@@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-node
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
  email: hello@cloudposse.com
- name: Michael Goodness
  email: mgoodness@gmail.com
- name: Giancarlo Rubio
  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  node.rules: |-
{{- include "node.rules.yaml.tpl" . | indent 4}}
{{ end }}
46 helm/exporter-node/templates/node.rules.yaml Normal file
@@ -0,0 +1,46 @@
{{ define "node.rules.yaml.tpl" }}
groups:
- name: node.rules
  rules:
  - record: instance:node_cpu:rate:sum
    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) BY (instance)
  - record: instance:node_filesystem_usage:sum
    expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)
  - record: instance:node_network_receive_bytes:rate:sum
    expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
  - record: instance:node_network_transmit_bytes:rate:sum
    expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
  - record: instance:node_cpu:ratio
    expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
  - record: cluster:node_cpu:sum_rate5m
    expr: sum(rate(node_cpu{mode!="idle"}[5m]))
  - record: cluster:node_cpu:ratio
    expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
  - alert: NodeExporterDown
    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
  - alert: NodeDiskRunningFull
    expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
    for: 30m
    labels:
      severity: warning
    annotations:
      description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running full within the next 24 hours (mounted at {{`{{$labels.mountpoint}}`}})
  - alert: NodeDiskRunningFull
    expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
    for: 10m
    labels:
      severity: critical
    annotations:
      description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running full within the next 2 hours (mounted at {{`{{$labels.mountpoint}}`}})
{{ end }}
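The backtick construct used above (for example {{`{{$labels.device}}`}}) is what lets Prometheus template variables survive Helm rendering: Helm emits the backticked string literally, so the annotation comes out as plain Prometheus syntax again. Roughly, after `helm template` the NodeDiskRunningFull annotation would land in the ConfigMap as (a sketch, not a literal excerpt):

    annotations:
      description: device {{$labels.device}} on node {{$labels.instance}} is running
        full within the next 24 hours (mounted at {{$labels.mountpoint}})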
@@ -9,7 +9,7 @@ metadata:
    prometheus: {{ .Release.Name }}
  name: {{ template "fullname" . }}
spec:
  jobLabel: {{ template "name" . }}
  jobLabel: {{ template "fullname" . }}
  selector:
    matchLabels:
      app: {{ template "name" . }}
@@ -3,7 +3,7 @@
replicaCount: 1
image:
  repository: quay.io/prometheus/node-exporter
  tag: v0.15.1
  tag: v0.15.2
  pullPolicy: IfNotPresent
service:
  type: ClusterIP
@@ -16,50 +16,6 @@ resources:
  requests:
    cpu: 100m
    memory: 30Mi
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
  node.rules: |-
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})

# default rules are in templates/node.rules.yaml
# prometheusRules: {}
@@ -9,7 +9,9 @@ metadata:
  name: {{ template "grafana.server.fullname" . }}
data:
{{- if .Values.serverDashboardFiles }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ else }}
{{- include "grafana-dashboards.yaml.tpl" . | indent 2}}
{{- end }}
{{- if .Values.dataSource }}
{{ toYaml .Values.dataSource | indent 2 }}

6305 helm/grafana/templates/grafana-dashboards.yaml Normal file
File diff suppressed because it is too large
@@ -1,63 +1,73 @@
#!/usr/bin/env python
import os
import re
from ruamel import yaml

def escape(s):
    return s.replace("{{","{{`{{").replace("}}","}}`}}")

def get_header(file_name):
    return "{{ define \"" + file_name + ".tpl\" }}\n"

#####
## This script read the kube-prometheus rules and convert into helm charts format
## Step 1 - Sync prometheus alert rules, create template file
####
### ----------------------------
### Sync all prometheus rules
###
charts = [{'file_name': 'alertmanager', 'search_var': 'ruleFiles',
           'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
           'destination': 'helm/alertmanager/values.yaml'},
          {'file_name': 'kube-controller-manager', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
           'destination': 'helm/exporter-kube-controller-manager/values.yaml'},
          {'file_name': 'kube-scheduler', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
           'destination': 'helm/exporter-kube-scheduler/values.yaml'},
          {'file_name': 'kube-state-metrics', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
           'destination': 'helm/exporter-kube-state/values.yaml'},
          {'file_name': 'node', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
           'destination': 'helm/exporter-node/values.yaml'},
          {'file_name': 'prometheus', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
           'destination': 'helm/prometheus/values.yaml'},
          {'file_name': 'etcd3', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
           'destination': 'helm/exporter-kube-etcd/values.yaml'},
          # //TODO add {'file_name': 'general', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
          # 'destination': 'helm/kube-prometheus/general_rules.yaml'},
          {'file_name': 'kubelet', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
           'destination': 'helm/exporter-kubelets/values.yaml'},
          {'file_name': 'kubernetes', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
           'destination': 'helm/exporter-kubernetes/values.yaml'},
          ###
          ### Sync grafana dashboards
          ###
          {'file_name': 'grafana-dashboards-0', 'search_var': 'serverDashboardFiles', 'source':'contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml',
           'destination': 'helm/grafana/values.yaml'},
charts = [
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
     'destination': 'helm/alertmanager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source': 'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
     'destination': 'helm/exporter-kube-controller-manager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
     'destination': 'helm/exporter-kube-scheduler/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
     'destination': 'helm/exporter-kube-state/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
     'destination': 'helm/exporter-node/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
     'destination': 'helm/prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
     'destination': 'helm/exporter-kube-etcd/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
     'destination': 'helm/kube-prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
     'destination': 'helm/exporter-kubelets/', 'job_replace_by': 'kubelet'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
     'destination': 'helm/exporter-kubernetes/', 'job_replace_by': 'kubernetes'},
]

# read the rules, create a new template file
for chart in charts:
    lines = ""
    ## parse current values.yaml file
    f = open(chart['destination'], 'r')
    for l in f.readlines():

        # stop reading file after the rule
        if "{}:".format(chart['search_var']) in l:
            break
        lines+= l
    _, name = os.path.split(chart['source'])
    lines = get_header(name)

    lines+= "{}:\n".format(chart['search_var'])
    lines+= " {}.rules: |-\n".format(chart['file_name'])


    ## parse kube-prometheus rule
    f = open(chart['source'])
    for l in f.readlines():
        lines += " {}".format(l)
    f = open(chart['source'], 'r')
    lines += escape(f.read())
    lines = re.sub("job=\"(.*?)\"", "job=\"" + chart['job_replace_by'] + "\"", lines) #replace the job name by chart variable

    lines += "{{ end }}" # footer

    new_f = "{}/templates/{}".format(chart['destination'], name)

    # recreate the file
    with open(chart['destination'], 'w') as f:
    with open(new_f, 'w') as f:
        f.write(lines)

    print "Generated {}".format(new_f)

### ----------------------------
### 2
###
######
## Step 2 - Parse grafana dashboards, create a template file
######

with open('contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml', 'r') as s:
    data = yaml.load(s, Loader=yaml.RoundTripLoader)['data']

# prometheus datasource it's not required now
del data['prometheus-datasource.json']

data_s = get_header("grafana-dashboards.yaml.tpl")
data_s += escape(yaml.dump(data, Dumper=yaml.RoundTripDumper))
data_s += "{{ end }}" # footer

with open('helm/grafana/templates/grafana-dashboards.yaml', 'w') as f:
    f.write(data_s)
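Taken together, the rewritten script stops appending rules to each chart's values.yaml and instead emits a template per rules file: it wraps the upstream file in a {{ define "<file>.tpl" }} block, backtick-escapes Prometheus' own {{ ... }} expressions so Helm leaves them untouched, and rewrites every job="..." selector to the configured job_replace_by value. A rough before/after sketch for one excerpt of node.rules.yaml:

    # upstream input (contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml, excerpt)
    - alert: NodeExporterDown
      expr: absent(up{job="node-exporter"} == 1)

    # generated chart template (what the script would emit into helm/exporter-node/templates/node.rules.yaml)
    {{ define "node.rules.yaml.tpl" }}
    - alert: NodeExporterDown
      expr: absent(up{job="{{ template "fullname" . }}"} == 1)
    {{ end }}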
@@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
  email: mgoodness@gmail.com
- name: Giancarlo Rubio
  email: gianrubio@gmail.com
name: kube-prometheus
sources:
- https://github.com/coreos/prometheus-operator
@@ -1,62 +1,63 @@
dependencies:
- name: alertmanager
  version: 0.0.6
  repository: file://../alertmanager
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  #e2e-repository: file://../alertmanager
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: prometheus
  version: 0.0.8
  repository: file://../prometheus
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  #e2e-repository: file://../prometheus
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-api
  version: 0.1.1
  repository: file://../exporter-kube-api
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-controller-manager
  version: 0.1.1
  repository: file://../exporter-kube-controller-manager
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kube-controller-manager
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-dns
  version: 0.1.1
  repository: file://../exporter-kube-dns
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kube-dns
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-etcd
  version: 0.1.1
  repository: file://../exporter-kube-etcd
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kube-etcd
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-scheduler
  version: 0.1.1
  repository: file://../exporter-kube-scheduler
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kube-scheduler
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kube-state
  version: 0.1.3
  repository: file://../exporter-kube-state
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  #e2e-repository: file://../exporter-kube-state
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kubelets
  version: 0.1.1
  repository: file://../exporter-kubelets
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kubelets
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-kubernetes
  version: 0.1.1
  repository: file://../exporter-kubernetes
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-kubernetes
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

- name: exporter-node
  version: 0.1.1
  repository: file://../exporter-node
  # repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  version: 0.1.2
  #e2e-repository: file://../exporter-node
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  condition: deployExporterNode

- name: grafana
  version: 0.0.5
  repository: file://../grafana
  #repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  #e2e-repository: file://../grafana
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  condition: deployGrafana

- name: prometheus-operator
  version: 0.0.8
  #e2e-repository: file://../prometheus-operator
  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
  condition: deployPrometheusOperator
@@ -9,8 +9,13 @@ metadata:
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  general.rules: |-
{{- include "general.rules.yaml.tpl" . | indent 4}}
{{ end }}
41 helm/kube-prometheus/templates/general.rules.yaml Normal file
@@ -0,0 +1,41 @@
{{ define "general.rules.yaml.tpl" }}
groups:
- name: general.rules
  rules:
  - alert: TargetDown
    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{`{{ $value }}`}}% of {{`{{ $labels.job }}`}} targets are down.'
      summary: Targets are down
  - alert: DeadMansSwitch
    expr: vector(1)
    labels:
      severity: none
    annotations:
      description: This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.
      summary: Alerting DeadMansSwitch
  - record: fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance will exhaust in file/socket descriptors within the next 4 hours'
      summary: file descriptors soon exhausted
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance will exhaust in file/socket descriptors within the next hour'
      summary: file descriptors soon exhausted
{{ end }}
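The FdExhaustionClose pair above relies on predict_linear, which fits a linear trend over the sampled range and projects it the given number of seconds ahead; the alert fires once the projected fd_utilization (open over maximum file descriptors) would cross 1.0. A small worked sketch with made-up numbers:

    # fd_utilization = process_open_fds / process_max_fds
    # suppose utilization grew linearly from 0.50 to 0.60 over the last hour (+0.10/h)
    # predict_linear(fd_utilization[1h], 3600 * 4) extrapolates 4 hours ahead:
    #   0.60 + 4 * 0.10 = 1.00, so the "> 1" warning threshold is about to be reached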
@@ -4,6 +4,9 @@ deployExporterNode: True
# Grafana
deployGrafana: True

# Prometheus operator
deployPrometheusOperator: False

alertmanager:
  ## Alertmanager configuration directives
  ## Ref: https://prometheus.io/docs/alerting/configuration/

@@ -24,7 +27,6 @@ alertmanager:
    receivers:
    - name: 'null'


  ## External URL at which Alertmanager will be reachable
  ##
  externalUrl: ""

@@ -338,3 +340,6 @@ prometheus:
  # requests:
  #   storage: 16Gi
  # selector: {}

# default rules are in templates/general.rules.yaml
# prometheusRules: {}
@@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
  email: mgoodness@gmail.com
- name: Giancarlo Rubio
  email: gianrubio@gmail.com
name: prometheus-operator
sources:
- https://github.com/coreos/prometheus-operator
|
@ -26,4 +26,4 @@ data:
|
|||
- {{ .Release.Namespace | quote }}
|
||||
endpoints:
|
||||
- port: http
|
||||
interval: 30s
|
||||
interval: 30s
|
|
@@ -2,7 +2,7 @@ apiVersion: v1
description: Prometheus instance created by the CoreOS Prometheus Operator
engine: gotpl
maintainers:
- name: Michael Goodness
- name: Giancarlo Rubio
  email: mgoodness@gmail.com
name: prometheus
sources:
@@ -55,6 +55,7 @@ Parameter | Description | Default
`ingress.tls` | TLS configuration for Prometheus Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Prometheus configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/prometheus.rules.yaml](templates/prometheus.rules.yaml)`
`replicaCount` | Number of Prometheus replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`retention` | How long to retain metrics | `24h`
21 helm/prometheus/templates/configmap.yaml Normal file
@@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: "prometheus"
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    heritage: {{ .Release.Service }}
    prometheus: {{ .Release.Name }}
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  prometheus.rules: |-
{{- include "prometheus.rules.yaml.tpl" . | indent 4}}
{{ end }}
73 helm/prometheus/templates/prometheus.rules.yaml Normal file
@@ -0,0 +1,73 @@
{{ define "prometheus.rules.yaml.tpl" }}
groups:
- name: prometheus.rules
  rules:
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk
  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks
  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
      summary: Prometheus write-ahead log is corrupted
{{ end }}
@@ -10,7 +10,7 @@ metadata:
    prometheus: {{ .Release.Name }}
  name: {{ template "fullname" . }}
spec:
  jobLabel: {{ template "name" . }}
  jobLabel: {{ template "fullname" . }}
  selector:
    matchLabels:
      app: {{ template "name" . }}
@@ -248,77 +248,6 @@ storageSpec: {}
# requests:
#   storage: 50Gi
# selector: {}
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
  prometheus.rules: |-
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
          summary: Prometheus write-ahead log is corrupted

# default rules are in templates/prometheus.rules.yaml
# prometheusRules: {}