mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-21)

Merge pull request #797 from gianrubio/helm-prometheus-2.0 — Bump prometheus chart to v2.0
Commit: 39856066c3
66 changed files with 7,185 additions and 775 deletions
Changed paths:
  .gitignore
  helm/alertmanager
  helm/exporter-kube-api
  helm/exporter-kube-controller-manager
  helm/exporter-kube-dns
  helm/exporter-kube-etcd
  helm/exporter-kube-scheduler
  helm/exporter-kube-state
  helm/exporter-kubelets
  helm/exporter-kubernetes
  helm/exporter-node
  helm/grafana
  helm/hack
  helm/kube-prometheus
  helm/prometheus-operator
  helm/prometheus

.gitignore (vendored, 1 change)
@@ -8,3 +8,4 @@ requirements.lock
 .DS_Store
 __pycache__
 .env/
+.history/
@@ -4,7 +4,9 @@ engine: gotpl
 maintainers:
 - name: Michael Goodness
   email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
 name: alertmanager
 sources:
 - https://github.com/coreos/prometheus-operator
-version: 0.0.5
+version: 0.0.6
@@ -46,13 +46,14 @@ Parameter | Description | Default
 `config` | Alertmanager configuration directives | `{}`
 `externalUrl` | External URL at which Alertmanager will be reachable | `""`
 `image.repository` | Image | `quay.io/prometheus/alertmanager`
-`image.tag` | Image tag | `v0.5.1`
+`image.tag` | Image tag | `v0.12.0`
 `ingress.enabled` | If true, Alertmanager Ingress will be created | `false`
 `ingress.annotations` | Annotations for Alertmanager Ingress | `{}`
 `ingress.fqdn` | Alertmanager Ingress fully-qualified domain name | `""`
 `ingress.tls` | TLS configuration for Alertmanager Ingress | `[]`
 `nodeSelector` | Node labels for pod assignment | `{}`
 `paused` | If true, the Operator won't process any Alertmanager configuration changes | `false`
+`prometheusRules` | Prometheus rules | `[templates/alertmanager.rules.yaml](templates/alertmanager.rules.yaml)`
 `replicaCount` | Number of Alertmanager replicas desired | `1`
 `resources` | Pod resource requests & limits | `{}`
 `service.annotations` | Annotations to be added to the Alertmanager Service | `{}`
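The table rows above map one-to-one onto values.yaml keys. A minimal override sketch (the file and release name are hypothetical, not part of this diff):

    # my-values.yaml (hypothetical)
    image:
      tag: v0.12.0
    replicaCount: 3
    ingress:
      enabled: true
      fqdn: alertmanager.example.com

Applied with something like `helm upgrade my-release helm/alertmanager -f my-values.yaml`.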
helm/alertmanager/templates/alertmanager.rules.yaml (new file, +32)
@@ -0,0 +1,32 @@
+{{ define "alertmanager.rules.yaml.tpl" }}
+groups:
+- name: alertmanager.rules
+  rules:
+  - alert: AlertmanagerConfigInconsistent
+    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
+      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
+      "alertmanager-$1", "alertmanager", "(.*)") != 1
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: The configuration of the instances of the Alertmanager cluster
+        {{`{{ $labels.service }}`}} are out of sync.
+  - alert: AlertmanagerDownOrMissing
+    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
+      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
+        disappeared from discovery.
+  - alert: AlertmanagerFailedReload
+    expr: alertmanager_config_last_reload_successful == 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace
+        }}`}}/{{`{{ $labels.pod}}`}}.
+{{ end }}
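This file is the Prometheus 2.0 replacement for the 1.x rule text the charts used to ship inside values.yaml. For orientation (an illustrative mapping, not a line from this diff), a 1.x rule such as

    ALERT AlertmanagerFailedReload
      IF alertmanager_config_last_reload_successful == 0
      FOR 10m
      LABELS { severity = "warning" }

becomes the `groups:`/`rules:` YAML form above. The {{`{{ $labels.pod }}`}} construction is a Helm escaping idiom: the outer {{` ... `}} makes Helm emit the inner {{ $labels.pod }} literally, so Prometheus rather than Helm expands it at evaluation time.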
helm/alertmanager/templates/configmap.yaml (new file, +21)
@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  labels:
+    app: "alertmanager"
+    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
+    heritage: {{ .Release.Service }}
+    prometheus: {{ .Release.Name }}
+    release: {{ .Release.Name }}
+  name: {{ template "fullname" . }}
+data:
+{{- if .Values.prometheusRules }}
+{{- $root := . }}
+{{- range $key, $val := .Values.prometheusRules }}
+  {{ $key }}: |-
+{{ $val | indent 4}}
+{{- end }}
+{{ else }}
+  alertmanager.rules: |-
+{{- include "alertmanager.rules.yaml.tpl" . | indent 4}}
+{{ end }}
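This ConfigMap is where the new `prometheusRules` value from the README takes effect: when set, user-supplied rule files replace the bundled template entirely; otherwise the `alertmanager.rules.yaml.tpl` defined above is rendered in. A minimal override sketch (the rule name and body are hypothetical):

    # values.yaml (hypothetical override)
    prometheusRules:
      my-custom.rules: |-
        groups:
        - name: my-custom.rules
          rules:
          - alert: AlertmanagerAbsent
            expr: absent(up{job="alertmanager"} == 1)
            for: 5m
            labels:
              severity: critical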
@@ -11,7 +11,7 @@ metadata:
     prometheus: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 spec:
-  jobLabel: {{ template "name" . }}
+  jobLabel: {{ template "fullname" . }}
   selector:
     matchLabels:
       alertmanager: {{ .Release.Name }}
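The jobLabel switch from "name" to "fullname" (repeated in several ServiceMonitors below) keeps scrapes consistent with the rewritten rules, which select on job="{{ template "fullname" . }}": jobLabel names the Service label whose value becomes the `job` label on every scraped series. Sketch of the effect, assuming a release named `foo`:

    up{job="foo-alertmanager", ...}   # new: jobLabel fullname
    up{job="alertmanager", ...}       # old: jobLabel name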
@@ -4,18 +4,18 @@
 config:
   global:
     resolve_timeout: 5m
 
-  receivers:
-  - name: webhook
-    webhook_configs:
-    - url: 'http://alertmanagerwh:30500/'
-
   route:
     group_by: ['job']
-    group_interval: 5m
     group_wait: 30s
-    receiver: webhook
+    group_interval: 5m
+    repeat_interval: 12h
+    receiver: 'null'
+    routes:
+    - match:
+        alertname: DeadMansSwitch
+      receiver: 'null'
+  receivers:
+  - name: 'null'
 
 ## Alertmanager template files to include
 #
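The default config now routes the always-firing DeadMansSwitch alert to a no-op 'null' receiver instead of the example webhook. Because DeadMansSwitch fires continuously by design, its absence downstream signals a broken alerting pipeline. A sketch of pointing it at a real heartbeat endpoint instead (receiver name and URL hypothetical):

    route:
      routes:
      - match:
          alertname: DeadMansSwitch
        receiver: heartbeat
    receivers:
    - name: heartbeat
      webhook_configs:
      - url: 'https://deadmanssnitch.example.com/hook'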
@@ -49,7 +49,7 @@ selfServiceMonitor: true
 ##
 image:
   repository: quay.io/prometheus/alertmanager
-  tag: v0.7.1
+  tag: v0.12.0
 
 ## Labels to be added to the Alertmanager
 ##
@@ -150,3 +150,6 @@ storageSpec: {}
 #    requests:
 #      storage: 50Gi
 #    selector: {}
+
+# default rules are in templates/alertmanager.rules.yaml
+# prometheusRules: {}
@@ -1,21 +0,0 @@ (file deleted)
-# Patterns to ignore when building packages.
-# This supports shell glob matching, relative path matching, and
-# negation (prefixed with !). Only one pattern per line.
-.DS_Store
-# Common VCS dirs
-.git/
-.gitignore
-.bzr/
-.bzrignore
-.hg/
-.hgignore
-.svn/
-# Common backup files
-*.swp
-*.bak
-*.tmp
-*~
-# Various IDEs
-.project
-.idea/
-*.tmproj
@@ -1,7 +0,0 @@ (file deleted)
-apiVersion: v1
-description: A Helm chart for Kubernetes
-name: exporter-kube-api
-version: 0.1.1
-maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
@@ -1,16 +0,0 @@ (file deleted)
-{{/* vim: set filetype=mustache: */}}
-{{/*
-Expand the name of the chart.
-*/}}
-{{- define "name" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
-{{- end -}}
-
-{{/*
-Create a default fully qualified app name.
-We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
-*/}}
-{{- define "fullname" -}}
-{{- $name := default .Chart.Name .Values.nameOverride -}}
-{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
-{{- end -}}
@@ -1,21 +0,0 @@ (file deleted)
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    app: {{ template "name" . }}
-    component: kube-api
-    heritage: {{ .Release.Service }}
-    release: {{ .Release.Name }}
-    chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
-  name: {{ template "fullname" . }}
-  namespace: kube-system
-spec:
-  clusterIP: None
-  ports:
-  - name: https-metrics
-    port: 443
-    protocol: TCP
-    targetPort: 443
-  selector:
-    k8s-app: kube-apiserver
-  type: ClusterIP
@@ -1,29 +0,0 @@ (file deleted)
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  labels:
-    chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
-    component: kube-api
-    heritage: "{{ .Release.Service }}"
-    release: "{{ .Release.Name }}"
-    prometheus: {{ .Release.Name }}
-  name: {{ template "fullname" . }}
-spec:
-  jobLabel: {{ template "fullname" . }}
-  selector:
-    matchLabels:
-      app: {{ template "name" . }}
-      component: kube-api
-  namespaceSelector:
-    matchNames:
-    - "kube-system"
-  endpoints:
-  - port: https-metrics
-    interval: 15s
-    scheme: https
-    tlsConfig:
-      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-      # Skip verification until we have resolved why the certificate validation
-      # for the kubelet on API server nodes fail.
-      insecureSkipVerify: true
-    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@@ -1,93 +0,0 @@ (file deleted)
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kube-api.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ALERT K8SApiServerLatency
-      IF histogram_quantile(
-          0.99,
-          sum without (instance,node,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH"})
-        ) / 1e6 > 1.0
-      FOR 10m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Kubernetes apiserver latency is high",
-        description = "99th percentile Latency for {{`{{ $labels.verb }}`}} requests to the kube-apiserver is higher than 1s.",
-      }
-
-    ### API latency ###
-
-    # Raw metrics are in microseconds. Convert to seconds.
-    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
-      histogram_quantile(
-        0.99,
-        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-      ) / 1e6
-    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
-      histogram_quantile(
-        0.9,
-        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-      ) / 1e6
-    cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
-      histogram_quantile(
-        0.5,
-        sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
-      ) / 1e6
-
-    ### File descriptor alerts
-
-    instance:fd_utilization = process_open_fds / process_max_fds
-
-    # alert if file descriptors are likely to exhaust within the next 4 hours
-    ALERT FdExhaustionClose
-      IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "file descriptors soon exhausted",
-        description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
-      }
-
-    # alert if file descriptors are likely to exhaust within the next hour
-    ALERT FdExhaustionClose
-      IF predict_linear(instance:fd_utilization[10m], 3600) > 1
-      FOR 10m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "file descriptors soon exhausted",
-        description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
-      }
-
-    ALERT K8STooManyOpenFiles
-      IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
-      FOR 10m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
-        description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
-      }
-
-    ALERT K8STooManyOpenFiles
-      IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
-      FOR 10m
-      LABELS {
-        service = "k8s",
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
-        description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
-      }
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: exporter-kube-controller-manager
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  kube-controller-manager.rules: |-
+{{- include "kube-controller-manager.rules.yaml.tpl" . | indent 4}}
+{{ end }}
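Note the quiet semantic change on the user-supplied path: the old `tpl $val $root` rendered Helm directives inside user rule strings, while the new `{{ $val | indent 4}}` inserts them verbatim. Under that reading, overrides that leaned on templating now need pre-rendered text (both snippets hypothetical):

    # old style (was rendered by tpl):
    ruleFiles:
      my.rules: 'ALERT Down IF up{job="{{ template "fullname" . }}"} == 0'
    # new style (taken literally):
    prometheusRules:
      my.rules: |-
        groups:
        - name: my.rules
          rules: []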
helm/exporter-kube-controller-manager/templates/kube-controller-manager.rules.yaml (new file)
@@ -0,0 +1,15 @@
+{{ define "kube-controller-manager.rules.yaml.tpl" }}
+groups:
+- name: kube-controller-manager.rules
+  rules:
+  - alert: K8SControllerManagerDown
+    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: There is no running K8S controller manager. Deployments and replication
+        controllers are not making progress.
+      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
+      summary: Controller manager is down
+{{ end }}
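The `absent(up{job="..."} == 1)` idiom condenses what the removed 1.x rule needed a disjunction for: `up == 1` drops every unhealthy series, and absent() then fires both when all targets are down and when they have vanished from service discovery. The equivalent 1.x logic, for comparison:

    absent(up{job="..."}) or (count(up{job="..."} == 1) == 0)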
@@ -1,17 +1,2 @@
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kube-controller-manager.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ALERT K8SControllerManagerDown
-      IF absent(up{job="{{ template "fullname" . }}"}) or (count by(cluster) (up{job="{{ template "fullname" . }}"} == 1) == 0)
-      FOR 5m
-      LABELS {
-        service = "k8s",
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Controller manager is down",
-        description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
-      }
+# default rules are in templates/kube-controller-manager.rules.yaml
+# prometheusRules: {}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart singleton for kube-state-metrics
 name: exporter-kube-dns
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,7 +9,7 @@ metadata:
     prometheus: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 spec:
-  jobLabel: {{ template "name" . }}
+  jobLabel: {{ template "fullname" . }}
   selector:
     matchLabels:
       app: {{ template "name" . }}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: exporter-kube-etcd
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
 {{ tpl $val $root | indent 4}}
 {{- end }}
+{{ else }}
+  etcd3.rules: |-
+{{- include "etcd3.rules.yaml.tpl" . | indent 4}}
+{{ end }}

helm/exporter-kube-etcd/templates/etcd3.rules.yaml (new file, +125)
@@ -0,0 +1,125 @@
+{{ define "etcd3.rules.yaml.tpl" }}
+groups:
+- name: ./etcd3.rules
+  rules:
+  - alert: InsufficientMembers
+    expr: count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: If one more etcd member goes down the cluster will be unavailable
+      summary: etcd cluster insufficient members
+  - alert: NoLeader
+    expr: etcd_server_has_leader{job="{{ template "fullname" . }}"} == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      description: etcd member {{`{{ $labels.instance }}`}} has no leader
+      summary: etcd member has no leader
+  - alert: HighNumberOfLeaderChanges
+    expr: increase(etcd_server_leader_changes_seen_total{job="{{ template "fullname" . }}"}[1h]) > 3
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader
+        changes within the last hour
+      summary: a high number of leader changes within the etcd cluster are happening
+  - alert: HighNumberOfFailedGRPCRequests
+    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.01
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
+        on etcd instance {{`{{ $labels.instance }}`}}'
+      summary: a high number of gRPC requests are failing
+  - alert: HighNumberOfFailedGRPCRequests
+    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.05
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
+        on etcd instance {{`{{ $labels.instance }}`}}'
+      summary: a high number of gRPC requests are failing
+  - alert: GRPCRequestsSlow
+    expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="{{ template "fullname" . }}",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
+      > 0.15
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: on etcd instance {{`{{ $labels.instance }}`}} gRPC requests to {{`{{ $labels.grpc_method
+        }}`}} are slow
+      summary: slow gRPC requests
+  - alert: HighNumberOfFailedHTTPRequests
+    expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
+      BY (method) > 0.01
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
+        instance {{`{{ $labels.instance }}`}}'
+      summary: a high number of HTTP requests are failing
+  - alert: HighNumberOfFailedHTTPRequests
+    expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
+      BY (method) > 0.05
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
+        instance {{`{{ $labels.instance }}`}}'
+      summary: a high number of HTTP requests are failing
+  - alert: HTTPRequestsSlow
+    expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+      > 0.15
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: on etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method
+        }}`}} are slow
+      summary: slow HTTP requests
+  - alert: EtcdMemberCommunicationSlow
+    expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+      > 0.15
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{`{{ $labels.instance }}`}} member communication with
+        {{`{{ $labels.To }}`}} is slow
+      summary: etcd member communication is slow
+  - alert: HighNumberOfFailedProposals
+    expr: increase(etcd_server_proposals_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal
+        failures within the last hour
+      summary: a high number of proposals within the etcd cluster are failing
+  - alert: HighFsyncDurations
+    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
+      > 0.5
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{`{{ $labels.instance }}`}} fsync durations are high
+      summary: high fsync durations
+  - alert: HighCommitDurations
+    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
+      > 0.25
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{`{{ $labels.instance }}`}} commit durations are high
+      summary: high commit durations
+{{ end }}
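The InsufficientMembers expression is a quorum check. Worked example for a five-member cluster:

    # members = 5, quorum = 3, tolerable failures = 2
    # threshold: count(up == 0) > 5/2 - 1 = 1.5
    # -> fires once 2 members are down: the cluster still serves,
    #    but one more failure would lose quorum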
@@ -1,111 +1,2 @@
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kube-etcd.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ### General cluster availability ###
-
-    # alert if another failed peer will result in an unavailable cluster
-    ALERT InsufficientPeers
-      IF count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
-      FOR 3m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "Etcd cluster small",
-        description = "If one more etcd peer goes down the cluster will be unavailable",
-      }
-
-    ### HTTP requests alerts ###
-
-    # alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
-    ALERT HighNumberOfFailedHTTPRequests
-      IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.01
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of HTTP requests are failing",
-        description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}",
-      }
-
-    # alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
-    ALERT HighNumberOfFailedHTTPRequests
-      IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.05
-      FOR 5m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "a high number of HTTP requests are failing",
-        description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}",
-      }
-
-    # alert if 50% of requests get a 4xx response
-    ALERT HighNumberOfFailedHTTPRequests
-      IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code=~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.5
-      FOR 10m
-      LABELS {
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "a high number of HTTP requests are failing",
-        description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed with 4xx responses on etcd instance {{`{{ $labels.instance }}`}}",
-      }
-
-    # alert if the 99th percentile of HTTP requests take more than 150ms
-    ALERT HTTPRequestsSlow
-      IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "slow HTTP requests",
-        description = "on ectd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method }}`}} are slow",
-      }
-
-    ALERT K8SApiServerEtcdAccessLatency
-      IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
-      FOR 15m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Access to etcd is slow",
-        description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
-      }
-
-    ### etcd proposal alerts ###
-
-    # alert if there are several failed proposals within an hour
-    ALERT HighNumberOfFailedProposals
-      IF increase(etcd_server_proposal_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "a high number of failed proposals within the etcd cluster are happening",
-        description = "etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal failures within the last hour",
-      }
-
-    ### etcd disk io latency alerts
-
-    # alert if 99th percentile of fsync durations is higher than 500ms
-    ALERT HighFsyncDurations
-      IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
-      FOR 10m
-      LABELS {
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "high fsync durations",
-        description = "ectd instance {{`{{ $labels.instance }}`}} fync durations are high",
-      }
+# default rules are in templates/etcd3.rules.yaml
+# prometheusRules: {}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart singleton for kube-state-metrics
 name: exporter-kube-scheduler
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
 {{ tpl $val $root | indent 4}}
 {{- end }}
+{{ else }}
+  kube-scheduler.rules: |-
+{{- include "kube-scheduler.rules.yaml.tpl" . | indent 4}}
+{{ end }}
@@ -0,0 +1,60 @@ (new file)
+{{ define "kube-scheduler.rules.yaml.tpl" }}
+groups:
+- name: kube-scheduler.rules
+  rules:
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.99"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.9"
+  - record: cluster:scheduler_binding_latency_seconds:quantile
+    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+      BY (le, cluster)) / 1e+06
+    labels:
+      quantile: "0.5"
+  - alert: K8SSchedulerDown
+    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: There is no running K8S scheduler. New pods are not being assigned
+        to nodes.
+      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
+      summary: Scheduler is down
+{{ end }}
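The recording rules above also carry the unit migration: the scheduler exports latency histograms in microseconds, so each quantile is divided by 1e+06 and recorded under a `_seconds` name. Worked example (numbers hypothetical):

    # histogram_quantile(0.99, ...) = 12000   (microseconds)
    # recorded: 12000 / 1e+06 = 0.012 seconds
    cluster:scheduler_e2e_scheduling_latency_seconds:quantile{quantile="0.99"}  0.012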
@@ -1,40 +1,2 @@
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kube-controller-manager.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ### Scheduling latency ###
-
-    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
-      histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
-      histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
-      histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
-
-    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
-      histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
-      histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
-      histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
-
-    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
-      histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
-      histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
-    cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
-      histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
-
-    ALERT K8SSchedulerDown
-      IF absent(up{job="{{ template "fullname" . }}"}) or (count by(cluster) (up{job="{{ template "fullname" . }}"} == 1) == 0)
-      FOR 5m
-      LABELS {
-        service = "k8s",
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Scheduler is down",
-        description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
-      }
+# default rules are in templates/kube-scheduler.rules.yaml
+# prometheusRules: {}
@@ -3,5 +3,7 @@ description: A Helm chart singleton for kube-state-metrics
 name: exporter-kube-state
 version: 0.1.3
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -15,6 +15,7 @@ metadata:
 rules:
 - apiGroups: [""]
   resources:
+  - namespaces
   - nodes
   - pods
   - services
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  kube-state-metrics.rules: |-
+{{- include "kube-state-metrics.rules.yaml.tpl" . | indent 4}}
+{{ end }}
@@ -0,0 +1,57 @@ (new file)
+{{ define "kube-state-metrics.rules.yaml.tpl" }}
+groups:
+- name: kube-state-metrics.rules
+  rules:
+  - alert: DeploymentGenerationMismatch
+    expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Observed deployment generation does not match expected one for
+        deployment {{`{{$labels.namespaces}}`}}{{`{{$labels.deployment}}`}}
+  - alert: DeploymentReplicasNotUpdated
+    expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+      or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+      unless (kube_deployment_spec_paused == 1)
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Replicas are not updated and available for deployment {{`{{$labels.namespaces}}`}}/{{`{{$labels.deployment}}`}}
+  - alert: DaemonSetRolloutStuck
+    expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+      * 100 < 100
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      description: Only {{`{{$value}}`}}% of desired pods scheduled and ready for daemon
+        set {{`{{$labels.namespaces}}`}}/{{`{{$labels.daemonset}}`}}
+  - alert: K8SDaemonSetsNotScheduled
+    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+      > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: A number of daemonsets are not scheduled.
+      summary: Daemonsets are not scheduled correctly
+  - alert: DaemonSetsMissScheduled
+    expr: kube_daemonset_status_number_misscheduled > 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: A number of daemonsets are running where they are not supposed
+        to run.
+      summary: Daemonsets are not scheduled correctly
+  - alert: PodFrequentlyRestarting
+    expr: increase(kube_pod_container_status_restarts[1h]) > 5
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Pod {{`{{$labels.namespaces}}`}}/{{`{{$labels.pod}}`}} was restarted {{`{{$value}}`}}
+        times within the last hour
+{{ end }}
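DeploymentReplicasNotUpdated shows PromQL set operators doing the control logic: `or` unions the two lag conditions, and `unless` subtracts any deployment whose series also matches the paused check. Schematically:

    # (updated lag OR available lag) UNLESS paused
    # `unless` keeps elements of the left vector that have no
    # label-matching element on the right, so paused deployments never fire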
@@ -9,7 +9,7 @@ metadata:
     prometheus: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 spec:
-  jobLabel: {{ template "name" . }}
+  jobLabel: {{ template "fullname" . }}
   selector:
     matchLabels:
       app: {{ template "name" . }}
@@ -6,7 +6,7 @@ rbacEnable: true
 kube_state_metrics:
   image:
     repository: gcr.io/google_containers/kube-state-metrics
-    tag: v1.0.1
+    tag: v1.1.0
     pullPolicy: IfNotPresent
   service:
     name: kube-state-metrics
@@ -25,37 +25,6 @@ addon_resizer:
   requests:
     cpu: 100m
     memory: 30Mi
-ruleFiles:
-  kube-state.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ALERT K8SNodeNotReady
-      IF kube_node_status_condition{condition="Ready", status="true"} == 0
-      FOR 1h
-      LABELS {
-        service = "k8s",
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Node status is NotReady",
-        description = "The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API, or has set itself to NotReady, for more than an hour",
-      }
-
-    ALERT K8SManyNodesNotReady
-      IF
-        count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
-      AND
-        (
-          count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0)
-        /
-          count by (cluster) (kube_node_status_condition{condition="Ready", status="true"})
-        ) > 0.2
-      FOR 1m
-      LABELS {
-        service = "k8s",
-        severity = "critical",
-      }
-      ANNOTATIONS {
-        summary = "Many K8s nodes are Not Ready",
-        description = "{{`{{ $value }}`}} K8s nodes (more than 10% of cluster {{`{{ $labels.cluster }}`}}) are in the NotReady state.",
-      }
+
+# default rules are in templates/kube-state-metrics.rules.yaml
+# prometheusRules: {}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: exporter-kubelets
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  kubelet.rules: |-
+{{- include "kubelet.rules.yaml.tpl" . | indent 4}}
+{{ end }}

helm/exporter-kubelets/templates/kubelet.rules.yaml (new file, +49)
@@ -0,0 +1,49 @@
+{{ define "kubelet.rules.yaml.tpl" }}
+groups:
+- name: kubelet.rules
+  rules:
+  - alert: K8SNodeNotReady
+    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      description: The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API,
+        or has set itself to NotReady, for more than an hour
+      summary: Node status is NotReady
+  - alert: K8SManyNodesNotReady
+    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
+      > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
+      0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{`{{ $value }}`}}% of Kubernetes nodes are not ready'
+  - alert: K8SKubeletDown
+    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets.
+  - alert: K8SKubeletDown
+    expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
+      * 100 > 1
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets, or all Kubelets
+        have disappeared from service discovery.
+      summary: Many Kubelets cannot be scraped
+  - alert: K8SKubeletTooManyPods
+    expr: kubelet_running_pod_count > 100
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close
+        to the limit of 110
+      summary: Kubelet is close to pod limit
+{{ end }}
@@ -1,164 +1,2 @@
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kubelets.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ### Container resources ###
-
-    cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_spec_memory_limit_bytes{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:spec_cpu_shares =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_spec_cpu_shares{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:cpu_usage:rate =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          irate(
-            container_cpu_usage_seconds_total{container_name!=""}[5m]
-          ),
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_usage:bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_memory_usage_bytes{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_working_set:bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_memory_working_set_bytes{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_rss:bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_memory_rss{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_cache:bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_memory_cache{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:disk_usage:bytes =
-      sum by (cluster,namespace,controller,pod_name,container_name) (
-        label_replace(
-          container_disk_usage_bytes{container_name!=""},
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_pagefaults:rate =
-      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
-        label_replace(
-          irate(
-            container_memory_failures_total{container_name!=""}[5m]
-          ),
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    cluster_namespace_controller_pod_container:memory_oom:rate =
-      sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
-        label_replace(
-          irate(
-            container_memory_failcnt{container_name!=""}[5m]
-          ),
-          "controller", "$1",
-          "pod_name", "^(.*)-[a-z0-9]+"
-        )
-      )
-
-    ### Cluster resources ###
-
-    cluster:memory_allocation:percent =
-      100 * sum by (cluster) (
-        container_spec_memory_limit_bytes{pod_name!=""}
-      ) / sum by (cluster) (
-        machine_memory_bytes
-      )
-
-    cluster:memory_used:percent =
-      100 * sum by (cluster) (
-        container_memory_usage_bytes{pod_name!=""}
-      ) / sum by (cluster) (
-        machine_memory_bytes
-      )
-
-    cluster:cpu_allocation:percent =
-      100 * sum by (cluster) (
-        container_spec_cpu_shares{pod_name!=""}
-      ) / sum by (cluster) (
-        container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
-      )
-
-    ALERT K8SNodeDown
-      IF up{job="kubelet"} == 0
-      FOR 1h
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Kubelet cannot be scraped",
-        description = "Prometheus could not scrape a {{`{{ $labels.job }}`}} for more than one hour",
-      }
-
-    ALERT K8SKubeletDown
-      IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
-      FOR 1h
-      LABELS {
-        service = "k8s",
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "Many Kubelets cannot be scraped",
-        description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
-      }
-
-    # Some verbs excluded because they are expected to be long-lasting:
-    # WATCHLIST is long-poll, CONNECT is `kubectl exec`.
-
-    ALERT K8SKubeletTooManyPods
-      IF kubelet_running_pod_count > 100
-      LABELS {
-        service = "k8s",
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Kubelet is close to pod limit",
-        description = "Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close to the limit of 110",
-      }
+# default rules are in templates/kubelet.rules.yaml
+# prometheusRules: {}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: exporter-kubernetes
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  kubernetes.rules: |-
+{{- include "kubernetes.rules.yaml.tpl" . | indent 4}}
+{{ end }}

helm/exporter-kubernetes/templates/kubernetes.rules.yaml (new file, +88)
@@ -0,0 +1,88 @@
+{{ define "kubernetes.rules.yaml.tpl" }}
+groups:
+- name: kubernetes.rules
+  rules:
+  - record: pod_name:container_memory_usage_bytes:sum
+    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+      (pod_name)
+  - record: pod_name:container_spec_cpu_shares:sum
+    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+  - record: pod_name:container_cpu_usage:sum
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+      BY (pod_name)
+  - record: pod_name:container_fs_usage_bytes:sum
+    expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+  - record: namespace:container_memory_usage_bytes:sum
+    expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+  - record: namespace:container_spec_cpu_shares:sum
+    expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+  - record: namespace:container_cpu_usage:sum
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+      BY (namespace)
+  - record: cluster:memory_usage:ratio
+    expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+      (cluster) / sum(machine_memory_bytes) BY (cluster)
+  - record: cluster:container_spec_cpu_shares:ratio
+    expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+      / sum(machine_cpu_cores)
+  - record: cluster:container_cpu_usage:ratio
+    expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+      / sum(machine_cpu_cores)
+  - record: apiserver_latency_seconds:quantile
+    expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
+    labels:
+      quantile: "0.99"
+  - record: apiserver_latency:quantile_seconds
+    expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
+    labels:
+      quantile: "0.9"
+  - record: apiserver_latency_seconds:quantile
+    expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+      1e+06
+    labels:
+      quantile: "0.5"
+  - alert: APIServerLatencyHigh
+    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+      > 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
+        for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
+  - alert: APIServerLatencyHigh
+    expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+      > 4
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
+        for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
+  - alert: APIServerErrorsHigh
+    expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+      * 100 > 2
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: API server returns errors for {{`{{ $value }}`}}% of requests
+  - alert: APIServerErrorsHigh
+    expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+      * 100 > 5
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: API server returns errors for {{`{{ $value }}`}}% of requests
+  - alert: K8SApiserverDown
+    expr: absent(up{job="kubernetes"} == 1)
+    for: 20m
+    labels:
+      severity: critical
+    annotations:
+      description: No API servers are reachable or all have disappeared from service
+        discovery
+{{ end }}
@@ -1,30 +1,2 @@
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-ruleFiles:
-  kubernetes.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    ALERT K8SApiserverDown
-      IF up{job="kubernetes"} == 0
-      FOR 15m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "API server unreachable",
-        description = "An API server could not be scraped.",
-      }
-
-    # Disable for non HA kubernetes setups.
-    ALERT K8SApiserverDown
-      IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
-      FOR 5m
-      LABELS {
-        service = "k8s",
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "API server unreachable",
-        description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
-      }
+# default rules are in templates/kubernetes.rules.yaml
+# prometheusRules: {}
@@ -1,7 +1,9 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: exporter-node
-version: 0.1.1
+version: 0.1.2
 maintainers:
-- name: Cloud Posse LLC
-  email: hello@cloudposse.com
+- name: Michael Goodness
+  email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  node.rules: |-
+{{- include "node.rules.yaml.tpl" . | indent 4}}
+{{ end }}

helm/exporter-node/templates/node.rules.yaml (new file, +46)
@@ -0,0 +1,46 @@
+{{ define "node.rules.yaml.tpl" }}
+groups:
+- name: node.rules
+  rules:
+  - record: instance:node_cpu:rate:sum
+    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+      BY (instance)
+  - record: instance:node_filesystem_usage:sum
+    expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+      BY (instance)
+  - record: instance:node_network_receive_bytes:rate:sum
+    expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+  - record: instance:node_network_transmit_bytes:rate:sum
+    expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+  - record: instance:node_cpu:ratio
+    expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+      GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+  - record: cluster:node_cpu:sum_rate5m
+    expr: sum(rate(node_cpu{mode!="idle"}[5m]))
+  - record: cluster:node_cpu:ratio
+    expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
+  - alert: NodeExporterDown
+    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: Prometheus could not scrape a node-exporter for more than 10m,
+        or node-exporters have disappeared from discovery
+  - alert: NodeDiskRunningFull
+    expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
+    for: 30m
+    labels:
+      severity: warning
+    annotations:
+      description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
+        full within the next 24 hours (mounted at {{`{{$labels.mountpoint}}`}})
+  - alert: NodeDiskRunningFull
+    expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
+        full within the next 2 hours (mounted at {{`{{$labels.mountpoint}}`}})
+{{ end }}
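NodeDiskRunningFull is linear extrapolation: predict_linear(node_filesystem_free[6h], 3600 * 24) fits the last six hours of free-space samples and projects 24 hours ahead; a negative projection means the fit crosses empty. Worked example (numbers hypothetical):

    # free space falling at 2 GiB/h with 30 GiB left:
    # projection = 30 GiB - 2 GiB/h * 24 h = -18 GiB < 0  -> alert fires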
|
|

@@ -9,7 +9,7 @@ metadata:
     prometheus: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 spec:
-  jobLabel: {{ template "name" . }}
+  jobLabel: {{ template "fullname" . }}
   selector:
     matchLabels:
       app: {{ template "name" . }}
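
[Editor's note: jobLabel names the Service label from which the operator takes
the Prometheus job name, falling back to the Service's own name when that label
is absent. Moving from "name" to "fullname" keeps the job value aligned with
the release-qualified Service name that the generated rules select on. A
sketch, assuming a hypothetical release called "mon":]

    # rendered ServiceMonitor fragment (illustrative)
    spec:
      jobLabel: mon-exporter-node
      # matching alert selector in node.rules:
      # absent(up{job="mon-exporter-node"} == 1)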

@@ -3,7 +3,7 @@
 replicaCount: 1
 image:
   repository: quay.io/prometheus/node-exporter
-  tag: v0.14.0
+  tag: v0.15.2
   pullPolicy: IfNotPresent
 service:
   type: ClusterIP

@@ -16,94 +16,6 @@ resources:
   requests:
     cpu: 100m
     memory: 30Mi
-ruleFiles:
-  node.rules: |-
-    # NOTE: These rules were kindly contributed by the SoundCloud engineering team.
-
-    cluster:node_cpu_use:percent =
-      100 * sum by (cluster) (
-        rate(node_cpu{mode!="idle"}[5m])
-      ) / sum by (cluster) (
-        machine_cpu_cores
-      )
-
-    ALERT K8SKubeletNodeExporterDown
-      IF up{job="{{ template "fullname" . }}"} == 0
-      FOR 15m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Kubelet node_exporter cannot be scraped",
-        description = "Prometheus could not scrape a {{`{{ $labels.job }}`}} for more than one hour.",
-      }
-
-    ALERT K8SConntrackTableFull
-      IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
-      FOR 10m
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Number of tracked connections is near the limit",
-        description = "The nf_conntrack table is {{`{{ $value }}`}}% full.",
-      }
-
-    ALERT K8SConntrackTableFull
-      IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
-      LABELS {
-        service = "k8s",
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "Number of tracked connections is near the limit",
-        description = "The nf_conntrack table is {{`{{ $value }}`}}% full.",
-      }
-
-    # To catch the conntrack sysctl de-tuning when it happens
-    ALERT K8SConntrackTuningMissing
-      IF node_nf_conntrack_udp_timeout > 10
-      FOR 10m
-      LABELS {
-        service = "k8s",
-        severity = "warning",
-      }
-      ANNOTATIONS {
-        summary = "Node does not have the correct conntrack tunings",
-        description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
-      }
-
-    ALERT K8SNodeOutOfDisk
-      IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
-      LABELS {
-        service = "k8s",
-        severity = "critical"
-      }
-      ANNOTATIONS {
-        summary = "Node ran out of disk space.",
-        description = "{{`{{ $labels.node }}`}} has run out of disk space.",
-      }
-
-    ALERT K8SNodeMemoryPressure
-      IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Node is under memory pressure.",
-        description = "{{`{{ $labels.node }}`}} is under memory pressure.",
-      }
-
-    ALERT K8SNodeDiskPressure
-      IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
-      LABELS {
-        service = "k8s",
-        severity = "warning"
-      }
-      ANNOTATIONS {
-        summary = "Node is under disk pressure.",
-        description = "{{`{{ $labels.node }}`}} is under disk pressure.",
-      }
+
+# default rules are in templates/node.rules.yaml
+# prometheusRules: {}
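
[Editor's note: the removed rules use the Prometheus 1.x syntax
(ALERT/IF/LABELS/ANNOTATIONS); the new defaults in templates/node.rules.yaml
use the 2.0 YAML groups format. As an illustration, not part of the commit,
the removed K8SNodeOutOfDisk alert translates roughly to the following plain
2.0 rule file (outside Helm, so without the brace escaping; the group name is
hypothetical):]

    groups:
    - name: k8s-node.rules
      rules:
      - alert: K8SNodeOutOfDisk
        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
        labels:
          service: k8s
          severity: critical
        annotations:
          summary: Node ran out of disk space.
          description: '{{ $labels.node }} has run out of disk space.'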

@@ -6,4 +6,4 @@ maintainers:
 name: grafana
 sources:
 - https://github.com/coreos/prometheus-operator
-version: 0.0.4
+version: 0.0.5

@@ -9,7 +9,9 @@ metadata:
   name: {{ template "grafana.server.fullname" . }}
 data:
 {{- if .Values.serverDashboardFiles }}
-{{ toYaml .Values.serverDashboardFiles | indent 2 }}
+{{ toYaml .Values.serverDashboardFiles | indent 2 }}
+{{ else }}
+{{- include "grafana-dashboards.yaml.tpl" . | indent 2}}
 {{- end }}
 {{- if .Values.dataSource }}
 {{ toYaml .Values.dataSource | indent 2 }}

helm/grafana/templates/grafana-dashboards.yaml (new file, 6305 lines)
File diff suppressed because it is too large.

@@ -42,7 +42,7 @@ service:
 ##
 image:
   repository: grafana/grafana
-  tag: 4.4.1
+  tag: 4.6.3

 grafanaWatcher:
   repository: quay.io/coreos/grafana-watcher

@@ -86,4 +86,4 @@ ingress:
 # Set datasource in beginning
 dataSource: {}

-serverDashboardFiles: {}
+serverDashboardFiles: {}

helm/hack/sync_kube_prometheus.py (new executable file, 73 lines)
@@ -0,0 +1,73 @@
#!/usr/bin/env python
import os
import re
from ruamel import yaml

def escape(s):
    return s.replace("{{","{{`{{").replace("}}","}}`}}")

def get_header(file_name):
    return "{{ define \"" + file_name + ".tpl\" }}\n"

#####
## Step 1 - Sync prometheus alert rules, create template file
####
charts = [
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
     'destination': 'helm/alertmanager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source': 'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
     'destination': 'helm/exporter-kube-controller-manager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
     'destination': 'helm/exporter-kube-scheduler/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
     'destination': 'helm/exporter-kube-state/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
     'destination': 'helm/exporter-node/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
     'destination': 'helm/prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
     'destination': 'helm/exporter-kube-etcd/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
     'destination': 'helm/kube-prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
     'destination': 'helm/exporter-kubelets/', 'job_replace_by': 'kubelet'},
    {'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
     'destination': 'helm/exporter-kubernetes/', 'job_replace_by': 'kubernetes'},
]

# read the rules, create a new template file
for chart in charts:

    _, name = os.path.split(chart['source'])
    lines = get_header(name)

    f = open(chart['source'], 'r')
    lines += escape(f.read())
    lines = re.sub("job=\"(.*?)\"", "job=\"" + chart['job_replace_by'] + "\"", lines) #replace the job name by chart variable

    lines += "{{ end }}" # footer

    new_f = "{}/templates/{}".format(chart['destination'], name)

    # recreate the file
    with open(new_f, 'w') as f:
        f.write(lines)

    print "Generated {}".format(new_f)

######
## Step 2 - Parse grafana dashboards, create a template file
######

with open('contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml', 'r') as s:
    data = yaml.load(s, Loader=yaml.RoundTripLoader)['data']

# the prometheus datasource is not required now
del data['prometheus-datasource.json']

data_s = get_header("grafana-dashboards.yaml.tpl")
data_s += escape(yaml.dump(data, Dumper=yaml.RoundTripDumper))
data_s += "{{ end }}" # footer

with open('helm/grafana/templates/grafana-dashboards.yaml', 'w') as f:
    f.write(data_s)
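
[Editor's note: a quick illustration of the two rewrites the script applies;
the input lines are hypothetical. escape() wraps Prometheus' template braces so
Helm re-emits them literally, and the re.sub swaps whatever job the upstream
rule targets for the chart's fullname template, which Helm then renders:]

    # input line from contrib/kube-prometheus (hypothetical):
    expr: absent(up{job="node-exporter"} == 1)
    # line written to helm/<chart>/templates/ after escape() + re.sub:
    expr: absent(up{job="{{ template "fullname" . }}"} == 1)
    # a label reference survives Helm templating via escape():
    description: Prometheus could not scrape {{ $labels.job }}
    # becomes:
    description: Prometheus could not scrape {{`{{ $labels.job }}`}}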

@@ -4,6 +4,8 @@ engine: gotpl
 maintainers:
 - name: Michael Goodness
   email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
 name: kube-prometheus
 sources:
 - https://github.com/coreos/prometheus-operator

@@ -1,46 +1,31 @@
 dependencies:
 - name: alertmanager
-  version: 0.0.5
+  version: 0.0.6
   #e2e-repository: file://../alertmanager
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

 - name: prometheus-operator
   version: 0.0.7
   #e2e-repository: file://../prometheus-operator
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

-- name: grafana
-  version: 0.0.4
-  #e2e-repository: file://../grafana
-  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
-
 - name: prometheus
-  version: 0.0.5
+  version: 0.0.8
   #e2e-repository: file://../prometheus
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

-- name: exporter-kube-api
-  version: 0.1.1
-  #e2e-repository: file://../exporter-kube-api
-  repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
-
 - name: exporter-kube-controller-manager
-  version: 0.1.1
+  version: 0.1.2
   #e2e-repository: file://../exporter-kube-controller-manager
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

 - name: exporter-kube-dns
-  version: 0.1.1
+  version: 0.1.2
   #e2e-repository: file://../exporter-kube-dns
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

 - name: exporter-kube-etcd
-  version: 0.1.1
+  version: 0.1.2
   #e2e-repository: file://../exporter-kube-etcd
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/

 - name: exporter-kube-scheduler
-  version: 0.1.1
+  version: 0.1.2
   #e2e-repository: file://../exporter-kube-scheduler
   repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
|||
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
||||
|
||||
- name: exporter-kubelets
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
#e2e-repository: file://../exporter-kubelets
|
||||
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
||||
|
||||
- name: exporter-kubernetes
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
#e2e-repository: file://../exporter-kubernetes
|
||||
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
||||
|
||||
- name: exporter-node
|
||||
version: 0.1.1
|
||||
version: 0.1.2
|
||||
#e2e-repository: file://../exporter-node
|
||||
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
||||
condition: deployExporterNode
|
||||
|
||||
- name: grafana
|
||||
version: 0.0.4
|
||||
version: 0.0.5
|
||||
#e2e-repository: file://../grafana
|
||||
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
|
||||
condition: deployGrafana
|
||||
|
|
|

@@ -9,8 +9,13 @@ metadata:
     release: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 data:
+{{- if .Values.prometheusRules }}
 {{- $root := . }}
-{{- range $key, $val := .Values.ruleFiles }}
+{{- range $key, $val := .Values.prometheusRules }}
   {{ $key }}: |-
-{{ tpl $val $root | indent 4}}
+{{ $val | indent 4}}
 {{- end }}
+{{ else }}
+  general.rules: |-
+{{- include "general.rules.yaml.tpl" . | indent 4}}
+{{ end }}

helm/kube-prometheus/templates/general.rules.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
{{ define "general.rules.yaml.tpl" }}
groups:
- name: general.rules
  rules:
  - alert: TargetDown
    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{`{{ $value }}`}}% of {{`{{ $labels.job }}`}} targets are down.'
      summary: Targets are down
  - alert: DeadMansSwitch
    expr: vector(1)
    labels:
      severity: none
    annotations:
      description: This is a DeadMansSwitch meant to ensure that the entire Alerting
        pipeline is functional.
      summary: Alerting DeadMansSwitch
  - record: fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
        will exhaust in file/socket descriptors within the next 4 hours'
      summary: file descriptors soon exhausted
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
        will exhaust in file/socket descriptors within the next hour'
      summary: file descriptors soon exhausted
{{ end }}
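
[Editor's note: a worked illustration of the FdExhaustionClose pair, with
made-up numbers. predict_linear fits a linear trend over the sampled window
and extrapolates it forward:]

    # fd_utilization = 0.80, rising 0.05 per hour over the last hour:
    predict_linear(fd_utilization[1h], 3600 * 4)  # = 0.80 + 4 * 0.05 = 1.00, not > 1, no alert
    # rising 0.06 per hour instead:
    predict_linear(fd_utilization[1h], 3600 * 4)  # = 0.80 + 4 * 0.06 = 1.04, warning after 10m
    # the [10m]/3600 variant catches fast leaks the 4-hour horizon would flag too late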

@@ -11,18 +11,18 @@ alertmanager:
   config:
     global:
       resolve_timeout: 5m
-
-    receivers:
-    - name: webhook
-      webhook_configs:
-      - url: 'http://alertmanagerwh:30500/'
-
     route:
       group_by: ['job']
-      group_interval: 5m
       group_wait: 30s
-      receiver: webhook
+      group_interval: 5m
+      repeat_interval: 12h
+      receiver: 'null'
+      routes:
+      - match:
+          alertname: DeadMansSwitch
+        receiver: 'null'
+    receivers:
+    - name: 'null'

   ## External URL at which Alertmanager will be reachable
   ##
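
[Editor's note: with this default every alert, including DeadMansSwitch, lands
in the 'null' receiver, so a fresh install pages nobody. DeadMansSwitch fires
continuously by design; pointing its route at an external dead-man endpoint
turns silence into a signal that the alerting pipeline broke. A sketch, with a
placeholder receiver name and URL:]

    route:
      receiver: 'null'
      routes:
      - match:
          alertname: DeadMansSwitch
        receiver: deadman
    receivers:
    - name: 'null'
    - name: deadman
      webhook_configs:
      - url: 'http://deadman.example.internal:8080/ping'   # placeholder endpoint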

@@ -32,7 +32,7 @@ alertmanager:
   ##
   image:
     repository: quay.io/prometheus/alertmanager
-    tag: v0.5.1
+    tag: v0.9.1

   ingress:
     ## If true, Alertmanager Ingress will be created

@@ -147,7 +147,7 @@ prometheus:
   ##
   image:
     repository: quay.io/prometheus/prometheus
-    tag: v1.5.2
+    tag: v2.0.0

   ingress:
     ## If true, Prometheus Ingress will be created

@@ -337,3 +337,6 @@ prometheus:
   #    requests:
   #      storage: 16Gi
   #  selector: {}
+
+  # default rules are in templates/general.rules.yaml
+  # prometheusRules: {}

@@ -4,7 +4,9 @@ engine: gotpl
 maintainers:
 - name: Michael Goodness
   email: mgoodness@gmail.com
+- name: Giancarlo Rubio
+  email: gianrubio@gmail.com
 name: prometheus-operator
 sources:
 - https://github.com/coreos/prometheus-operator
-version: 0.0.7
+version: 0.0.8

@@ -83,7 +83,7 @@ Parameter | Description | Default
 `global.hyperkube.tag` | Hyperkube image tag | `v1.7.6_coreos.0`
 `global.hyperkube.pullPolicy` | Hyperkube image pull policy | `IfNotPresent`
 `image.repository` | Image | `quay.io/coreos/prometheus-operator`
-`image.tag` | Image tag | `v0.13.0`
+`image.tag` | Image tag | `v0.15.0`
 `image.pullPolicy` | Image pull policy | `IfNotPresent`
 `kubeletService.enable` | If true, the operator will create a service for scraping kubelets | `true`
 `kubeletService.namespace` | The namespace in which the kubelet service should be created | `kube-system`

@@ -26,4 +26,4 @@ data:
     - {{ .Release.Namespace | quote }}
   endpoints:
   - port: http
-    interval: 30s
+    interval: 30s

@@ -22,7 +22,7 @@ configmapReload:
 ##
 image:
   repository: quay.io/coreos/prometheus-operator
-  tag: v0.14.1
+  tag: v0.15.0
   pullPolicy: IfNotPresent

 ## If enabled, prometheus-operator will create a service for scraping kubelets

@@ -2,9 +2,9 @@ apiVersion: v1
 description: Prometheus instance created by the CoreOS Prometheus Operator
 engine: gotpl
 maintainers:
-- name: Michael Goodness
+- name: Giancarlo Rubio
   email: mgoodness@gmail.com
 name: prometheus
 sources:
 - https://github.com/coreos/prometheus-operator
-version: 0.0.5
+version: 0.0.8

@@ -55,6 +55,7 @@ Parameter | Description | Default
 `ingress.tls` | TLS configuration for Prometheus Ingress | `[]`
 `nodeSelector` | Node labels for pod assignment | `{}`
 `paused` | If true, the Operator won't process any Prometheus configuration changes | `false`
+`prometheusRules` | Prometheus rules | `[templates/prometheus.rules.yaml](templates/prometheus.rules.yaml)`
 `replicaCount` | Number of Prometheus replicas desired | `1`
 `resources` | Pod resource requests & limits | `{}`
 `retention` | How long to retain metrics | `24h`

helm/prometheus/templates/configmap.yaml (new file, 21 lines)
@@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: "prometheus"
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    heritage: {{ .Release.Service }}
    prometheus: {{ .Release.Name }}
    release: {{ .Release.Name }}
  name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
  {{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
  prometheus.rules: |-
{{- include "prometheus.rules.yaml.tpl" . | indent 4}}
{{ end }}

helm/prometheus/templates/prometheus.rules.yaml (new file, 73 lines)
@@ -0,0 +1,73 @@
{{ define "prometheus.rules.yaml.tpl" }}
|
||||
groups:
|
||||
- name: prometheus.rules
|
||||
rules:
|
||||
- alert: PrometheusConfigReloadFailed
|
||||
expr: prometheus_config_last_reload_successful == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{
|
||||
$labels.pod}}`}}
|
||||
- alert: PrometheusErrorSendingAlerts
|
||||
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
|
||||
> 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
|
||||
$labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
|
||||
- alert: PrometheusErrorSendingAlerts
|
||||
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
|
||||
> 0.03
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
|
||||
$labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
expr: prometheus_notifications_alertmanagers_discovered < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected
|
||||
to any Alertmanagers
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
|
||||
reload failures over the last four hours.'
|
||||
summary: Prometheus has issues reloading data blocks from disk
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
|
||||
compaction failures over the last four hours.'
|
||||
summary: Prometheus has issues compacting sample blocks
|
||||
- alert: PrometheusTSDBWALCorruptions
|
||||
expr: tsdb_wal_corruptions_total > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead
|
||||
log (WAL).'
|
||||
summary: Prometheus write-ahead log is corrupted
|
||||
{{ end }}
|
|

@@ -10,7 +10,7 @@ metadata:
     prometheus: {{ .Release.Name }}
   name: {{ template "fullname" . }}
 spec:
-  jobLabel: {{ template "name" . }}
+  jobLabel: {{ template "fullname" . }}
   selector:
     matchLabels:
       app: {{ template "name" . }}

@@ -31,7 +31,7 @@ selfServiceMonitor: true
 ##
 image:
   repository: quay.io/prometheus/prometheus
-  tag: v1.7.1
+  tag: v2.0.0

 ## Labels to be added to the Prometheus
 ##

@@ -248,3 +248,6 @@ storageSpec: {}
 #    requests:
 #      storage: 50Gi
 #  selector: {}
+
+# default rules are in templates/prometheus.rules.yaml
+# prometheusRules: {}
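
[Editor's note: with the move to Prometheus 2.0 the on-disk format changes, so
persisting data through storageSpec is worth spelling out. A minimal sketch,
assuming the operator's StorageSpec fields of this era (class, resources,
selector); the class name and size are placeholders, not chart defaults:]

    storageSpec:
      class: standard
      resources:
        requests:
          storage: 50Gi
      selector: {}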