
Merge pull request from gianrubio/helm-prometheus-2.0

Bump prometheus chart to v2.0
Frederic Branczyk 2018-01-03 16:33:45 +01:00 committed by GitHub
commit 39856066c3
GPG key ID: 4AEE18F83AFDEB23
66 changed files with 7185 additions and 775 deletions

.gitignore vendored

@ -8,3 +8,4 @@ requirements.lock
.DS_Store
__pycache__
.env/
.history/


@ -4,7 +4,9 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: alertmanager
sources:
- https://github.com/coreos/prometheus-operator
version: 0.0.5
version: 0.0.6


@ -46,13 +46,14 @@ Parameter | Description | Default
`config` | Alertmanager configuration directives | `{}`
`externalUrl` | External URL at which Alertmanager will be reachable | `""`
`image.repository` | Image | `quay.io/prometheus/alertmanager`
`image.tag` | Image tag | `v0.5.1`
`image.tag` | Image tag | `v0.12.0`
`ingress.enabled` | If true, Alertmanager Ingress will be created | `false`
`ingress.annotations` | Annotations for Alertmanager Ingress | `{}`
`ingress.fqdn` | Alertmanager Ingress fully-qualified domain name | `""`
`ingress.tls` | TLS configuration for Alertmanager Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Alertmanager configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/alertmanager.rules.yaml](templates/alertmanager.rules.yaml)`
`replicaCount` | Number of Alertmanager replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`service.annotations` | Annotations to be added to the Alertmanager Service | `{}`
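A minimal values override exercising the parameters documented above; the hostname, replica count, and resource figures are placeholders rather than chart defaults:

```yaml
# custom-alertmanager-values.yaml (illustrative)
image:
  repository: quay.io/prometheus/alertmanager
  tag: v0.12.0
replicaCount: 3
externalUrl: "https://alertmanager.example.com"
ingress:
  enabled: true
  fqdn: alertmanager.example.com
  annotations:
    kubernetes.io/ingress.class: nginx
  tls: []
nodeSelector: {}
resources:
  requests:
    cpu: 50m
    memory: 128Mi
service:
  annotations: {}
```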


@ -0,0 +1,32 @@
{{ define "alertmanager.rules.yaml.tpl" }}
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{`{{$labels.service}}`}}` are out of sync.
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace
}}`}}/{{`{{ $labels.pod}}`}}.
{{ end }}
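The backtick wrapping above (e.g. {{`{{ $labels.pod }}`}}) keeps Helm's own template pass from expanding Prometheus's runtime variables. Roughly, the same annotation before and after Helm renders the chart:

```yaml
# In the chart template:
description: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}}.
# In the rendered ConfigMap that Prometheus loads:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
```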


@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: "alertmanager"
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
heritage: {{ .Release.Service }}
prometheus: {{ .Release.Name }}
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
alertmanager.rules: |-
{{- include "alertmanager.rules.yaml.tpl" . | indent 4}}
{{ end }}
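Setting `prometheusRules` in values takes the first branch above and replaces the bundled `alertmanager.rules` entirely; leaving it unset falls through to the included template. A sketch of a custom override (rule names and bodies are illustrative only):

```yaml
prometheusRules:
  custom-alertmanager.rules: |-
    groups:
    - name: custom-alertmanager.rules
      rules:
      - alert: AlertmanagerConfigReloadFailed
        expr: alertmanager_config_last_reload_successful == 0
        for: 30m
        labels:
          severity: critical
```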


@ -11,7 +11,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
alertmanager: {{ .Release.Name }}


@ -4,18 +4,18 @@
config:
global:
resolve_timeout: 5m
receivers:
- name: webhook
webhook_configs:
- url: 'http://alertmanagerwh:30500/'
route:
group_by: ['job']
group_interval: 5m
group_wait: 30s
receiver: webhook
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'
## Alertmanager template files to include
#
@ -49,7 +49,7 @@ selfServiceMonitor: true
##
image:
repository: quay.io/prometheus/alertmanager
tag: v0.7.1
tag: v0.12.0
## Labels to be added to the Alertmanager
##
@ -150,3 +150,6 @@ storageSpec: {}
# requests:
# storage: 50Gi
# selector: {}
# default rules are in templates/alertmanager.rules.yaml
# prometheusRules: {}
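With the new defaults every alert except DeadMansSwitch ends up at the 'null' receiver, i.e. nowhere. A hedged example of wiring in a real receiver while keeping the DeadMansSwitch route (the webhook URL is the placeholder from the old defaults):

```yaml
config:
  global:
    resolve_timeout: 5m
  route:
    group_by: ['job']
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 12h
    receiver: team-webhook
    routes:
    - match:
        alertname: DeadMansSwitch
      receiver: 'null'
  receivers:
  - name: 'null'
  - name: team-webhook
    webhook_configs:
    - url: 'http://alertmanagerwh:30500/'
```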


@ -1,21 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj


@ -1,7 +0,0 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-api
version: 0.1.1
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com


@ -1,16 +0,0 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "fullname" -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}


@ -1,21 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: {{ template "name" . }}
component: kube-api
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
name: {{ template "fullname" . }}
namespace: kube-system
spec:
clusterIP: None
ports:
- name: https-metrics
port: 443
protocol: TCP
targetPort: 443
selector:
k8s-app: kube-apiserver
type: ClusterIP


@ -1,29 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
component: kube-api
heritage: "{{ .Release.Service }}"
release: "{{ .Release.Name }}"
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}
component: kube-api
namespaceSelector:
matchNames:
- "kube-system"
endpoints:
- port: https-metrics
interval: 15s
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# Skip verification until we have resolved why the certificate validation
# for the kubelet on API server nodes fail.
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token


@ -1,93 +0,0 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kube-api.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
ALERT K8SApiServerLatency
IF histogram_quantile(
0.99,
sum without (instance,node,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubernetes apiserver latency is high",
description = "99th percentile Latency for {{`{{ $labels.verb }}`}} requests to the kube-apiserver is higher than 1s.",
}
### API latency ###
# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(
0.99,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(
0.9,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(
0.5,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
### File descriptor alerts
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS
{
severity = "warning"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS
{
summary = "file descriptors soon exhausted",
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
FOR 10m
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-controller-manager
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
kube-controller-manager.rules: |-
{{- include "kube-controller-manager.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,15 @@
{{ define "kube-controller-manager.rules.yaml.tpl" }}
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
{{ end }}


@ -1,17 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kube-controller-manager.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
ALERT K8SControllerManagerDown
IF absent(up{job="{{ template "fullname" . }}"}) or (count by(cluster) (up{job="{{ template "fullname" . }}"} == 1) == 0)
FOR 5m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Controller manager is down",
description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
}
# default rules are in templates/kube-controller-manager.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-dns
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-etcd
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{- end }}
{{ else }}
etcd3.rules: |-
{{- include "etcd3.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,125 @@
{{ define "etcd3.rules.yaml.tpl" }}
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="{{ template "fullname" . }}"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{`{{ $labels.instance }}`}} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="{{ template "fullname" . }}"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
on etcd instance {{`{{ $labels.instance }}`}}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
on etcd instance {{`{{ $labels.instance }}`}}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="{{ template "fullname" . }}",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{`{{ $labels.instance }}`}} gRPC requests to {{`{{ $labels.grpc_method
}}`}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
instance {{`{{ $labels.instance }}`}}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
instance {{`{{ $labels.instance }}`}}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method
}}`}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} member communication with
{{`{{ $labels.To }}`}} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} commit durations are high
summary: high commit durations
{{ end }}


@ -1,111 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kube-etcd.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### General cluster availability ###
# alert if another failed peer will result in an unavailable cluster
ALERT InsufficientPeers
IF count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
FOR 3m
LABELS {
severity = "critical"
}
ANNOTATIONS {
summary = "Etcd cluster small",
description = "If one more etcd peer goes down the cluster will be unavailable",
}
### HTTP requests alerts ###
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.01
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of HTTP requests are failing",
description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}",
}
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code!~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.05
FOR 5m
LABELS {
severity = "critical"
}
ANNOTATIONS
{
summary = "a high number of HTTP requests are failing",
description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}",
}
# alert if 50% of requests get a 4xx response
ALERT HighNumberOfFailedHTTPRequests
IF sum by(method) (rate(etcd_http_failed_total{job="{{ template "fullname" . }}", code=~"4[0-9]{2}"}[5m])) / sum by(method) (rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m])) > 0.5
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS
{
summary = "a high number of HTTP requests are failing",
description = "{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed with 4xx responses on etcd instance {{`{{ $labels.instance }}`}}",
}
# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
FOR 10m
LABELS
{
severity = "warning"
}
ANNOTATIONS {
summary = "slow HTTP requests",
description = "on ectd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method }}`}} are slow",
}
ALERT K8SApiServerEtcdAccessLatency
IF etcd_request_latencies_summary{quantile="0.99"} / 1e6 > 1.0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Access to etcd is slow",
description = "99th percentile latency for apiserver to access etcd is higher than 1s.",
}
### etcd proposal alerts ###
# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposal_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "a high number of failed proposals within the etcd cluster are happening",
description = "etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal failures within the last hour",
}
### etcd disk io latency alerts
# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
severity = "warning"
}
ANNOTATIONS {
summary = "high fsync durations",
description = "ectd instance {{`{{ $labels.instance }}`}} fync durations are high",
}
# default rules are in templates/etcd3.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-scheduler
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{- end }}
{{ else }}
kube-scheduler.rules: |-
{{- include "kube-scheduler.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,60 @@
{{ define "kube-scheduler.rules.yaml.tpl" }}
groups:
- name: kube-scheduler.rules
rules:
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: K8SSchedulerDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
{{ end }}
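Two things worth noting about the block above: the recording rules convert the scheduler's microsecond histograms to seconds by dividing by 1e+06 (a raw 99th percentile of 250000 µs is recorded as 0.25 s), and the alert's job selector is filled in by the chart's `fullname` helper. With a release named `mon` (a placeholder), the rendered alert would look roughly like:

```yaml
- alert: K8SSchedulerDown
  expr: absent(up{job="mon-exporter-kube-scheduler"} == 1)
  for: 5m
  labels:
    severity: critical
```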


@ -1,40 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kube-controller-manager.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### Scheduling latency ###
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
ALERT K8SSchedulerDown
IF absent(up{job="{{ template "fullname" . }}"}) or (count by(cluster) (up{job="{{ template "fullname" . }}"} == 1) == 0)
FOR 5m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Scheduler is down",
description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
}
# default rules are in templates/kube-scheduler.rules.yaml
# prometheusRules: {}


@ -3,5 +3,7 @@ description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-state
version: 0.1.3
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -15,6 +15,7 @@ metadata:
rules:
- apiGroups: [""]
resources:
- namespaces
- nodes
- pods
- services


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
kube-state-metrics.rules: |-
{{- include "kube-state-metrics.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,57 @@
{{ define "kube-state-metrics.rules.yaml.tpl" }}
groups:
- name: kube-state-metrics.rules
rules:
- alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
severity: warning
annotations:
description: Observed deployment generation does not match expected one for
deployment {{`{{$labels.namespaces}}`}}{{`{{$labels.deployment}}`}}
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: warning
annotations:
description: Replicas are not updated and available for deployment {{`{{$labels.namespaces}}`}}/{{`{{$labels.deployment}}`}}
- alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
* 100 < 100
for: 15m
labels:
severity: warning
annotations:
description: Only {{`{{$value}}`}}% of desired pods scheduled and ready for daemon
set {{`{{$labels.namespaces}}`}}/{{`{{$labels.daemonset}}`}}
- alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
> 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are running where they are not supposed
to run.
summary: Daemonsets are not scheduled correctly
- alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
description: Pod {{`{{$labels.namespaces}}`}}/{{`{{$labels.pod}}`}} was restarted {{`{{$value}}`}}
times within the last hour
{{ end }}


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -6,7 +6,7 @@ rbacEnable: true
kube_state_metrics:
image:
repository: gcr.io/google_containers/kube-state-metrics
tag: v1.0.1
tag: v1.1.0
pullPolicy: IfNotPresent
service:
name: kube-state-metrics
@ -25,37 +25,6 @@ addon_resizer:
requests:
cpu: 100m
memory: 30Mi
ruleFiles:
kube-state.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
ALERT K8SNodeNotReady
IF kube_node_status_condition{condition="Ready", status="true"} == 0
FOR 1h
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Node status is NotReady",
description = "The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API, or has set itself to NotReady, for more than an hour",
}
ALERT K8SManyNodesNotReady
IF
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
AND
(
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"} == 0)
/
count by (cluster) (kube_node_status_condition{condition="Ready", status="true"})
) > 0.2
FOR 1m
LABELS {
service = "k8s",
severity = "critical",
}
ANNOTATIONS {
summary = "Many K8s nodes are Not Ready",
description = "{{`{{ $value }}`}} K8s nodes (more than 10% of cluster {{`{{ $labels.cluster }}`}}) are in the NotReady state.",
}
# default rules are in templates/kube-state-metrics.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kubelets
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
kubelet.rules: |-
{{- include "kubelet.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,49 @@
{{ define "kubelet.rules.yaml.tpl" }}
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets.
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
{{ end }}


@ -1,164 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kubelets.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
### Container resources ###
cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_spec_memory_limit_bytes{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:spec_cpu_shares =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_spec_cpu_shares{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:cpu_usage:rate =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
irate(
container_cpu_usage_seconds_total{container_name!=""}[5m]
),
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_usage:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_memory_usage_bytes{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_working_set:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_memory_working_set_bytes{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_rss:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_memory_rss{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_cache:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_memory_cache{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:disk_usage:bytes =
sum by (cluster,namespace,controller,pod_name,container_name) (
label_replace(
container_disk_usage_bytes{container_name!=""},
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_pagefaults:rate =
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
irate(
container_memory_failures_total{container_name!=""}[5m]
),
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
cluster_namespace_controller_pod_container:memory_oom:rate =
sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
label_replace(
irate(
container_memory_failcnt{container_name!=""}[5m]
),
"controller", "$1",
"pod_name", "^(.*)-[a-z0-9]+"
)
)
### Cluster resources ###
cluster:memory_allocation:percent =
100 * sum by (cluster) (
container_spec_memory_limit_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
cluster:memory_used:percent =
100 * sum by (cluster) (
container_memory_usage_bytes{pod_name!=""}
) / sum by (cluster) (
machine_memory_bytes
)
cluster:cpu_allocation:percent =
100 * sum by (cluster) (
container_spec_cpu_shares{pod_name!=""}
) / sum by (cluster) (
container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
)
ALERT K8SNodeDown
IF up{job="kubelet"} == 0
FOR 1h
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubelet cannot be scraped",
description = "Prometheus could not scrape a {{`{{ $labels.job }}`}} for more than one hour",
}
ALERT K8SKubeletDown
IF absent(up{job="kubelet"}) or count by (cluster) (up{job="kubelet"} == 0) / count by (cluster) (up{job="kubelet"}) > 0.1
FOR 1h
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Many Kubelets cannot be scraped",
description = "Prometheus failed to scrape more than 10% of kubelets, or all Kubelets have disappeared from service discovery.",
}
# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
ALERT K8SKubeletTooManyPods
IF kubelet_running_pod_count > 100
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Kubelet is close to pod limit",
description = "Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close to the limit of 110",
}
# default rules are in templates/kubelet.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kubernetes
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
kubernetes.rules: |-
{{- include "kubernetes.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,88 @@
{{ define "kubernetes.rules.yaml.tpl" }}
groups:
- name: kubernetes.rules
rules:
- record: pod_name:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(pod_name)
- record: pod_name:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: pod_name:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
BY (pod_name)
- record: pod_name:container_fs_usage_bytes:sum
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: namespace:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
- record: namespace:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
- record: namespace:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
BY (namespace)
- record: cluster:memory_usage:ratio
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
/ sum(machine_cpu_cores)
- record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.99"
- record: apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.9"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.5"
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{`{{ $value }}`}}% of requests
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{`{{ $value }}`}}% of requests
- alert: K8SApiserverDown
expr: absent(up{job="kubernetes"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
{{ end }}


@ -1,30 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kubernetes.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
ALERT K8SApiserverDown
IF up{job="kubernetes"} == 0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "An API server could not be scraped.",
}
# Disable for non HA kubernetes setups.
ALERT K8SApiserverDown
IF absent({job="kubernetes"}) or (count by(cluster) (up{job="kubernetes"} == 1) < count by(cluster) (up{job="kubernetes"}))
FOR 5m
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "API server unreachable",
description = "Prometheus failed to scrape multiple API servers, or all API servers have disappeared from service discovery.",
}
# default rules are in templates/kubernetes.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-node
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
node.rules: |-
{{- include "node.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,46 @@
{{ define "node.rules.yaml.tpl" }}
groups:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
full within the next 24 hours (mounted at {{`{{$labels.mountpoint}}`}})
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
full within the next 2 hours (mounted at {{`{{$labels.mountpoint}}`}})
{{ end }}
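`predict_linear(node_filesystem_free[6h], 3600 * 24) < 0` fits a linear trend over the last six hours of free-space samples and fires when that trend crosses zero within 24 hours. If a different horizon is needed, a custom rule file can be supplied through the chart's `prometheusRules` override; the names and thresholds below are illustrative only:

```yaml
prometheusRules:
  node-custom.rules: |-
    groups:
    - name: node-custom.rules
      rules:
      - alert: NodeDiskRunningFullIn3Days
        expr: predict_linear(node_filesystem_free[12h], 3600 * 72) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: filesystem predicted to fill up within the next 3 days
```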


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -3,7 +3,7 @@
replicaCount: 1
image:
repository: quay.io/prometheus/node-exporter
tag: v0.14.0
tag: v0.15.2
pullPolicy: IfNotPresent
service:
type: ClusterIP
@ -16,94 +16,6 @@ resources:
requests:
cpu: 100m
memory: 30Mi
ruleFiles:
node.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
cluster:node_cpu_use:percent =
100 * sum by (cluster) (
rate(node_cpu{mode!="idle"}[5m])
) / sum by (cluster) (
machine_cpu_cores
)
ALERT K8SKubeletNodeExporterDown
IF up{job="{{ template "fullname" . }}"} == 0
FOR 15m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubelet node_exporter cannot be scraped",
description = "Prometheus could not scrape a {{`{{ $labels.job }}`}} for more than one hour.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 50
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{`{{ $value }}`}}% full.",
}
ALERT K8SConntrackTableFull
IF 100*node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 90
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Number of tracked connections is near the limit",
description = "The nf_conntrack table is {{`{{ $value }}`}}% full.",
}
# To catch the conntrack sysctl de-tuning when it happens
ALERT K8SConntrackTuningMissing
IF node_nf_conntrack_udp_timeout > 10
FOR 10m
LABELS {
service = "k8s",
severity = "warning",
}
ANNOTATIONS {
summary = "Node does not have the correct conntrack tunings",
description = "Nodes keep un-setting the correct tunings, investigate when it happens.",
}
ALERT K8SNodeOutOfDisk
IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "Node ran out of disk space.",
description = "{{`{{ $labels.node }}`}} has run out of disk space.",
}
ALERT K8SNodeMemoryPressure
IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Node is under memory pressure.",
description = "{{`{{ $labels.node }}`}} is under memory pressure.",
}
ALERT K8SNodeDiskPressure
IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Node is under disk pressure.",
description = "{{`{{ $labels.node }}`}} is under disk pressure.",
}
# default rules are in templates/node.rules.yaml
# prometheusRules: {}

View file

@ -6,4 +6,4 @@ maintainers:
name: grafana
sources:
- https://github.com/coreos/prometheus-operator
version: 0.0.4
version: 0.0.5


@ -9,7 +9,9 @@ metadata:
name: {{ template "grafana.server.fullname" . }}
data:
{{- if .Values.serverDashboardFiles }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ else }}
{{- include "grafana-dashboards.yaml.tpl" . | indent 2}}
{{- end }}
{{- if .Values.dataSource }}
{{ toYaml .Values.dataSource | indent 2 }}
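Custom dashboards can be injected through `serverDashboardFiles`, which short-circuits the bundled `grafana-dashboards.yaml.tpl` include above. A minimal sketch; the key name and dashboard JSON are placeholders, not a dashboard the chart ships:

```yaml
serverDashboardFiles:
  my-dashboard.json: |-
    {
      "dashboard": {
        "title": "My Custom Dashboard",
        "rows": []
      }
    }
```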

File diff suppressed because it is too large.


@ -42,7 +42,7 @@ service:
##
image:
repository: grafana/grafana
tag: 4.4.1
tag: 4.6.3
grafanaWatcher:
repository: quay.io/coreos/grafana-watcher
@ -86,4 +86,4 @@ ingress:
# Set datasource in beginning
dataSource: {}
serverDashboardFiles: {}
serverDashboardFiles: {}


@ -0,0 +1,73 @@
#!/usr/bin/env python
import os
import re
from ruamel import yaml
def escape(s):
return s.replace("{{","{{`{{").replace("}}","}}`}}")
def get_header(file_name):
return "{{ define \"" + file_name + ".tpl\" }}\n"
#####
## Step 1 - Sync prometheus alert rules, create template file
####
charts = [
{'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
'destination': 'helm/alertmanager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source': 'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
'destination': 'helm/exporter-kube-controller-manager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
'destination': 'helm/exporter-kube-scheduler/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
'destination': 'helm/exporter-kube-state/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
'destination': 'helm/exporter-node/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
'destination': 'helm/prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
'destination': 'helm/exporter-kube-etcd/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
'destination': 'helm/kube-prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
'destination': 'helm/exporter-kubelets/', 'job_replace_by': 'kubelet'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
'destination': 'helm/exporter-kubernetes/', 'job_replace_by': 'kubernetes'},
]
# read the rules, create a new template file
for chart in charts:
_, name = os.path.split(chart['source'])
lines = get_header(name)
f = open(chart['source'], 'r')
lines += escape(f.read())
lines = re.sub("job=\"(.*?)\"", "job=\"" + chart['job_replace_by'] + "\"", lines) #replace the job name by chart variable
lines += "{{ end }}" # footer
new_f = "{}/templates/{}".format(chart['destination'], name)
# recreate the file
with open(new_f, 'w') as f:
f.write(lines)
print "Generated {}".format(new_f)
######
## Step 2 - Parse grafana dashboards, create a template file
######
with open('contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml', 'r') as s:
data = yaml.load(s, Loader=yaml.RoundTripLoader)['data']
# the prometheus datasource is not required now
del data['prometheus-datasource.json']
data_s = get_header("grafana-dashboards.yaml.tpl")
data_s += escape(yaml.dump(data, Dumper=yaml.RoundTripDumper))
data_s += "{{ end }}" # footer
with open('helm/grafana/templates/grafana-dashboards.yaml', 'w') as f:
f.write(data_s)
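For reference, each template file emitted by step 1 is simply the escaped upstream rule file wrapped in a named `define` block, for example (abridged from the alertmanager rules above):

```yaml
{{ define "alertmanager.rules.yaml.tpl" }}
groups:
- name: alertmanager.rules
  rules:
  - alert: AlertmanagerFailedReload
    expr: alertmanager_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
{{ end }}
```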


@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: kube-prometheus
sources:
- https://github.com/coreos/prometheus-operator


@ -1,46 +1,31 @@
dependencies:
- name: alertmanager
version: 0.0.5
version: 0.0.6
#e2e-repository: file://../alertmanager
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: prometheus-operator
version: 0.0.7
#e2e-repository: file://../prometheus-operator
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: grafana
version: 0.0.4
#e2e-repository: file://../grafana
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: prometheus
version: 0.0.5
version: 0.0.8
#e2e-repository: file://../prometheus
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-api
version: 0.1.1
#e2e-repository: file://../exporter-kube-api
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-controller-manager
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kube-controller-manager
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-dns
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kube-dns
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-etcd
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kube-etcd
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-scheduler
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kube-scheduler
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
@ -50,23 +35,23 @@ dependencies:
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kubelets
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kubelets
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kubernetes
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-kubernetes
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-node
version: 0.1.1
version: 0.1.2
#e2e-repository: file://../exporter-node
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
condition: deployExporterNode
- name: grafana
version: 0.0.4
version: 0.0.5
#e2e-repository: file://../grafana
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
condition: deployGrafana


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ tpl $val $root | indent 4}}
{{ $val | indent 4}}
{{- end }}
{{ else }}
general.rules: |-
{{- include "general.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,41 @@
{{ define "general.rules.yaml.tpl" }}
groups:
- name: general.rules
rules:
- alert: TargetDown
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of {{`{{ $labels.job }}`}} targets are down.'
summary: Targets are down
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity: none
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- record: fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
will exhaust its available file/socket descriptors within the next 4 hours'
summary: File descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
will exhaust its available file/socket descriptors within the next hour'
summary: File descriptors soon exhausted
{{ end }}
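
For context on the FdExhaustionClose pair: `fd_utilization` records the fraction of allowed file descriptors a process currently has open, and `predict_linear(fd_utilization[1h], 3600 * 4)` extrapolates a linear fit over the last hour four hours ahead. A process that climbed from 0.50 to 0.65 over that hour is projected at 0.65 + 4 × 0.15 = 1.25, which is above 1 and fires the warning; the second alert repeats the check with a 10-minute window and a one-hour horizon to catch fast leaks. A minimal sketch for trying the same extrapolation ad hoc in the expression browser (the 30-minute window and two-hour horizon are arbitrary choices, not part of the chart):

    predict_linear(fd_utilization[30m], 3600 * 2)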

View file

@ -11,18 +11,18 @@ alertmanager:
config:
global:
resolve_timeout: 5m
receivers:
- name: webhook
webhook_configs:
- url: 'http://alertmanagerwh:30500/'
route:
group_by: ['job']
group_interval: 5m
group_wait: 30s
receiver: webhook
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'
## External URL at which Alertmanager will be reachable
##
@ -32,7 +32,7 @@ alertmanager:
##
image:
repository: quay.io/prometheus/alertmanager
tag: v0.5.1
tag: v0.9.1
ingress:
## If true, Alertmanager Ingress will be created
@ -147,7 +147,7 @@ prometheus:
##
image:
repository: quay.io/prometheus/prometheus
tag: v1.5.2
tag: v2.0.0
ingress:
## If true, Prometheus Ingress will be created
@ -337,3 +337,6 @@ prometheus:
# requests:
# storage: 16Gi
# selector: {}
# default rules are in templates/general.rules.yaml
# prometheusRules: {}
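
The new default route above delivers everything to the `'null'` receiver (with DeadMansSwitch pinned there explicitly), so a deployment that should actually notify someone needs an override. A sketch of a values fragment that restores a webhook receiver like the previous default while keeping DeadMansSwitch silenced — the URL is the placeholder from the old config, not a working endpoint:

    alertmanager:
      config:
        global:
          resolve_timeout: 5m
        route:
          group_by: ['job']
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h
          receiver: webhook
          routes:
          - match:
              alertname: DeadMansSwitch
            receiver: 'null'
        receivers:
        - name: 'null'
        - name: webhook
          webhook_configs:
          - url: 'http://alertmanagerwh:30500/'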

View file

@ -4,7 +4,9 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: prometheus-operator
sources:
- https://github.com/coreos/prometheus-operator
version: 0.0.7
version: 0.0.8

View file

@ -83,7 +83,7 @@ Parameter | Description | Default
`global.hyperkube.tag` | Hyperkube image tag | `v1.7.6_coreos.0`
`global.hyperkube.pullPolicy` | Hyperkube image pull policy | `IfNotPresent`
`image.repository` | Image | `quay.io/coreos/prometheus-operator`
`image.tag` | Image tag | `v0.13.0`
`image.tag` | Image tag | `v0.15.0`
`image.pullPolicy` | Image pull policy | `IfNotPresent`
`kubeletService.enable` | If true, the operator will create a service for scraping kubelets | `true`
`kubeletService.namespace` | The namespace in which the kubelet service should be created | `kube-system`
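
A minimal values fragment pinning the parameters documented in this part of the table (the values shown simply restate the documented defaults explicitly):

    image:
      repository: quay.io/coreos/prometheus-operator
      tag: v0.15.0
      pullPolicy: IfNotPresent
    kubeletService:
      enable: true
      namespace: kube-system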

View file

@ -26,4 +26,4 @@ data:
- {{ .Release.Namespace | quote }}
endpoints:
- port: http
interval: 30s
interval: 30s

View file

@ -22,7 +22,7 @@ configmapReload:
##
image:
repository: quay.io/coreos/prometheus-operator
tag: v0.14.1
tag: v0.15.0
pullPolicy: IfNotPresent
## If enabled, prometheus-operator will create a service for scraping kubelets

View file

@ -2,9 +2,9 @@ apiVersion: v1
description: Prometheus instance created by the CoreOS Prometheus Operator
engine: gotpl
maintainers:
- name: Michael Goodness
- name: Giancarlo Rubio
email: mgoodness@gmail.com
name: prometheus
sources:
- https://github.com/coreos/prometheus-operator
version: 0.0.5
version: 0.0.8

View file

@ -55,6 +55,7 @@ Parameter | Description | Default
`ingress.tls` | TLS configuration for Prometheus Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Prometheus configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/prometheus.rules.yaml](templates/prometheus.rules.yaml)`
`replicaCount` | Number of Prometheus replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`retention` | How long to retain metrics | `24h`
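
A short values sketch exercising the parameters in this part of the table; the replica count and resource request are arbitrary illustrations, not recommendations:

    replicaCount: 2
    retention: 24h
    paused: false
    resources:
      requests:
        memory: 400Mi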

View file

@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: "prometheus"
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
heritage: {{ .Release.Service }}
prometheus: {{ .Release.Name }}
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
prometheus.rules: |-
{{- include "prometheus.rules.yaml.tpl" . | indent 4}}
{{ end }}
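
When this chart is installed as a dependency of kube-prometheus, overrides for the new `prometheusRules` value nest under the dependency's key (`prometheus:`) in the umbrella chart's values. A hedged sketch that swaps the default PrometheusConfigReloadFailed severity to critical — note that providing any `prometheusRules` map replaces the whole bundled prometheus.rules file, so anything not restated is dropped:

    prometheus:
      prometheusRules:
        prometheus.rules: |-
          groups:
          - name: prometheus.rules
            rules:
            - alert: PrometheusConfigReloadFailed
              expr: prometheus_config_last_reload_successful == 0
              for: 10m
              labels:
                severity: critical
              annotations:
                description: Reloading Prometheus' configuration has failed.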

View file

@ -0,0 +1,73 @@
{{ define "prometheus.rules.yaml.tpl" }}
groups:
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
severity: warning
annotations:
description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.01
for: 10m
labels:
severity: warning
annotations:
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.03
for: 10m
labels:
severity: critical
annotations:
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected
to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk
- alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
compaction failures over the last two hours.'
summary: Prometheus has issues compacting sample blocks
- alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
{{ end }}
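
On the two PrometheusErrorSendingAlerts rules: they divide the five-minute rate of failed notifications by the rate of sent notifications, warning above a 1% failure ratio and going critical above 3% — for example, 2 failed notifications out of 40 sent within the window is a 5% ratio and would page as critical. A sketch of the same ratio queried ad hoc, without the threshold, to inspect the current value:

    rate(prometheus_notifications_errors_total[5m])
      / rate(prometheus_notifications_sent_total[5m])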

View file

@ -10,7 +10,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}
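
`jobLabel` names a label on the Service selected by this ServiceMonitor; the value of that label becomes the `job` label on the scraped series, with the operator falling back to the Service's own name when the label is absent. Moving from the `name` template to `fullname` makes the key release-qualified rather than the bare chart name. A rough sketch of the rendered field for a hypothetical release called `ops` — the names are illustrative only:

    spec:
      jobLabel: ops-prometheus
      selector:
        matchLabels:
          app: prometheus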

View file

@ -31,7 +31,7 @@ selfServiceMonitor: true
##
image:
repository: quay.io/prometheus/prometheus
tag: v1.7.1
tag: v2.0.0
## Labels to be added to the Prometheus
##
@ -248,3 +248,6 @@ storageSpec: {}
# requests:
# storage: 50Gi
# selector: {}
# default rules are in templates/prometheus.rules.yaml
# prometheusRules: {}
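
The commented storage example that closes this file hints at the expected shape; a sketch of an uncommented persistent-storage block for the Prometheus StatefulSet, with the storage class and size as placeholders to adjust per cluster:

    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: standard
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi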