
Delete chart exporter-kube-api because it has been replaced by kube-controller-manager alerts

Giancarlo Rubio 2017-12-23 09:41:12 +01:00
parent 3923139e40
commit 59421f7018
64 changed files with 7171 additions and 7201 deletions

.gitignore vendored

@ -8,3 +8,4 @@ requirements.lock
.DS_Store
__pycache__
.env/
.history/


@ -10,4 +10,4 @@ groups:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
summary: Controller manager is down


@ -203,7 +203,7 @@ addArrayToConfigMap() {
# Dashboard foot
test "$type" = "dashboard" && cat $DASHBOARD_FOOT_FILE
done
#echo "---"
echo "---"
IFS=$OLDIFS
return 0
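
The newly un-commented echo "---" emits a YAML document separator after the loop finishes, presumably so that consecutive generated objects can sit in one manifest stream. A minimal sketch of the intended effect (resource names are illustrative, not taken from the script):

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-one   # first generated document
data: {}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-two   # the "---" lets YAML parsers treat this as a separate document
data: {}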


@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: alertmanager
sources:
- https://github.com/coreos/prometheus-operator


@ -46,13 +46,14 @@ Parameter | Description | Default
`config` | Alertmanager configuration directives | `{}`
`externalUrl` | External URL at which Alertmanager will be reachable | `""`
`image.repository` | Image | `quay.io/prometheus/alertmanager`
`image.tag` | Image tag | `v0.9.1`
`image.tag` | Image tag | `v0.12.0`
`ingress.enabled` | If true, Alertmanager Ingress will be created | `false`
`ingress.annotations` | Annotations for Alertmanager Ingress` | `{}`
`ingress.fqdn` | Alertmanager Ingress fully-qualified domain name | `""`
`ingress.tls` | TLS configuration for Alertmanager Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Alertmanager configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/alertmanager.rules.yaml](templates/alertmanager.rules.yaml)`
`replicaCount` | Number of Alertmanager replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`service.annotations` | Annotations to be added to the Alertmanager Service | `{}`
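
As a usage sketch (not taken from the chart's own docs), any of the parameters above can be overridden with a values file at install time; the file name, release name and repository alias below are assumptions:

# my-values.yaml -- overrides for a few parameters from the table above
image:
  tag: v0.12.0
replicaCount: 3
ingress:
  enabled: true
  fqdn: alertmanager.example.com

# Helm 2 style install, assuming the chart is published as coreos/alertmanager:
#   helm install --name my-alerts -f my-values.yaml coreos/alertmanager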


@ -0,0 +1,32 @@
{{ define "alertmanager.rules.yaml.tpl" }}
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{`{{$labels.service}}`}}` are out of sync.
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace
}}`}}/{{`{{ $labels.pod}}`}}.
{{ end }}
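
In the template above, the {{` ... `}} backtick construct makes Helm emit the inner braces literally, so Prometheus still receives its own template variables after chart rendering. A sketch of how the last alert should come out of helm template (a hedged illustration, not captured output):

- alert: AlertmanagerFailedReload
  expr: alertmanager_config_last_reload_successful == 0
  for: 10m
  labels:
    severity: warning
  annotations:
    # the escaped braces survive rendering, leaving Prometheus template syntax intact
    description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.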


@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: "alertmanager"
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
heritage: {{ .Release.Service }}
prometheus: {{ .Release.Name }}
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
alertmanager.rules: |-
{{- include "alertmanager.rules.yaml.tpl" . | indent 4}}
{{ end }}
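
A minimal values sketch for the branch above: supplying prometheusRules replaces the bundled alertmanager.rules entry entirely, and each key becomes its own file in the ConfigMap (the key and rule content here are illustrative):

prometheusRules:
  my-alertmanager.rules: |-
    groups:
    - name: my-alertmanager.rules
      rules:
      - alert: AlertmanagerConfigReloadFailed
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning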


@ -11,7 +11,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
alertmanager: {{ .Release.Name }}


@ -49,7 +49,7 @@ selfServiceMonitor: true
##
image:
repository: quay.io/prometheus/alertmanager
tag: v0.9.1
tag: v0.12.0
## Labels to be added to the Alertmanager
##
@ -150,36 +150,6 @@ storageSpec: {}
# requests:
# storage: 50Gi
# selector: {}
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
alertmanager.rules: |-
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
# default rules are in templates/alertmanager.rules.yaml
# prometheusRules: {}


@ -1,21 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj


@ -1,7 +0,0 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-api
version: 0.1.1
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com


@ -1,16 +0,0 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "fullname" -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
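
These helpers (the remaining charts define the same pair, as the references throughout this diff show) derive names from the release and chart; a hedged example with illustrative names:

# release "kube-prometheus", chart "exporter-kube-etcd"
#   {{ template "name" . }}     => exporter-kube-etcd
#   {{ template "fullname" . }} => kube-prometheus-exporter-kube-etcd   (truncated to 63 chars)
# which is the value the templated rules now expect in job="{{ template "fullname" . }}"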


@ -1,21 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: {{ template "name" . }}
component: kube-api
heritage: {{ .Release.Service }}
release: {{ .Release.Name }}
chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}
name: {{ template "fullname" . }}
namespace: kube-system
spec:
clusterIP: None
ports:
- name: https-metrics
port: 443
protocol: TCP
targetPort: 443
selector:
k8s-app: kube-apiserver
type: ClusterIP


@ -1,29 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
component: kube-api
heritage: "{{ .Release.Service }}"
release: "{{ .Release.Name }}"
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}
component: kube-api
namespaceSelector:
matchNames:
- "kube-system"
endpoints:
- port: https-metrics
interval: 15s
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# Skip verification until we have resolved why the certificate validation
# for the kubelet on API server nodes fail.
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token


@ -1,93 +0,0 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
ruleFiles:
kube-api.rules: |-
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.
ALERT K8SApiServerLatency
IF histogram_quantile(
0.99,
sum without (instance,node,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH"})
) / 1e6 > 1.0
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "Kubernetes apiserver latency is high",
description = "99th percentile Latency for {{`{{ $labels.verb }}`}} requests to the kube-apiserver is higher than 1s.",
}
### API latency ###
# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
histogram_quantile(
0.99,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
histogram_quantile(
0.9,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
histogram_quantile(
0.5,
sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
) / 1e6
### File descriptor alerts
instance:fd_utilization = process_open_fds / process_max_fds
# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS
{
severity = "warning"
}
ANNOTATIONS {
summary = "file descriptors soon exhausted",
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
}
# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
severity = "critical"
}
ANNOTATIONS
{
summary = "file descriptors soon exhausted",
description = "{{`{{ $labels.job }}`}} instance {{`{{ $labels.instance }}`}} will exhaust in file descriptors soon",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 50
FOR 10m
LABELS {
service = "k8s",
severity = "warning"
}
ANNOTATIONS {
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
}
ALERT K8STooManyOpenFiles
IF 100*process_open_fds{job=~"kubelets|kubernetes"} / process_max_fds > 80
FOR 10m
LABELS {
service = "k8s",
severity = "critical"
}
ANNOTATIONS {
summary = "{{`{{ $labels.job }}`}} has too many open file descriptors",
description = "{{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors.",
}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-controller-manager
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
kube-controller-manager.rules: |-
{{- include "kube-controller-manager.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,15 @@
{{ define "kube-controller-manager.rules.yaml.tpl" }}
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
{{ end }}


@ -1,16 +1,2 @@
# WARNING: Don't touch this file, it's automatically generated by helm/hack/sync-kube-prometheus.sh
ruleFiles:
kube-controller-manager.rules: |-
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
# default rules are in templates/kube-controller-manager.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-dns
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kube-etcd
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{ tpl $val $root | indent 4}}
{{- end }}
{{ else }}
etcd3.rules: |-
{{- include "etcd3.rules.yaml.tpl" . | indent 4}}
{{ end }}
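
The switch from plain interpolation to tpl $val $root means user-supplied rule text is itself rendered as a template, so overrides can reuse chart helpers. A hedged values sketch (the rule content is illustrative):

prometheusRules:
  etcd3.rules: |-
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: NoLeader
        # tpl renders this against the chart context, filling in the release-specific job name
        expr: etcd_server_has_leader{job="{{ template "fullname" . }}"} == 0
        for: 1m
        labels:
          severity: critical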


@ -0,0 +1,125 @@
{{ define "etcd3.rules.yaml.tpl" }}
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="{{ template "fullname" . }}"} == 0) > (count(up{job="{{ template "fullname" . }}"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="{{ template "fullname" . }}"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{`{{ $labels.instance }}`}} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="{{ template "fullname" . }}"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
on etcd instance {{`{{ $labels.instance }}`}}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="{{ template "fullname" . }}"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed
on etcd instance {{`{{ $labels.instance }}`}}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="{{ template "fullname" . }}",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{`{{ $labels.instance }}`}} gRPC requests to {{`{{ $labels.grpc_method
}}`}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
instance {{`{{ $labels.instance }}`}}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="{{ template "fullname" . }}"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="{{ template "fullname" . }}"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd
instance {{`{{ $labels.instance }}`}}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method
}}`}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} member communication with
{{`{{ $labels.To }}`}} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="{{ template "fullname" . }}"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} fync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{`{{ $labels.instance }}`}} commit durations are high
summary: high commit durations
{{ end }}


@ -1,128 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
etcd3.rules: |-
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
# default rules are in templates/etcd3.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-scheduler
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{ tpl $val $root | indent 4}}
{{- end }}
{{ else }}
kube-scheduler.rules: |-
{{- include "kube-scheduler.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,60 @@
{{ define "kube-scheduler.rules.yaml.tpl" }}
groups:
- name: kube-scheduler.rules
rules:
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: K8SSchedulerDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
{{ end }}


@ -1,63 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
kube-scheduler.rules: |-
groups:
- name: kube-scheduler.rules
rules:
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: K8SSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
# default rules are in templates/kube-scheduler.rules.yaml
# prometheusRules: {}


@ -3,5 +3,7 @@ description: A Helm chart singleton for kube-state-metrics
name: exporter-kube-state
version: 0.1.3
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
kube-state-metrics.rules: |-
{{- include "kube-state-metrics.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,57 @@
{{ define "kube-state-metrics.rules.yaml.tpl" }}
groups:
- name: kube-state-metrics.rules
rules:
- alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
severity: warning
annotations:
description: Observed deployment generation does not match expected one for
deployment {{`{{$labels.namespaces}}`}}{{`{{$labels.deployment}}`}}
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: warning
annotations:
description: Replicas are not updated and available for deployment {{`{{$labels.namespaces}}`}}/{{`{{$labels.deployment}}`}}
- alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
* 100 < 100
for: 15m
labels:
severity: warning
annotations:
description: Only {{`{{$value}}`}}% of desired pods scheduled and ready for daemon
set {{`{{$labels.namespaces}}`}}/{{`{{$labels.daemonset}}`}}
- alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
> 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are running where they are not supposed
to run.
summary: Daemonsets are not scheduled correctly
- alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
description: Pod {{`{{$labels.namespaces}}`}}/{{`{{$labels.pod}}`}} is was restarted {{`{{$value}}`}}
times within the last hour
{{ end }}


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -25,61 +25,6 @@ addon_resizer:
requests:
cpu: 100m
memory: 30Mi
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
kube-state-metrics.rules: |-
groups:
- name: kube-state-metrics.rules
rules:
- alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
severity: warning
annotations:
description: Observed deployment generation does not match expected one for
deployment {{$labels.namespaces}}{{$labels.deployment}}
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: warning
annotations:
description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
- alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
* 100 < 100
for: 15m
labels:
severity: warning
annotations:
description: Only {{$value}}% of desired pods scheduled and ready for daemon
set {{$labels.namespaces}}/{{$labels.daemonset}}
- alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
> 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are running where they are not supposed
to run.
summary: Daemonsets are not scheduled correctly
- alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}}
times within the last hour
# default rules are in templates/kube-state-metrics.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kubelets
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
kubelet.rules: |-
{{- include "kubelet.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,49 @@
{{ define "kubelet.rules.yaml.tpl" }}
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{`{{ $labels.node }}`}} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{`{{ $value }}`}}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets.
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{`{{ $value }}`}}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{`{{$labels.instance}}`}} is running {{`{{$value}}`}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
{{ end }}


@ -1,52 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
kubelet.rules: |-
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
# default rules are in templates/kubelet.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-kubernetes
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
kubernetes.rules: |-
{{- include "kubernetes.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,88 @@
{{ define "kubernetes.rules.yaml.tpl" }}
groups:
- name: kubernetes.rules
rules:
- record: pod_name:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(pod_name)
- record: pod_name:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: pod_name:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
BY (pod_name)
- record: pod_name:container_fs_usage_bytes:sum
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: namespace:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
- record: namespace:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
- record: namespace:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
BY (namespace)
- record: cluster:memory_usage:ratio
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
/ sum(machine_cpu_cores)
- record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.99"
- record: apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.9"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.5"
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{`{{ $value }}`}} seconds
for {{`{{$labels.verb}}`}} {{`{{$labels.resource}}`}}
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{`{{ $value }}`}}% of requests
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{`{{ $value }}`}}% of requests
- alert: K8SApiserverDown
expr: absent(up{job="kubernetes"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
{{ end }}


@ -1,91 +1,2 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
kubernetes.rules: |-
groups:
- name: kubernetes.rules
rules:
- record: pod_name:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(pod_name)
- record: pod_name:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: pod_name:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
BY (pod_name)
- record: pod_name:container_fs_usage_bytes:sum
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: namespace:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
- record: namespace:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
- record: namespace:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
BY (namespace)
- record: cluster:memory_usage:ratio
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
/ sum(machine_cpu_cores)
- record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.99"
- record: apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.9"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.5"
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{ $value }}% of requests
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
# default rules are in templates/kubernetes.rules.yaml
# prometheusRules: {}


@ -1,7 +1,9 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: exporter-node
version: 0.1.1
version: 0.1.2
maintainers:
- name: Cloud Posse LLC
email: hello@cloudposse.com
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com


@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
node.rules: |-
{{- include "node.rules.yaml.tpl" . | indent 4}}
{{ end }}


@ -0,0 +1,46 @@
{{ define "node.rules.yaml.tpl" }}
groups:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="{{ template "fullname" . }}"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
full within the next 24 hours (mounted at {{`{{$labels.mountpoint}}`}})
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{`{{$labels.device}}`}} on node {{`{{$labels.instance}}`}} is running
full within the next 2 hours (mounted at {{`{{$labels.mountpoint}}`}})
{{ end }}


@ -9,7 +9,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}


@ -3,7 +3,7 @@
replicaCount: 1
image:
repository: quay.io/prometheus/node-exporter
tag: v0.15.1
tag: v0.15.2
pullPolicy: IfNotPresent
service:
type: ClusterIP
@ -16,50 +16,6 @@ resources:
requests:
cpu: 100m
memory: 30Mi
# WARNING: Don't change this file after this lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
node.rules: |-
groups:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 24 hours (mounted at {{$labels.mountpoint}})
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 2 hours (mounted at {{$labels.mountpoint}})
# default rules are in templates/node.rules.yaml
# prometheusRules: {}


@ -9,7 +9,9 @@ metadata:
name: {{ template "grafana.server.fullname" . }}
data:
{{- if .Values.serverDashboardFiles }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ toYaml .Values.serverDashboardFiles | indent 2 }}
{{ else }}
{{- include "grafana-dashboards.yaml.tpl" . | indent 2}}
{{- end }}
{{- if .Values.dataSource }}
{{ toYaml .Values.dataSource | indent 2 }}
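
A hedged values sketch for the branch above: entries under serverDashboardFiles are written into the ConfigMap as-is and suppress the bundled grafana-dashboards.yaml.tpl defaults (the file name and dashboard JSON are illustrative):

serverDashboardFiles:
  my-dashboard.json: |-
    {
      "title": "My dashboard",
      "rows": []
    }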

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,63 +1,73 @@
#!/usr/bin/env python
import os
import re
from ruamel import yaml
def escape(s):
return s.replace("{{","{{`{{").replace("}}","}}`}}")
def get_header(file_name):
return "{{ define \"" + file_name + ".tpl\" }}\n"
#####
## This script read the kube-prometheus rules and convert into helm charts format
## Step 1 - Sync prometheus alert rules, create template file
####
### ----------------------------
### Sync all prometheus rules
###
charts = [{'file_name': 'alertmanager', 'search_var': 'ruleFiles',
'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
'destination': 'helm/alertmanager/values.yaml'},
{'file_name': 'kube-controller-manager', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
'destination': 'helm/exporter-kube-controller-manager/values.yaml'},
{'file_name': 'kube-scheduler', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
'destination': 'helm/exporter-kube-scheduler/values.yaml'},
{'file_name': 'kube-state-metrics', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
'destination': 'helm/exporter-kube-state/values.yaml'},
{'file_name': 'node', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
'destination': 'helm/exporter-node/values.yaml'},
{'file_name': 'prometheus', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
'destination': 'helm/prometheus/values.yaml'},
{'file_name': 'etcd3', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
'destination': 'helm/exporter-kube-etcd/values.yaml'},
# //TODO add {'file_name': 'general', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
# 'destination': 'helm/kube-prometheus/general_rules.yaml'},
{'file_name': 'kubelet', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
'destination': 'helm/exporter-kubelets/values.yaml'},
{'file_name': 'kubernetes', 'search_var': 'ruleFiles', 'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
'destination': 'helm/exporter-kubernetes/values.yaml'},
###
### Sync grafana dashboards
###
{'file_name': 'grafana-dashboards-0', 'search_var': 'serverDashboardFiles', 'source':'contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml',
'destination': 'helm/grafana/values.yaml'},
charts = [
{'source':'contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml',
'destination': 'helm/alertmanager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source': 'contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml',
'destination': 'helm/exporter-kube-controller-manager/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml',
'destination': 'helm/exporter-kube-scheduler/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kube-state-metrics.rules.yaml',
'destination': 'helm/exporter-kube-state/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml',
'destination': 'helm/exporter-node/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml',
'destination': 'helm/prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml',
'destination': 'helm/exporter-kube-etcd/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml',
'destination': 'helm/kube-prometheus/', 'job_replace_by': '{{ template \"fullname\" . }}'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml',
'destination': 'helm/exporter-kubelets/', 'job_replace_by': 'kubelet'},
{'source':'contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml',
'destination': 'helm/exporter-kubernetes/', 'job_replace_by': 'kubernetes'},
]
# read the rules, create a new template file
for chart in charts:
lines = ""
## parse current values.yaml file
f = open(chart['destination'], 'r')
for l in f.readlines():
# stop reading file after the rule
if "{}:".format(chart['search_var']) in l:
break
lines+= l
_, name = os.path.split(chart['source'])
lines = get_header(name)
lines+= "{}:\n".format(chart['search_var'])
lines+= " {}.rules: |-\n".format(chart['file_name'])
## parse kube-prometheus rule
f = open(chart['source'])
for l in f.readlines():
lines += " {}".format(l)
f = open(chart['source'], 'r')
lines += escape(f.read())
lines = re.sub("job=\"(.*?)\"", "job=\"" + chart['job_replace_by'] + "\"", lines) #replace the job name by chart variable
lines += "{{ end }}" # footer
new_f = "{}/templates/{}".format(chart['destination'], name)
# recreate the file
with open(chart['destination'], 'w') as f:
with open(new_f, 'w') as f:
f.write(lines)
print "Generated {}".format(new_f)
### ----------------------------
### 2
###
######
## Step 2 - Parse grafana dashboards, create a template file
######
with open('contrib/kube-prometheus/manifests/grafana/grafana-dashboards.yaml', 'r') as s:
data = yaml.load(s, Loader=yaml.RoundTripLoader)['data']
# the prometheus datasource is not required anymore
del data['prometheus-datasource.json']
data_s = get_header("grafana-dashboards.yaml.tpl")
data_s += escape(yaml.dump(data, Dumper=yaml.RoundTripDumper))
data_s += "{{ end }}" # footer
with open('helm/grafana/templates/grafana-dashboards.yaml', 'w') as f:
f.write(data_s)

View file

@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: kube-prometheus
sources:
- https://github.com/coreos/prometheus-operator

View file

@ -1,62 +1,63 @@
dependencies:
- name: alertmanager
version: 0.0.6
repository: file://../alertmanager
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
#e2e-repository: file://../alertmanager
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: prometheus
version: 0.0.8
repository: file://../prometheus
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
#e2e-repository: file://../prometheus
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-api
version: 0.1.1
repository: file://../exporter-kube-api
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-controller-manager
version: 0.1.1
repository: file://../exporter-kube-controller-manager
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kube-controller-manager
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-dns
version: 0.1.1
repository: file://../exporter-kube-dns
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kube-dns
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-etcd
version: 0.1.1
repository: file://../exporter-kube-etcd
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kube-etcd
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-scheduler
version: 0.1.1
repository: file://../exporter-kube-scheduler
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kube-scheduler
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kube-state
version: 0.1.3
repository: file://../exporter-kube-state
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
#e2e-repository: file://../exporter-kube-state
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kubelets
version: 0.1.1
repository: file://../exporter-kubelets
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kubelets
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-kubernetes
version: 0.1.1
repository: file://../exporter-kubernetes
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-kubernetes
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
- name: exporter-node
version: 0.1.1
repository: file://../exporter-node
# repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
version: 0.1.2
#e2e-repository: file://../exporter-node
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
condition: deployExporterNode
- name: grafana
version: 0.0.5
repository: file://../grafana
#repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
#e2e-repository: file://../grafana
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
condition: deployGrafana
- name: prometheus-operator
version: 0.0.8
#e2e-repository: file://../prometheus-operator
repository: https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
condition: deployPrometheusOperator

View file

@ -9,8 +9,13 @@ metadata:
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.ruleFiles }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
general.rules: |-
{{- include "general.rules.yaml.tpl" . | indent 4}}
{{ end }}
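A minimal sketch of how this override is meant to be used: any keys supplied under `prometheusRules` in the chart values replace the bundled general.rules rendered above, and each key becomes a file name in the generated ConfigMap (the rule below is purely illustrative, not part of the chart):

prometheusRules:
  custom.rules: |-
    groups:
    - name: custom.rules
      rules:
      - alert: ExamplePlaceholderAlert
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: Hypothetical alert included only to illustrate the prometheusRules override.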

View file

@ -0,0 +1,41 @@
{{ define "general.rules.yaml.tpl" }}
groups:
- name: general.rules
rules:
- alert: TargetDown
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $value }}`}}% of {{`{{ $labels.job }}`}} targets are down.'
summary: Targets are down
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity: none
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- record: fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
will exhaust in file/socket descriptors within the next 4 hours'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{`{{ $labels.job }}`}}: {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} instance
will exhaust in file/socket descriptors within the next hour'
summary: file descriptors soon exhausted
{{ end }}

View file

@ -4,6 +4,9 @@ deployExporterNode: True
# Grafana
deployGrafana: True
# Prometheus operator
deployPrometheusOperator: False
alertmanager:
## Alertmanager configuration directives
## Ref: https://prometheus.io/docs/alerting/configuration/
@ -24,7 +27,6 @@ alertmanager:
receivers:
- name: 'null'
## External URL at which Alertmanager will be reachable
##
externalUrl: ""
@ -338,3 +340,6 @@ prometheus:
# requests:
# storage: 16Gi
# selector: {}
# default rules are in templates/general.rules.yaml
# prometheusRules: {}

View file

@ -4,6 +4,8 @@ engine: gotpl
maintainers:
- name: Michael Goodness
email: mgoodness@gmail.com
- name: Giancarlo Rubio
email: gianrubio@gmail.com
name: prometheus-operator
sources:
- https://github.com/coreos/prometheus-operator

View file

@ -26,4 +26,4 @@ data:
- {{ .Release.Namespace | quote }}
endpoints:
- port: http
interval: 30s
interval: 30s

View file

@ -2,7 +2,7 @@ apiVersion: v1
description: Prometheus instance created by the CoreOS Prometheus Operator
engine: gotpl
maintainers:
- name: Michael Goodness
- name: Giancarlo Rubio
email: mgoodness@gmail.com
name: prometheus
sources:

View file

@ -55,6 +55,7 @@ Parameter | Description | Default
`ingress.tls` | TLS configuration for Prometheus Ingress | `[]`
`nodeSelector` | Node labels for pod assignment | `{}`
`paused` | If true, the Operator won't process any Prometheus configuration changes | `false`
`prometheusRules` | Prometheus rules | `[templates/prometheus.rules.yaml](templates/prometheus.rules.yaml)`
`replicaCount` | Number of Prometheus replicas desired | `1`
`resources` | Pod resource requests & limits | `{}`
`retention` | How long to retain metrics | `24h`

View file

@ -0,0 +1,21 @@
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app: "prometheus"
chart: {{ .Chart.Name }}-{{ .Chart.Version }}
heritage: {{ .Release.Service }}
prometheus: {{ .Release.Name }}
release: {{ .Release.Name }}
name: {{ template "fullname" . }}
data:
{{- if .Values.prometheusRules }}
{{- $root := . }}
{{- range $key, $val := .Values.prometheusRules }}
{{ $key }}: |-
{{ $val | indent 4}}
{{- end }}
{{ else }}
prometheus.rules: |-
{{- include "prometheus.rules.yaml.tpl" . | indent 4}}
{{ end }}
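The prometheus chart follows the same pattern: leave `prometheusRules` unset to keep the bundled prometheus.rules, or ship your own rule files via values, e.g. a hypothetical recording-rule file:

prometheusRules:
  recording.rules: |-
    groups:
    - name: recording.rules
      rules:
      - record: instance:up:count
        expr: count(up) BY (instance)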

View file

@ -0,0 +1,73 @@
{{ define "prometheus.rules.yaml.tpl" }}
groups:
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
severity: warning
annotations:
description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.01
for: 10m
labels:
severity: warning
annotations:
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.03
for: 10m
labels:
severity: critical
annotations:
description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{
$labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected
to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
reload failures over the last four hours.'
summary: Prometheus has issues reloading data blocks from disk
- alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}}
compaction failures over the last four hours.'
summary: Prometheus has issues compacting sample blocks
- alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
severity: warning
annotations:
description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
{{ end }}

View file

@ -10,7 +10,7 @@ metadata:
prometheus: {{ .Release.Name }}
name: {{ template "fullname" . }}
spec:
jobLabel: {{ template "name" . }}
jobLabel: {{ template "fullname" . }}
selector:
matchLabels:
app: {{ template "name" . }}

View file

@ -248,77 +248,6 @@ storageSpec: {}
# requests:
# storage: 50Gi
# selector: {}
# WARNING: Don't change this file after these lines, it's automatically appended by helm/hack/sync_kube_prometheus.py
ruleFiles:
prometheus.rules: |-
groups:
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
severity: warning
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.01
for: 10m
labels:
severity: warning
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.03
for: 10m
labels:
severity: critical
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
reload failures over the last four hours.'
summary: Prometheus has issues reloading data blocks from disk
- alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
compaction failures over the last four hours.'
summary: Prometheus has issues compacting sample blocks
- alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
# default rules are in templates/prometheus.rules.yaml
# prometheusRules: {}