Mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-21 11:48:53 +00:00)
*: bump Prometheus to v2.0.0-rc.1
parent 74b41e3013
commit 3fbcf77287
33 changed files with 961 additions and 1246 deletions
Changed files:

  Documentation/
  Makefile
  contrib/kube-prometheus/assets/prometheus/rules/
    alertmanager.rules            -> alertmanager.rules.yaml
    etcd3.rules                   -> etcd3.rules.yaml
    general.rules                 -> general.rules.yaml
    kube-apiserver.rules          -> kube-apiserver.rules.yaml
    kube-controller-manager.rules -> kube-controller-manager.rules.yaml
    kube-scheduler.rules          -> kube-scheduler.rules.yaml
    kubelet.rules                 -> kubelet.rules.yaml
    kubernetes.rules              -> kubernetes.rules.yaml
    node.rules                    -> node.rules.yaml
    prometheus.rules              -> prometheus.rules.yaml
  contrib/kube-prometheus/hack/scripts/
  contrib/kube-prometheus/manifests/prometheus/
  example/non-rbac/
  example/rbac/prometheus/
  example/user-guides/
  pkg/prometheus/

Documentation

@@ -31,4 +31,4 @@ The versions of Prometheus compatible to be run with the Prometheus Operator are
* v1.7.1
* v1.7.2
* v1.8.0
* v2.0.0-beta.4
* v2.0.0-rc.1

@@ -147,9 +147,12 @@ metadata:
    role: prometheus-rulefiles
    prometheus: example
data:
  example.rules: |
    ALERT ExampleAlert
      IF vector(1)
  example.rules.yaml: |+
    groups:
    - name: ./example.rules
      rules:
      - alert: ExampleAlert
        expr: vector(1)
```

That example `ConfigMap` always immediately triggers an alert, which is only for demonstration purposes. To validate that everything is working properly have a look at each of the Prometheus web UIs.
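
Prometheus 2.0 no longer accepts the old rule syntax, which is why the user guide example above now carries the grouped YAML form. A minimal conversion sketch, assuming the `promtool update rules` command bundled with Prometheus 2.0 and a placeholder input file name (not a file from this repository):

```sh
# Convert a Prometheus 1.x rule file to the 2.0 YAML rule-group format.
# "example.rules" is a placeholder; promtool should write the converted
# rules to a new YAML file next to the input.
promtool update rules example.rules
```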

@@ -299,7 +299,7 @@ metadata:
    prometheus: k8s
spec:
  replicas: 2
  version: v1.7.2
  version: v2.0.0-rc.1
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:

@@ -255,7 +255,6 @@ spec:
  serviceMonitorSelector:
    matchLabels:
      team: frontend
  version: v1.7.1
  resources:
    requests:
      memory: 400Mi

Makefile (3 changes)

@@ -35,6 +35,9 @@ e2e-test:
	go test -timeout 20m -v ./test/migration/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)
	go test -timeout 20m -v ./test/e2e/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)

e2e-test-only:
	go test -timeout 20m -v ./test/e2e/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)

e2e-status:
	kubectl get prometheus,alertmanager,servicemonitor,statefulsets,deploy,svc,endpoints,pods,cm,secrets,replicationcontrollers --all-namespaces
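
For reference, a hypothetical invocation of the new e2e-test-only target, which runs just the e2e suite and skips the migration tests; only the variable names come from the Makefile above, the values are placeholders:

```sh
# Run only the e2e suite against an existing cluster; every value below is
# an illustrative placeholder, not a default from this repository.
make e2e-test-only \
    KUBECONFIG=$HOME/.kube/config \
    REPO=example.registry/prometheus-operator \
    TAG=dev \
    NAMESPACE=prometheus-operator-e2e
```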

contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules (deleted)

@@ -1,36 +0,0 @@
ALERT AlertmanagerConfigInconsistent
  IF count_values by (service) ("config_hash", alertmanager_config_hash)
     / on(service) group_left
     label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Alertmanager configurations are inconsistent",
    description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
  }

ALERT AlertmanagerDownOrMissing
  IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
     / on(job) group_right
     sum by(job) (up) != 1
  FOR 5m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager down or not discovered",
    description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
  }

ALERT FailedReload
  IF alertmanager_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager configuration reload has failed",
    description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }

contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml (new file)

@@ -0,0 +1,33 @@
groups:
- name: ./alertmanager.rules
  rules:
  - alert: AlertmanagerConfigInconsistent
    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
      "alertmanager-$1", "alertmanager", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: The configuration of the instances of the Alertmanager cluster
        `{{$labels.service}}` are out of sync.
      summary: Alertmanager configurations are inconsistent
  - alert: AlertmanagerDownOrMissing
    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
        disappeared from discovery.
      summary: Alertmanager down or not discovered
  - alert: FailedReload
    expr: alertmanager_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
        }}/{{ $labels.pod}}.
      summary: Alertmanager configuration reload has failed
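
The converted files can be sanity-checked before they are bundled into the rule ConfigMap; a minimal sketch, assuming the `promtool check rules` command from Prometheus 2.0 and a working directory of contrib/kube-prometheus:

```sh
# Validate a converted rule file against the Prometheus 2.0 rule syntax.
promtool check rules assets/prometheus/rules/alertmanager.rules.yaml
```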

contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules (deleted)

@@ -1,177 +0,0 @@
# general cluster availability

# alert if another failed member will result in an unavailable cluster
ALERT InsufficientMembers
  IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd cluster insufficient members",
    description = "If one more etcd member goes down the cluster will be unavailable",
  }

# etcd leader alerts
# ==================

# alert if any etcd instance has no leader
ALERT NoLeader
  IF etcd_server_has_leader{job="etcd"} == 0
  FOR 1m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd member has no leader",
    description = "etcd member {{ $labels.instance }} has no leader",
  }

# alert if there are lots of leader changes
ALERT HighNumberOfLeaderChanges
  IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of leader changes within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
  }

# gRPC request alerts
# ===================

# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
  IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
     / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
  IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
     / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if the 99th percentile of gRPC method calls take more than 150ms
ALERT GRPCRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "slow gRPC requests",
    description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
  }

# HTTP requests alerts
# ====================

# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
  IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
     / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
  IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
     / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }

# etcd member communication alerts
# ================================

# alert if 99th percentile of round trips take 150ms
ALERT EtcdMemberCommunicationSlow
  IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "etcd member communication is slow",
    description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
  }

# etcd proposal alerts
# ====================

# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of proposals within the etcd cluster are failing",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }

# etcd disk io latency alerts
# ===========================

# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fync durations are high",
  }

# alert if 99th percentile of commit durations is higher than 250ms
ALERT HighCommitDurations
  IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high commit durations",
    description = "etcd instance {{ $labels.instance }} commit durations are high",
  }

contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml (new file, 123 lines)

@@ -0,0 +1,123 @@
groups:
- name: ./etcd3.rules
  rules:
  - alert: InsufficientMembers
    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
    for: 3m
    labels:
      severity: critical
    annotations:
      description: If one more etcd member goes down the cluster will be unavailable
      summary: etcd cluster insufficient members
  - alert: NoLeader
    expr: etcd_server_has_leader{job="etcd"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: etcd member {{ $labels.instance }} has no leader
      summary: etcd member has no leader
  - alert: HighNumberOfLeaderChanges
    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
        changes within the last hour
      summary: a high number of leader changes within the etcd cluster are happening
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
      / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
      / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: GRPCRequestsSlow
    expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
        }} are slow
      summary: slow gRPC requests
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HTTPRequestsSlow
    expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
        }} are slow
      summary: slow HTTP requests
  - alert: EtcdMemberCommunicationSlow
    expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
      summary: etcd member communication is slow
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a high number of proposals within the etcd cluster are failing
  - alert: HighFsyncDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} fync durations are high
      summary: high fsync durations
  - alert: HighCommitDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} commit durations are high
      summary: high commit durations

contrib/kube-prometheus/assets/prometheus/rules/general.rules (deleted)

@@ -1,63 +0,0 @@
### Up Alerting ###

Alert TargetDown
  IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
  }

### Dead man's switch ###

ALERT DeadMansSwitch
  IF vector(1)
  LABELS {
    severity = "none",
  }
  ANNOTATIONS {
    summary = "Alerting DeadMansSwitch",
    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
  }

### File descriptor alerts ###

ALERT TooManyOpenFileDescriptors
  IF 100 * (process_open_fds / process_max_fds) > 95
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
  }

instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
  }

# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
  }

contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml (new file)

@@ -0,0 +1,48 @@
groups:
- name: ./general.rules
  rules:
  - alert: TargetDown
    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
      summary: Targets are down
  - alert: DeadMansSwitch
    expr: vector(1)
    labels:
      severity: none
    annotations:
      description: This is a DeadMansSwitch meant to ensure that the entire Alerting
        pipeline is functional.
      summary: Alerting DeadMansSwitch
  - alert: TooManyOpenFileDescriptors
    expr: 100 * (process_open_fds / process_max_fds) > 95
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
      summary: too many open file descriptors
  - record: instance:fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
      summary: file descriptors soon exhausted
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
      summary: file descriptors soon exhausted

contrib/kube-prometheus/assets/prometheus/rules/kube-apiserver.rules (deleted)

@@ -1,28 +0,0 @@
ALERT K8SApiserverDown
  IF absent(up{job="apiserver"} == 1)
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
#
# apiserver_request_latencies' unit is microseconds
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-apiserver.rules.yaml (new file)

@@ -0,0 +1,22 @@
groups:
- name: ./kube-apiserver.rules
  rules:
  - alert: K8SApiserverDown
    expr: absent(up{job="apiserver"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Prometheus failed to scrape API server(s), or all API servers have
        disappeared from service discovery.
      summary: API server unreachable
  - alert: K8SApiServerLatency
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"})
      WITHOUT (instance, resource)) / 1e+06 > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 99th percentile Latency for {{ $labels.verb }} requests to the
        kube-apiserver is higher than 1s.
      summary: Kubernetes apiserver latency is high

contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules (deleted)

@@ -1,11 +0,0 @@
ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"} == 1)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml (new file)

@@ -0,0 +1,13 @@
groups:
- name: ./kube-controller-manager.rules
  rules:
  - alert: K8SControllerManagerDown
    expr: absent(up{job="kube-controller-manager"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There is no running K8S controller manager. Deployments and replication
        controllers are not making progress.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
      summary: Controller manager is down

contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules (deleted)

@@ -1,11 +0,0 @@
ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"} == 1)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml (new file)

@@ -0,0 +1,13 @@
groups:
- name: ./kube-scheduler.rules
  rules:
  - alert: K8SSchedulerDown
    expr: absent(up{job="kube-scheduler"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There is no running K8S scheduler. New pods are not being assigned
        to nodes.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
      summary: Scheduler is down

contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules (deleted)

@@ -1,60 +0,0 @@
ALERT K8SNodeNotReady
  IF kube_node_status_condition{condition="Ready", status="true"} == 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
  }

ALERT K8SManyNodesNotReady
  IF
    count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
    AND
    (
      count(kube_node_status_condition{condition="Ready", status="true"} == 0)
      /
      count(kube_node_status_condition{condition="Ready", status="true"})
    ) > 0.2
  FOR 1m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubernetes nodes are Not Ready",
    description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
  }

ALERT K8SKubeletDown
  IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
  FOR 1h
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
  }

contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml (new file)

@@ -0,0 +1,49 @@
groups:
- name: ./kubelet.rules
  rules:
  - alert: K8SNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 1h
    labels:
      severity: warning
    annotations:
      description: The Kubelet on {{ $labels.node }} has not checked in with the API,
        or has set itself to NotReady, for more than an hour
      summary: Node status is NotReady
  - alert: K8SManyNodesNotReady
    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
      > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
      0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
    for: 1m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
        state).'
      summary: Many Kubernetes nodes are Not Ready
  - alert: K8SKubeletDown
    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets.
      summary: Many Kubelets cannot be scraped
  - alert: K8SKubeletDown
    expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
      > 0.1
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
        have disappeared from service discovery.
      summary: Many Kubelets cannot be scraped
  - alert: K8SKubeletTooManyPods
    expr: kubelet_running_pod_count > 100
    labels:
      severity: warning
    annotations:
      description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
        to the limit of 110
      summary: Kubelet is close to pod limit

contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules (deleted)

@@ -1,171 +0,0 @@
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.

### Container resources ###

cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:spec_cpu_shares =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_cpu_shares{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:cpu_usage:rate =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      irate(
        container_cpu_usage_seconds_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_working_set:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_working_set_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_rss:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_rss{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_cache:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_cache{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:disk_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_disk_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_pagefaults:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failures_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_oom:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failcnt{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

### Cluster resources ###

cluster:memory_allocation:percent =
  100 * sum by (cluster) (
    container_spec_memory_limit_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:memory_used:percent =
  100 * sum by (cluster) (
    container_memory_usage_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:cpu_allocation:percent =
  100 * sum by (cluster) (
    container_spec_cpu_shares{pod_name!=""}
  ) / sum by (cluster) (
    container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
  )

cluster:node_cpu_use:percent =
  100 * sum by (cluster) (
    rate(node_cpu{mode!="idle"}[5m])
  ) / sum by (cluster) (
    machine_cpu_cores
  )

### API latency ###

# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6

### Scheduling latency ###

cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6

cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6

cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6

contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml (new file)

@@ -0,0 +1,115 @@
groups:
- name: ./kubernetes.rules
  rules:
  - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
    expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:spec_cpu_shares
    expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:cpu_usage:rate
    expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:memory_usage:bytes
    expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
    expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:memory_rss:bytes
    expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_cache:bytes
    expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:disk_usage:bytes
    expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
    expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name, scope, type)
  - record: cluster_namespace_controller_pod_container:memory_oom:rate
    expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name, scope, type)
  - record: cluster:memory_allocation:percent
    expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
      / sum(machine_memory_bytes) BY (cluster)
  - record: cluster:memory_used:percent
    expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
      BY (cluster)
  - record: cluster:cpu_allocation:percent
    expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
      * ON(cluster, instance) machine_cpu_cores) BY (cluster)
  - record: cluster:node_cpu_use:percent
    expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
      BY (cluster)
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"

contrib/kube-prometheus/assets/prometheus/rules/node.rules (deleted)

@@ -1,43 +0,0 @@
ALERT NodeExporterDown
  IF absent(up{job="node-exporter"} == 1)
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "node-exporter cannot be scraped",
    description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
  }

ALERT K8SNodeOutOfDisk
  IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Node ran out of disk space.",
    description = "{{ $labels.node }} has run out of disk space.",
  }

ALERT K8SNodeMemoryPressure
  IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under memory pressure.",
    description = "{{ $labels.node }} is under memory pressure.",
  }

ALERT K8SNodeDiskPressure
  IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under disk pressure.",
    description = "{{ $labels.node }} is under disk pressure.",
  }

contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml (new file)

@@ -0,0 +1,37 @@
groups:
- name: ./node.rules
  rules:
  - alert: NodeExporterDown
    expr: absent(up{job="node-exporter"} == 1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus could not scrape a node-exporter for more than 10m,
        or node-exporters have disappeared from discovery.
      summary: node-exporter cannot be scraped
  - alert: K8SNodeOutOfDisk
    expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
    labels:
      service: k8s
      severity: critical
    annotations:
      description: '{{ $labels.node }} has run out of disk space.'
      summary: Node ran out of disk space.
  - alert: K8SNodeMemoryPressure
    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
      1
    labels:
      service: k8s
      severity: warning
    annotations:
      description: '{{ $labels.node }} is under memory pressure.'
      summary: Node is under memory pressure.
  - alert: K8SNodeDiskPressure
    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
    labels:
      service: k8s
      severity: warning
    annotations:
      description: '{{ $labels.node }} is under disk pressure.'
      summary: Node is under disk pressure.

contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules (deleted)

@@ -1,10 +0,0 @@
ALERT FailedReload
  IF prometheus_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Prometheus configuration reload has failed",
    description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }

contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml (new file)

@@ -0,0 +1,12 @@
groups:
- name: ./prometheus.rules
  rules:
  - alert: FailedReload
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
        }}/{{ $labels.pod}}.
      summary: Prometheus configuration reload has failed

contrib/kube-prometheus/hack/scripts

@@ -11,7 +11,7 @@ metadata:
data:
EOF

for f in assets/prometheus/rules/*.rules
for f in assets/prometheus/rules/*.rules.yaml
do
  echo " $(basename $f): |+"
  cat $f | sed "s/^/ /g"
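
For context, the loop above is what folds every converted rule file into the generated ConfigMap: it prints the file name as a data key and indents the file body underneath it. A rough sketch of one iteration, using the example.rules.yaml sample from the user guide above; the exact indentation widths in the script were lost in this rendering and are assumed here:

```sh
# One iteration of the ConfigMap-generation loop (a sketch, not the script's
# verbatim output). Indent widths are assumed; the sample file content is
# the ExampleAlert rule shown earlier in this diff.
f=assets/prometheus/rules/example.rules.yaml
echo "  $(basename "$f"): |+"
sed 's/^/    /' "$f"
# Produces a ConfigMap entry shaped roughly like:
#   example.rules.yaml: |+
#     groups:
#     - name: ./example.rules
#       rules:
#       - alert: ExampleAlert
#         expr: vector(1)
```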
File diff suppressed because it is too large

contrib/kube-prometheus/manifests/prometheus

@@ -6,7 +6,7 @@ metadata:
    prometheus: k8s
spec:
  replicas: 2
  version: v1.7.2
  version: v2.0.0-rc.1
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:

example/non-rbac

@@ -6,7 +6,6 @@ metadata:
    prometheus: main
spec:
  replicas: 2
  version: v1.5.2
  serviceMonitorSelector:
    matchExpressions:
    - {key: app, operator: In, values: [node-exporter, example-app]}

example/rbac/prometheus

@@ -6,7 +6,6 @@ metadata:
    prometheus: prometheus
spec:
  replicas: 2
  version: v1.5.2
  serviceAccountName: prometheus
  serviceMonitorSelector:
    matchLabels:

example/user-guides

@@ -6,6 +6,9 @@ metadata:
    role: prometheus-rulefiles
    prometheus: example
data:
  example.rules: |
    ALERT ExampleAlert
      IF vector(1)
  example.rules.yaml: |+
    groups:
    - name: ./example.rules
      rules:
      - alert: ExampleAlert
        expr: vector(1)

@@ -6,7 +6,6 @@ spec:
  serviceMonitorSelector:
    matchLabels:
      team: frontend
  version: v1.7.1
  resources:
    requests:
      memory: 400Mi

pkg/prometheus

@@ -37,7 +37,7 @@ import (

const (
	governingServiceName = "prometheus-operated"
	DefaultVersion = "v1.7.1"
	DefaultVersion = "v2.0.0-rc.1"
	defaultRetention = "24h"
	configMapsFilename = "configmaps.json"
)

@@ -66,7 +66,7 @@ var (
		"v1.7.1",
		"v1.7.2",
		"v1.8.0",
		"v2.0.0-beta.4",
		"v2.0.0-rc.1",
	}
)