Mirror of https://github.com/prometheus-operator/prometheus-operator.git (synced 2025-04-21 11:48:53 +00:00)
*: bump Prometheus to v2.0.0-rc.1
parent 74b41e3013
commit 3fbcf77287
33 changed files with 961 additions and 1246 deletions
Changed files:

  Documentation/
  Makefile
  contrib/kube-prometheus/assets/prometheus/rules/
    alertmanager.rules            -> alertmanager.rules.yaml
    etcd3.rules                   -> etcd3.rules.yaml
    general.rules                 -> general.rules.yaml
    kube-apiserver.rules          -> kube-apiserver.rules.yaml
    kube-controller-manager.rules -> kube-controller-manager.rules.yaml
    kube-scheduler.rules          -> kube-scheduler.rules.yaml
    kubelet.rules                 -> kubelet.rules.yaml
    kubernetes.rules              -> kubernetes.rules.yaml
    node.rules                    -> node.rules.yaml
    prometheus.rules              -> prometheus.rules.yaml
  contrib/kube-prometheus/hack/scripts/
  contrib/kube-prometheus/manifests/prometheus/
  example/non-rbac/
  example/rbac/prometheus/
  example/user-guides/
  pkg/prometheus/

Documentation

@@ -31,4 +31,4 @@ The versions of Prometheus compatible to be run with the Prometheus Operator are
* v1.7.1
* v1.7.2
* v1.8.0
* v2.0.0-beta.4
* v2.0.0-rc.1

@@ -147,9 +147,12 @@ metadata:
    role: prometheus-rulefiles
    prometheus: example
data:
  example.rules: |
    ALERT ExampleAlert
      IF vector(1)
  example.rules.yaml: |+
    groups:
    - name: ./example.rules
      rules:
      - alert: ExampleAlert
        expr: vector(1)
```

That example `ConfigMap` always immediately triggers an alert, which is only for demonstration purposes. To validate that everything is working properly have a look at each of the Prometheus web UIs.
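
Prometheus 2.0 no longer accepts the old rule syntax, which is why the user guide example above now carries the grouped YAML form. A minimal conversion sketch, assuming the `promtool update rules` command bundled with Prometheus 2.0 and a placeholder input file name (not a file from this repository):

```sh
# Convert a Prometheus 1.x rule file to the 2.0 YAML rule-group format.
# "example.rules" is a placeholder; promtool should write the converted
# rules to a new YAML file next to the input.
promtool update rules example.rules
```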

@@ -299,7 +299,7 @@ metadata:
    prometheus: k8s
spec:
  replicas: 2
  version: v1.7.2
  version: v2.0.0-rc.1
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:

@@ -255,7 +255,6 @@ spec:
  serviceMonitorSelector:
    matchLabels:
      team: frontend
  version: v1.7.1
  resources:
    requests:
      memory: 400Mi

Makefile (3 changes)

@@ -35,6 +35,9 @@ e2e-test:
	go test -timeout 20m -v ./test/migration/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)
	go test -timeout 20m -v ./test/e2e/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)

e2e-test-only:
	go test -timeout 20m -v ./test/e2e/ $(TEST_RUN_ARGS) --kubeconfig=$(KUBECONFIG) --operator-image=$(REPO):$(TAG) --namespace=$(NAMESPACE)

e2e-status:
	kubectl get prometheus,alertmanager,servicemonitor,statefulsets,deploy,svc,endpoints,pods,cm,secrets,replicationcontrollers --all-namespaces
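
For reference, a hypothetical invocation of the new e2e-test-only target, which runs just the e2e suite and skips the migration tests; only the variable names come from the Makefile above, the values are placeholders:

```sh
# Run only the e2e suite against an existing cluster; every value below is
# an illustrative placeholder, not a default from this repository.
make e2e-test-only \
    KUBECONFIG=$HOME/.kube/config \
    REPO=example.registry/prometheus-operator \
    TAG=dev \
    NAMESPACE=prometheus-operator-e2e
```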

contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules (deleted)

@@ -1,36 +0,0 @@
ALERT AlertmanagerConfigInconsistent
  IF count_values by (service) ("config_hash", alertmanager_config_hash)
     / on(service) group_left
     label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Alertmanager configurations are inconsistent",
    description = "The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync."
  }

ALERT AlertmanagerDownOrMissing
  IF label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
     / on(job) group_right
     sum by(job) (up) != 1
  FOR 5m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager down or not discovered",
    description = "An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery."
  }

ALERT FailedReload
  IF alertmanager_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Alertmanager configuration reload has failed",
    description = "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }

contrib/kube-prometheus/assets/prometheus/rules/alertmanager.rules.yaml (new file)

@@ -0,0 +1,33 @@
groups:
- name: ./alertmanager.rules
  rules:
  - alert: AlertmanagerConfigInconsistent
    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
      "alertmanager-$1", "alertmanager", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: The configuration of the instances of the Alertmanager cluster
        `{{$labels.service}}` are out of sync.
      summary: Alertmanager configurations are inconsistent
  - alert: AlertmanagerDownOrMissing
    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
        disappeared from discovery.
      summary: Alertmanager down or not discovered
  - alert: FailedReload
    expr: alertmanager_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
        }}/{{ $labels.pod}}.
      summary: Alertmanager configuration reload has failed
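
The converted files can be sanity-checked before they are bundled into the rule ConfigMap; a minimal sketch, assuming the `promtool check rules` command from Prometheus 2.0 and a working directory of contrib/kube-prometheus:

```sh
# Validate a converted rule file against the Prometheus 2.0 rule syntax.
promtool check rules assets/prometheus/rules/alertmanager.rules.yaml
```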

contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules (deleted)

@@ -1,177 +0,0 @@
# general cluster availability

# alert if another failed member will result in an unavailable cluster
ALERT InsufficientMembers
  IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd cluster insufficient members",
    description = "If one more etcd member goes down the cluster will be unavailable",
  }

# etcd leader alerts
# ==================

# alert if any etcd instance has no leader
ALERT NoLeader
  IF etcd_server_has_leader{job="etcd"} == 0
  FOR 1m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd member has no leader",
    description = "etcd member {{ $labels.instance }} has no leader",
  }

# alert if there are lots of leader changes
ALERT HighNumberOfLeaderChanges
  IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of leader changes within the etcd cluster are happening",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
  }

# gRPC request alerts
# ===================

# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
  IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
     / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
ALERT HighNumberOfFailedGRPCRequests
  IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
     / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of gRPC requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if the 99th percentile of gRPC method calls take more than 150ms
ALERT GRPCRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "slow gRPC requests",
    description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
  }

# HTTP requests alerts
# ====================

# alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
  IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
     / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.01
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
ALERT HighNumberOfFailedHTTPRequests
  IF sum by(method) (rate(etcd_http_failed_total{job="etcd"}[5m]))
     / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 0.05
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }

# alert if the 99th percentile of HTTP requests take more than 150ms
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }

# etcd member communication alerts
# ================================

# alert if 99th percentile of round trips take 150ms
ALERT EtcdMemberCommunicationSlow
  IF histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "etcd member communication is slow",
    description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
  }

# etcd proposal alerts
# ====================

# alert if there are several failed proposals within an hour
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of proposals within the etcd cluster are failing",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }

# etcd disk io latency alerts
# ===========================

# alert if 99th percentile of fsync durations is higher than 500ms
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fync durations are high",
  }

# alert if 99th percentile of commit durations is higher than 250ms
ALERT HighCommitDurations
  IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high commit durations",
    description = "etcd instance {{ $labels.instance }} commit durations are high",
  }

contrib/kube-prometheus/assets/prometheus/rules/etcd3.rules.yaml (new file, 123 lines)

@@ -0,0 +1,123 @@
groups:
- name: ./etcd3.rules
  rules:
  - alert: InsufficientMembers
    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
    for: 3m
    labels:
      severity: critical
    annotations:
      description: If one more etcd member goes down the cluster will be unavailable
      summary: etcd cluster insufficient members
  - alert: NoLeader
    expr: etcd_server_has_leader{job="etcd"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: etcd member {{ $labels.instance }} has no leader
      summary: etcd member has no leader
  - alert: HighNumberOfLeaderChanges
    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
        changes within the last hour
      summary: a high number of leader changes within the etcd cluster are happening
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
      / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
      / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: GRPCRequestsSlow
    expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
        }} are slow
      summary: slow gRPC requests
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HTTPRequestsSlow
    expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
        }} are slow
      summary: slow HTTP requests
  - alert: EtcdMemberCommunicationSlow
    expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
      summary: etcd member communication is slow
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a high number of proposals within the etcd cluster are failing
  - alert: HighFsyncDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} fync durations are high
      summary: high fsync durations
  - alert: HighCommitDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} commit durations are high
      summary: high commit durations

contrib/kube-prometheus/assets/prometheus/rules/general.rules (deleted)

@@ -1,63 +0,0 @@
### Up Alerting ###

Alert TargetDown
  IF 100 * (count by(job) (up == 0) / count by(job) (up)) > 10
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Targets are down",
    description = "{{ $value }}% or more of {{ $labels.job }} targets are down."
  }

### Dead man's switch ###

ALERT DeadMansSwitch
  IF vector(1)
  LABELS {
    severity = "none",
  }
  ANNOTATIONS {
    summary = "Alerting DeadMansSwitch",
    description = "This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.",
  }

### File descriptor alerts ###

ALERT TooManyOpenFileDescriptors
  IF 100 * (process_open_fds / process_max_fds) > 95
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "too many open file descriptors",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.",
  }

instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
  }

# alert if file descriptors are likely to exhaust within the next hour
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "file descriptors soon exhausted",
    description = "{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.instance }}) instance will exhaust in file/socket descriptors soon",
  }

contrib/kube-prometheus/assets/prometheus/rules/general.rules.yaml (new file)

@@ -0,0 +1,48 @@
groups:
- name: ./general.rules
  rules:
  - alert: TargetDown
    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
      summary: Targets are down
  - alert: DeadMansSwitch
    expr: vector(1)
    labels:
      severity: none
    annotations:
      description: This is a DeadMansSwitch meant to ensure that the entire Alerting
        pipeline is functional.
      summary: Alerting DeadMansSwitch
  - alert: TooManyOpenFileDescriptors
    expr: 100 * (process_open_fds / process_max_fds) > 95
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
      summary: too many open file descriptors
  - record: instance:fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
      summary: file descriptors soon exhausted
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
        $labels.instance }}) instance will exhaust in file/socket descriptors soon'
      summary: file descriptors soon exhausted

contrib/kube-prometheus/assets/prometheus/rules/kube-apiserver.rules (deleted)

@@ -1,28 +0,0 @@
ALERT K8SApiserverDown
  IF absent(up{job="apiserver"} == 1)
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "API server unreachable",
    description = "Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.",
  }

# Some verbs excluded because they are expected to be long-lasting:
# WATCHLIST is long-poll, CONNECT is `kubectl exec`.
#
# apiserver_request_latencies' unit is microseconds
ALERT K8SApiServerLatency
  IF histogram_quantile(
      0.99,
      sum without (instance,resource) (apiserver_request_latencies_bucket{subresource!="log",verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
    ) / 1e6 > 1.0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Kubernetes apiserver latency is high",
    description = "99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-apiserver.rules.yaml (new file)

@@ -0,0 +1,22 @@
groups:
- name: ./kube-apiserver.rules
  rules:
  - alert: K8SApiserverDown
    expr: absent(up{job="apiserver"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Prometheus failed to scrape API server(s), or all API servers have
        disappeared from service discovery.
      summary: API server unreachable
  - alert: K8SApiServerLatency
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"})
      WITHOUT (instance, resource)) / 1e+06 > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 99th percentile Latency for {{ $labels.verb }} requests to the
        kube-apiserver is higher than 1s.
      summary: Kubernetes apiserver latency is high

contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules (deleted)

@@ -1,11 +0,0 @@
ALERT K8SControllerManagerDown
  IF absent(up{job="kube-controller-manager"} == 1)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Controller manager is down",
    description = "There is no running K8S controller manager. Deployments and replication controllers are not making progress.",
    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-controller-manager.rules.yaml (new file)

@@ -0,0 +1,13 @@
groups:
- name: ./kube-controller-manager.rules
  rules:
  - alert: K8SControllerManagerDown
    expr: absent(up{job="kube-controller-manager"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There is no running K8S controller manager. Deployments and replication
        controllers are not making progress.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
      summary: Controller manager is down

contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules (deleted)

@@ -1,11 +0,0 @@
ALERT K8SSchedulerDown
  IF absent(up{job="kube-scheduler"} == 1)
  FOR 5m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Scheduler is down",
    description = "There is no running K8S scheduler. New pods are not being assigned to nodes.",
    runbook = "https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler",
  }

contrib/kube-prometheus/assets/prometheus/rules/kube-scheduler.rules.yaml (new file)

@@ -0,0 +1,13 @@
groups:
- name: ./kube-scheduler.rules
  rules:
  - alert: K8SSchedulerDown
    expr: absent(up{job="kube-scheduler"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      description: There is no running K8S scheduler. New pods are not being assigned
        to nodes.
      runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
      summary: Scheduler is down

contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules (deleted)

@@ -1,60 +0,0 @@
ALERT K8SNodeNotReady
  IF kube_node_status_condition{condition="Ready", status="true"} == 0
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Node status is NotReady",
    description = "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour",
  }

ALERT K8SManyNodesNotReady
  IF
    count(kube_node_status_condition{condition="Ready", status="true"} == 0) > 1
    AND
    (
      count(kube_node_status_condition{condition="Ready", status="true"} == 0)
      /
      count(kube_node_status_condition{condition="Ready", status="true"})
    ) > 0.2
  FOR 1m
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubernetes nodes are Not Ready",
    description = "{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).",
  }

ALERT K8SKubeletDown
  IF count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
  FOR 1h
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets.",
  }

ALERT K8SKubeletDown
  IF absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
  FOR 1h
  LABELS {
    severity = "critical",
  }
  ANNOTATIONS {
    summary = "Many Kubelets cannot be scraped",
    description = "Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.",
  }

ALERT K8SKubeletTooManyPods
  IF kubelet_running_pod_count > 100
  LABELS {
    severity = "warning",
  }
  ANNOTATIONS {
    summary = "Kubelet is close to pod limit",
    description = "Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110",
  }

contrib/kube-prometheus/assets/prometheus/rules/kubelet.rules.yaml (new file)

@@ -0,0 +1,49 @@
groups:
- name: ./kubelet.rules
  rules:
  - alert: K8SNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 1h
    labels:
      severity: warning
    annotations:
      description: The Kubelet on {{ $labels.node }} has not checked in with the API,
        or has set itself to NotReady, for more than an hour
      summary: Node status is NotReady
  - alert: K8SManyNodesNotReady
    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
      > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
      0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
    for: 1m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
        state).'
      summary: Many Kubernetes nodes are Not Ready
  - alert: K8SKubeletDown
    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets.
      summary: Many Kubelets cannot be scraped
  - alert: K8SKubeletDown
    expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
      > 0.1
    for: 1h
    labels:
      severity: critical
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
        have disappeared from service discovery.
      summary: Many Kubelets cannot be scraped
  - alert: K8SKubeletTooManyPods
    expr: kubelet_running_pod_count > 100
    labels:
      severity: warning
    annotations:
      description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
        to the limit of 110
      summary: Kubelet is close to pod limit

contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules (deleted)

@@ -1,171 +0,0 @@
# NOTE: These rules were kindly contributed by the SoundCloud engineering team.

### Container resources ###

cluster_namespace_controller_pod_container:spec_memory_limit_bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:spec_cpu_shares =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_spec_cpu_shares{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:cpu_usage:rate =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      irate(
        container_cpu_usage_seconds_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_working_set:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_working_set_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_rss:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_rss{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_cache:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_memory_cache{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:disk_usage:bytes =
  sum by (cluster,namespace,controller,pod_name,container_name) (
    label_replace(
      container_disk_usage_bytes{container_name!=""},
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_pagefaults:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failures_total{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

cluster_namespace_controller_pod_container:memory_oom:rate =
  sum by (cluster,namespace,controller,pod_name,container_name,scope,type) (
    label_replace(
      irate(
        container_memory_failcnt{container_name!=""}[5m]
      ),
      "controller", "$1",
      "pod_name", "^(.*)-[a-z0-9]+"
    )
  )

### Cluster resources ###

cluster:memory_allocation:percent =
  100 * sum by (cluster) (
    container_spec_memory_limit_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:memory_used:percent =
  100 * sum by (cluster) (
    container_memory_usage_bytes{pod_name!=""}
  ) / sum by (cluster) (
    machine_memory_bytes
  )

cluster:cpu_allocation:percent =
  100 * sum by (cluster) (
    container_spec_cpu_shares{pod_name!=""}
  ) / sum by (cluster) (
    container_spec_cpu_shares{id="/"} * on(cluster,instance) machine_cpu_cores
  )

cluster:node_cpu_use:percent =
  100 * sum by (cluster) (
    rate(node_cpu{mode!="idle"}[5m])
  ) / sum by (cluster) (
    machine_cpu_cores
  )

### API latency ###

# Raw metrics are in microseconds. Convert to seconds.
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(
    0.99,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(
    0.9,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6
cluster_resource_verb:apiserver_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(
    0.5,
    sum by(le,cluster,job,resource,verb) (apiserver_request_latencies_bucket)
  ) / 1e6

### Scheduling latency ###

cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6
cluster:scheduler_e2e_scheduling_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_e2e_scheduling_latency_microseconds_bucket)) / 1e6

cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6
cluster:scheduler_scheduling_algorithm_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_scheduling_algorithm_latency_microseconds_bucket)) / 1e6

cluster:scheduler_binding_latency:quantile_seconds{quantile="0.99"} =
  histogram_quantile(0.99,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.9"} =
  histogram_quantile(0.9,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6
cluster:scheduler_binding_latency:quantile_seconds{quantile="0.5"} =
  histogram_quantile(0.5,sum by (le,cluster) (scheduler_binding_latency_microseconds_bucket)) / 1e6

contrib/kube-prometheus/assets/prometheus/rules/kubernetes.rules.yaml (new file)

@@ -0,0 +1,115 @@
groups:
- name: ./kubernetes.rules
  rules:
  - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
    expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:spec_cpu_shares
    expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:cpu_usage:rate
    expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:memory_usage:bytes
    expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
    expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name)
  - record: cluster_namespace_controller_pod_container:memory_rss:bytes
    expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_cache:bytes
    expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:disk_usage:bytes
    expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
      "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
      container_name)
  - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
    expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name, scope, type)
  - record: cluster_namespace_controller_pod_container:memory_oom:rate
    expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
      "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
      controller, pod_name, container_name, scope, type)
  - record: cluster:memory_allocation:percent
    expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
      / sum(machine_memory_bytes) BY (cluster)
  - record: cluster:memory_used:percent
    expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
      BY (cluster)
  - record: cluster:cpu_allocation:percent
    expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
      * ON(cluster, instance) machine_cpu_cores) BY (cluster)
  - record: cluster:node_cpu_use:percent
    expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
      BY (cluster)
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
      cluster, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.99"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.9"
  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
      BY (le, cluster)) / 1e+06
    labels:
      quantile: "0.5"

contrib/kube-prometheus/assets/prometheus/rules/node.rules (deleted)

@@ -1,43 +0,0 @@
ALERT NodeExporterDown
  IF absent(up{job="node-exporter"} == 1)
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "node-exporter cannot be scraped",
    description = "Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery.",
  }

ALERT K8SNodeOutOfDisk
  IF kube_node_status_condition{condition="OutOfDisk", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "Node ran out of disk space.",
    description = "{{ $labels.node }} has run out of disk space.",
  }

ALERT K8SNodeMemoryPressure
  IF kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under memory pressure.",
    description = "{{ $labels.node }} is under memory pressure.",
  }

ALERT K8SNodeDiskPressure
  IF kube_node_status_condition{condition="DiskPressure", status="true"} == 1
  LABELS {
    service = "k8s",
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Node is under disk pressure.",
    description = "{{ $labels.node }} is under disk pressure.",
  }

contrib/kube-prometheus/assets/prometheus/rules/node.rules.yaml (new file)

@@ -0,0 +1,37 @@
groups:
- name: ./node.rules
  rules:
  - alert: NodeExporterDown
    expr: absent(up{job="node-exporter"} == 1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus could not scrape a node-exporter for more than 10m,
        or node-exporters have disappeared from discovery.
      summary: node-exporter cannot be scraped
  - alert: K8SNodeOutOfDisk
    expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
    labels:
      service: k8s
      severity: critical
    annotations:
      description: '{{ $labels.node }} has run out of disk space.'
      summary: Node ran out of disk space.
  - alert: K8SNodeMemoryPressure
    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
      1
    labels:
      service: k8s
      severity: warning
    annotations:
      description: '{{ $labels.node }} is under memory pressure.'
      summary: Node is under memory pressure.
  - alert: K8SNodeDiskPressure
    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
    labels:
      service: k8s
      severity: warning
    annotations:
      description: '{{ $labels.node }} is under disk pressure.'
      summary: Node is under disk pressure.

contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules (deleted)

@@ -1,10 +0,0 @@
ALERT FailedReload
  IF prometheus_config_last_reload_successful == 0
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "Prometheus configuration reload has failed",
    description = "Reloading Prometheus' configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}."
  }

contrib/kube-prometheus/assets/prometheus/rules/prometheus.rules.yaml (new file)

@@ -0,0 +1,12 @@
groups:
- name: ./prometheus.rules
  rules:
  - alert: FailedReload
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
        }}/{{ $labels.pod}}.
      summary: Prometheus configuration reload has failed

contrib/kube-prometheus/hack/scripts

@@ -11,7 +11,7 @@ metadata:
data:
EOF

for f in assets/prometheus/rules/*.rules
for f in assets/prometheus/rules/*.rules.yaml
do
  echo " $(basename $f): |+"
  cat $f | sed "s/^/ /g"
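
For context, the loop above is what folds every converted rule file into the generated ConfigMap: it prints the file name as a data key and indents the file body underneath it. A rough sketch of one iteration, using the example.rules.yaml sample from the user guide above; the exact indentation widths in the script were lost in this rendering and are assumed here:

```sh
# One iteration of the ConfigMap-generation loop (a sketch, not the script's
# verbatim output). Indent widths are assumed; the sample file content is
# the ExampleAlert rule shown earlier in this diff.
f=assets/prometheus/rules/example.rules.yaml
echo "  $(basename "$f"): |+"
sed 's/^/    /' "$f"
# Produces a ConfigMap entry shaped roughly like:
#   example.rules.yaml: |+
#     groups:
#     - name: ./example.rules
#       rules:
#       - alert: ExampleAlert
#         expr: vector(1)
```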
File diff suppressed because it is too large

contrib/kube-prometheus/manifests/prometheus

@@ -6,7 +6,7 @@ metadata:
    prometheus: k8s
spec:
  replicas: 2
  version: v1.7.2
  version: v2.0.0-rc.1
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:

example/non-rbac

@@ -6,7 +6,6 @@ metadata:
    prometheus: main
spec:
  replicas: 2
  version: v1.5.2
  serviceMonitorSelector:
    matchExpressions:
    - {key: app, operator: In, values: [node-exporter, example-app]}

example/rbac/prometheus

@@ -6,7 +6,6 @@ metadata:
    prometheus: prometheus
spec:
  replicas: 2
  version: v1.5.2
  serviceAccountName: prometheus
  serviceMonitorSelector:
    matchLabels:

example/user-guides

@@ -6,6 +6,9 @@ metadata:
    role: prometheus-rulefiles
    prometheus: example
data:
  example.rules: |
    ALERT ExampleAlert
      IF vector(1)
  example.rules.yaml: |+
    groups:
    - name: ./example.rules
      rules:
      - alert: ExampleAlert
        expr: vector(1)

@@ -6,7 +6,6 @@ spec:
  serviceMonitorSelector:
    matchLabels:
      team: frontend
  version: v1.7.1
  resources:
    requests:
      memory: 400Mi

pkg/prometheus

@@ -37,7 +37,7 @@ import (

const (
	governingServiceName = "prometheus-operated"
	DefaultVersion = "v1.7.1"
	DefaultVersion = "v2.0.0-rc.1"
	defaultRetention = "24h"
	configMapsFilename = "configmaps.json"
)

@@ -66,7 +66,7 @@ var (
		"v1.7.1",
		"v1.7.2",
		"v1.8.0",
		"v2.0.0-beta.4",
		"v2.0.0-rc.1",
	}
)