diff --git a/assets/etcd/alerts.yaml b/assets/etcd/alerts.yaml index 6ad096d..26724bd 100644 --- a/assets/etcd/alerts.yaml +++ b/assets/etcd/alerts.yaml @@ -141,39 +141,6 @@ groups: for: 10m labels: severity: warning - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: etcd has high number of failed HTTP requests. - expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - without (code) > 0.01 - for: 10m - labels: - severity: warning - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}.' - summary: etcd has high number of failed HTTP requests. - expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - without (code) > 0.05 - for: 10m - labels: - severity: critical - - alert: etcdHTTPRequestsSlow - annotations: - description: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method - }} are slow. - summary: etcd instance HTTP requests are slow. - expr: | - histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning - alert: etcdBackendQuotaLowSpace annotations: message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined diff --git a/assets/etcd/dashboards/etcd.json b/assets/etcd/dashboards/etcd.json index 8c89714..783bf01 100644 --- a/assets/etcd/dashboards/etcd.json +++ b/assets/etcd/dashboards/etcd.json @@ -1014,7 +1014,9 @@ "schemaVersion": 13, "sharedCrosshair": false, "style": "dark", - "tags": [ ], + "tags": [ + "etcd-mixin" + ], "templating": { "list": [ { diff --git a/assets/prometheus/alerts.yaml b/assets/prometheus/alerts.yaml index 913437e..6748794 100644 --- a/assets/prometheus/alerts.yaml +++ b/assets/prometheus/alerts.yaml @@ -121,12 +121,12 @@ groups: summary: Prometheus fails to send samples to remote storage. expr: | ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m])) ) ) * 100 diff --git a/site/content/etcd/_index.md b/site/content/etcd/_index.md index 89490db..f18d159 100644 --- a/site/content/etcd/_index.md +++ b/site/content/etcd/_index.md @@ -215,54 +215,6 @@ labels: severity: warning {{< /code >}} -##### etcdHighNumberOfFailedHTTPRequests - -{{< code lang="yaml" >}} -alert: etcdHighNumberOfFailedHTTPRequests -annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}' - summary: etcd has high number of failed HTTP requests. -expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - without (code) > 0.01 -for: 10m -labels: - severity: warning -{{< /code >}} - -##### etcdHighNumberOfFailedHTTPRequests - -{{< code lang="yaml" >}} -alert: etcdHighNumberOfFailedHTTPRequests -annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd - instance {{ $labels.instance }}.' - summary: etcd has high number of failed HTTP requests. -expr: | - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - without (code) > 0.05 -for: 10m -labels: - severity: critical -{{< /code >}} - -##### etcdHTTPRequestsSlow - -{{< code lang="yaml" >}} -alert: etcdHTTPRequestsSlow -annotations: - description: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method - }} are slow. - summary: etcd instance HTTP requests are slow. -expr: | - histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 -for: 10m -labels: - severity: warning -{{< /code >}} - ##### etcdBackendQuotaLowSpace {{< code lang="yaml" >}} diff --git a/site/content/prometheus/_index.md b/site/content/prometheus/_index.md index 415f4c1..f81f313 100644 --- a/site/content/prometheus/_index.md +++ b/site/content/prometheus/_index.md @@ -190,12 +190,12 @@ annotations: summary: Prometheus fails to send samples to remote storage. expr: | ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m])) ) ) * 100