mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-15 17:50:48 +00:00
Merge pull request #12 from paulfantom/cert-manager
This commit is contained in:
commit
49b9510241
6 changed files with 1351 additions and 2 deletions
70
assets/cert-manager/alerts.yaml
Normal file
70
assets/cert-manager/alerts.yaml
Normal file
|
@ -0,0 +1,70 @@
|
|||
groups:
|
||||
- name: cert-manager
|
||||
rules:
|
||||
- alert: CertManagerAbsent
|
||||
annotations:
|
||||
description: New certificates will not be able to be minted, and existing ones
|
||||
can't be renewed until cert-manager is back.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
|
||||
summary: Cert Manager has disappeared from Prometheus service discovery.
|
||||
expr: absent(up{job="cert-manager"})
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CertManagerCertExpirySoon
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: The domain that this cert covers will be unavailable after {{ $value
|
||||
| humanizeDuration }}. Clients using endpoints that this cert protects will
|
||||
start to fail in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpirySoon
|
||||
summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from
|
||||
expiry, it should have renewed over a week ago.
|
||||
expr: |
|
||||
avg by (exported_namespace, namespace, name) (
|
||||
certmanager_certificate_expiration_timestamp_seconds - time()
|
||||
) < (21 * 24 * 3600) # 21 days in seconds
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertManagerCertNotReady
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: This certificate has not been ready to serve traffic for at least
|
||||
10m. If the cert is being renewed or there is another valid cert, the ingress
|
||||
controller _may_ be able to serve that instead.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertNotReady
|
||||
summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
|
||||
expr: |
|
||||
max by (name, exported_namespace, namespace, condition) (
|
||||
certmanager_certificate_ready_status{condition!="True"} == 1
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CertManagerCertExpiryMetricMissing
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: We are blind as to whether or not we can alert on certificates
|
||||
expiring. It could also be the case that there have not been any Certificate
|
||||
CRDs created.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpiryMetricMissing
|
||||
summary: The metric used to observe cert-manager cert expiry is missing.
|
||||
expr: absent(certmanager_certificate_expiration_timestamp_seconds)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: CertManagerHittingRateLimits
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: Depending on the rate limit, cert-manager may be unable to generate
|
||||
certificates for up to a week.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerHittingRateLimits
|
||||
summary: Cert manager hitting LetsEncrypt rate limits.
|
||||
expr: |
|
||||
sum by (host) (
|
||||
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
|
||||
) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
1158
assets/cert-manager/dashboards/cert-manager.json
Normal file
1158
assets/cert-manager/dashboards/cert-manager.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/cert-manager/rules.yaml
Normal file
1
assets/cert-manager/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
groups: []
|
|
@ -153,7 +153,7 @@
|
|||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
|
||||
"expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
|
@ -233,7 +233,7 @@
|
|||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
|
||||
"expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
|
|
|
@ -64,4 +64,7 @@ mixins:
|
|||
- name: prometheus-operator
|
||||
source: "https://github.com/prometheus-operator/prometheus-operator"
|
||||
subdir: "jsonnet/mixin"
|
||||
- name: cert-manager
|
||||
source: "https://gitlab.com/uneeq-oss/cert-manager-mixin.git"
|
||||
subdir: ""
|
||||
|
||||
|
|
117
site/content/cert-manager/_index.md
Normal file
117
site/content/cert-manager/_index.md
Normal file
|
@ -0,0 +1,117 @@
|
|||
---
|
||||
title: cert-manager
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [gitlab.com/uneeq-oss/cert-manager-mixin.git](https://gitlab.com/uneeq-oss/cert-manager-mixin.git)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/cert-manager/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### cert-manager
|
||||
|
||||
##### CertManagerAbsent
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CertManagerAbsent
|
||||
annotations:
|
||||
description: New certificates will not be able to be minted, and existing ones can't
|
||||
be renewed until cert-manager is back.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent
|
||||
summary: Cert Manager has disappeared from Prometheus service discovery.
|
||||
expr: absent(up{job="cert-manager"})
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CertManagerCertExpirySoon
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CertManagerCertExpirySoon
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: The domain that this cert covers will be unavailable after {{ $value
|
||||
| humanizeDuration }}. Clients using endpoints that this cert protects will start
|
||||
to fail in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpirySoon
|
||||
summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry,
|
||||
it should have renewed over a week ago.
|
||||
expr: |
|
||||
avg by (exported_namespace, namespace, name) (
|
||||
certmanager_certificate_expiration_timestamp_seconds - time()
|
||||
) < (21 * 24 * 3600) # 21 days in seconds
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CertManagerCertNotReady
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CertManagerCertNotReady
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: This certificate has not been ready to serve traffic for at least 10m.
|
||||
If the cert is being renewed or there is another valid cert, the ingress controller
|
||||
_may_ be able to serve that instead.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertNotReady
|
||||
summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
|
||||
expr: |
|
||||
max by (name, exported_namespace, namespace, condition) (
|
||||
certmanager_certificate_ready_status{condition!="True"} == 1
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CertManagerCertExpiryMetricMissing
|
||||
We are blind as to whether or not we can alert on certificates expiring.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CertManagerCertExpiryMetricMissing
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: We are blind as to whether or not we can alert on certificates expiring.
|
||||
It could also be the case that there have not been any Certificate CRDs created.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerCertExpiryMetricMissing
|
||||
summary: The metric used to observe cert-manager cert expiry is missing.
|
||||
expr: absent(certmanager_certificate_expiration_timestamp_seconds)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### CertManagerHittingRateLimits
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CertManagerHittingRateLimits
|
||||
annotations:
|
||||
dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
|
||||
description: Depending on the rate limit, cert-manager may be unable to generate
|
||||
certificates for up to a week.
|
||||
runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#CertManagerHittingRateLimits
|
||||
summary: Cert manager hitting LetsEncrypt rate limits.
|
||||
expr: |
|
||||
sum by (host) (
|
||||
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
|
||||
) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [cert-manager](https://github.com/monitoring-mixins/website/blob/master/assets/cert-manager/dashboards/cert-manager.json)
|
Loading…
Reference in a new issue