# Prometheus alerting rules for cert-manager.
groups:
# Rules about the availability of the cert-manager scrape target itself.
- name: cert-manager
  rules:
  # absent() yields a value only when no `up` series exists for
  # job="cert-manager"; the alert fires once that has held for 10 minutes.
  - alert: CertManagerAbsent
    expr: absent(up{job="cert-manager"})
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: Cert Manager has disappeared from Prometheus service discovery.
      # Folded scalar (>-): the line break below becomes a single space and no
      # trailing newline is added.
      description: >-
        New certificates will not be able to be minted, and existing ones
        can't be renewed until cert-manager is back.
      runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent
# Rules about individual certificates and ACME rate limiting.
- name: certificates
  rules:
  # Warns when the average remaining lifetime (expiration timestamp minus the
  # current time), grouped by exported_namespace/namespace/name, has been
  # below 21 days for one hour.
  - alert: CertManagerCertExpirySoon
    expr: |
      avg by (exported_namespace, namespace, name) (
        certmanager_certificate_expiration_timestamp_seconds - time()
      ) < (21 * 24 * 3600) # 21 days in seconds
    for: 1h
    labels:
      severity: warning
    annotations:
      # Folded scalars (>-): line breaks become single spaces and no trailing
      # newline is added, so each annotation renders as one line.
      summary: >-
        The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from
        expiry, it should have renewed over a week ago.
      description: >-
        The domain that this cert covers will be unavailable after
        {{ $value | humanizeDuration }}. Clients using endpoints that this cert
        protects will start to fail in {{ $value | humanizeDuration }}.
      dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
      runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon
  # Critical when a ready-status series whose condition label is not "True"
  # has had the value 1 for ten minutes.
  - alert: CertManagerCertNotReady
    expr: |
      max by (name, exported_namespace, namespace, condition) (
        certmanager_certificate_ready_status{condition!="True"} == 1
      )
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: The cert `{{ $labels.name }}` is not ready to serve traffic.
      description: >-
        This certificate has not been ready to serve traffic for at least
        10m. If the cert is being renewed or there is another valid cert, the
        ingress controller _may_ be able to serve that instead.
      dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
      runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready
  # Critical when the per-host 5-minute rate of ACME client requests with
  # status="429" has stayed above zero for five minutes.
  - alert: CertManagerHittingRateLimits
    expr: |
      sum by (host) (
        rate(certmanager_http_acme_client_request_count{status="429"}[5m])
      ) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Cert manager hitting LetsEncrypt rate limits.
      description: >-
        Depending on the rate limit, cert-manager may be unable to generate
        certificates for up to a week.
      dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager
      runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits