From da042cf1788a8c3a0da194732fd3735b76b405a3 Mon Sep 17 00:00:00 2001 From: imusmanmalik Date: Mon, 12 Jun 2023 15:32:17 +0200 Subject: [PATCH] feat: Add updated cert-manager mixin Signed-off-by: imusmanmalik --- assets/cert-manager/alerts.yaml | 10 +- .../cert-manager/dashboards/cert-manager.json | 112 +++++++++++++----- mixins.json | 2 +- site/content/cert-manager/_index.md | 12 +- site/static/mixins.json | 2 +- 5 files changed, 98 insertions(+), 40 deletions(-) diff --git a/assets/cert-manager/alerts.yaml b/assets/cert-manager/alerts.yaml index c0db9f5..fc81f4f 100644 --- a/assets/cert-manager/alerts.yaml +++ b/assets/cert-manager/alerts.yaml @@ -5,8 +5,8 @@ groups: annotations: description: New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent - summary: Cert Manager has dissapeared from Prometheus service discovery. + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent + summary: Cert Manager has disappeared from Prometheus service discovery. expr: absent(up{job="cert-manager"}) for: 10m labels: @@ -19,7 +19,7 @@ groups: description: The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago. expr: | @@ -35,7 +35,7 @@ groups: description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready summary: The cert `{{ $labels.name }}` is not ready to serve traffic. expr: | max by (name, exported_namespace, namespace, condition) ( @@ -49,7 +49,7 @@ groups: dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits summary: Cert manager hitting LetsEncrypt rate limits. expr: | sum by (host) ( diff --git a/assets/cert-manager/dashboards/cert-manager.json b/assets/cert-manager/dashboards/cert-manager.json index 2d6608b..a77023c 100644 --- a/assets/cert-manager/dashboards/cert-manager.json +++ b/assets/cert-manager/dashboards/cert-manager.json @@ -1,4 +1,33 @@ { + "_config": { + "certManagerCertExpiryDays": "21", + "certManagerJobLabel": "cert-manager", + "certManagerRunbookURLPattern": "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#%s", + "dashboards": { + "certmanagerCertificateExpirationTimestampSecondsSelector": "", + "certmanagerCertificateReadyStatusSelector": "", + "certmanagerControllerSyncCallCountSelector": "", + "certmanagerHttpAcmeClientRequestCountSelector": "", + "certmanagerHttpAcmeClientRequestDurationSecondsCountSelector": "", + "certmanagerHttpAcmeClientRequestDurationSecondsSumSelector": "", + "clusterVariableSelector": "", + "containerCPUUsageSecondsTotalSelector": "container=\"cert-manager\"", + "containerCpuCfsPeriodsTotalSelector": "container=\"cert-manager\"", + "containerCpuCfsThrottledPeriodsTotalSelector": "container=\"cert-manager\"", + "containerMemoryUsageBytesSelector": "container=\"cert-manager\"", + "containerNetworkReceiveBytesTotalSelector": "namespace=\"cert-manager\"", + "containerNetworkTransmitBytesTotalSelector": "namespace=\"cert-manager\"", + "containerSelector": "container=\"cert-manager\"", + "defaultSelector": "", + "enableMultiCluster": false, + "kubePodContainerResourceLimitsCpuCoresSelector": "container=\"cert-manager\"", + "kubePodContainerResourceLimitsMemoryBytesSelector": "container=\"cert-manager\"", + "kubePodContainerResourceRequestsCpuCoresSelector": "container=\"cert-manager\"", + "kubePodContainerResourceRequestsMemoryBytesSelector": "container=\"cert-manager\"", + "namespaceSelector": "namespace=\"cert-manager\"" + }, + "grafanaExternalUrl": "https://grafana.example.com" + }, "annotations": { "list": [ { @@ -89,9 +118,9 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "sum by (condition) (certmanager_certificate_ready_status)", + "expr": "sum by (condition) (certmanager_certificate_ready_status{ })", "interval": "", - "legendFormat": "{{condition}}", + "legendFormat": "{ {condition } }", "refId": "A" } ], @@ -153,7 +182,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "min(certmanager_certificate_expiration_timestamp_seconds > 0) - time()", + "expr": "min(certmanager_certificate_expiration_timestamp_seconds{ } > 0) - time()", "hide": false, "instant": true, "interval": "", @@ -267,7 +296,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "label_join(avg by (name, namespace, condition, exported_namespace) (certmanager_certificate_ready_status == 1), \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")", + "expr": "label_join(avg by (name, namespace, condition, exported_namespace) (certmanager_certificate_ready_status{ } == 1), \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")", "format": "table", "instant": true, "interval": "", @@ -275,7 +304,7 @@ "refId": "A" }, { - "expr": "label_join(avg by (name, namespace, exported_namespace) (certmanager_certificate_expiration_timestamp_seconds) * 1000, \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")", + "expr": "label_join(avg by (name, namespace, exported_namespace) (certmanager_certificate_expiration_timestamp_seconds{ }) * 1000, \"namespaced_name\", \"-\", \"namespace\", \"exported_namespace\", \"name\")", "format": "table", "instant": true, "interval": "", @@ -392,9 +421,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (controller) (\n rate(certmanager_controller_sync_call_count[$__rate_interval])\n)", + "expr": "sum by (controller) (\n rate(certmanager_controller_sync_call_count{ }[$__rate_interval ])\n)", "interval": "", - "legendFormat": "{{controller}}", + "legendFormat": "{ {controller } }", "refId": "A" } ], @@ -493,9 +522,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (method, path, status) (\n rate(certmanager_http_acme_client_request_count[$__rate_interval])\n)", + "expr": "sum by (method, path, status) (\n rate(certmanager_http_acme_client_request_count{ }[$__rate_interval ])\n)", "interval": "", - "legendFormat": "{{method}} {{path}} {{status}}", + "legendFormat": "{ {method } } { {path } } { {status } }", "refId": "A" } ], @@ -594,9 +623,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_sum[$__rate_interval]))\n/\nsum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_count[$__rate_interval]))", + "expr": "sum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_sum{ }[$__rate_interval ]))\n/\nsum by (method, path, status) (rate(certmanager_http_acme_client_request_duration_seconds_count{ }[$__rate_interval ]))", "interval": "", - "legendFormat": "{{method}} {{path}} {{status}}", + "legendFormat": "{ {method } } { {path } } { {status } }", "refId": "A" } ], @@ -712,30 +741,30 @@ "steppedLine": false, "targets": [ { - "expr": "avg by (pod) (rate(container_cpu_usage_seconds_total{container=\"cert-manager\"}[$__rate_interval]))", + "expr": "avg by (pod) (rate(container_cpu_usage_seconds_total{container=\"cert-manager\" }[$__rate_interval ]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, - "legendFormat": "CPU {{pod}}", + "legendFormat": "CPU { {pod } }", "refId": "A" }, { - "expr": "avg by (pod) (kube_pod_container_resource_limits_cpu_cores{container=\"cert-manager\"})", + "expr": "avg by (pod) (kube_pod_container_resource_limits_cpu_cores{container=\"cert-manager\" })", "format": "time_series", "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "Limit {{pod}}", + "legendFormat": "Limit { {pod } }", "refId": "B" }, { - "expr": "avg by (pod) (kube_pod_container_resource_requests_cpu_cores{container=\"cert-manager\"})", + "expr": "avg by (pod) (kube_pod_container_resource_requests_cpu_cores{container=\"cert-manager\" })", "format": "time_series", "hide": true, "interval": "", "intervalFactor": 1, - "legendFormat": "Request {{pod}}", + "legendFormat": "Request { {pod } }", "refId": "C" } ], @@ -841,12 +870,12 @@ "steppedLine": false, "targets": [ { - "expr": "avg by (pod) (\n rate(container_cpu_cfs_throttled_periods_total{container=\"cert-manager\"}[$__rate_interval])\n /\n rate(container_cpu_cfs_periods_total{container=\"cert-manager\"}[$__rate_interval])\n)", + "expr": "avg by (pod) (\n rate(container_cpu_cfs_throttled_periods_total{container=\"cert-manager\" }[$__rate_interval ])\n /\n rate(container_cpu_cfs_periods_total{container=\"cert-manager\" }[$__rate_interval ])\n)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{ {pod } }", "refId": "A" } ], @@ -962,28 +991,28 @@ "steppedLine": false, "targets": [ { - "expr": "avg by (pod) (container_memory_usage_bytes{container=\"cert-manager\"})", + "expr": "avg by (pod) (container_memory_usage_bytes{container=\"cert-manager\" })", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "Memory {{pod}}", + "legendFormat": "Memory { {pod } }", "refId": "A" }, { - "expr": "avg by (pod) (kube_pod_container_resource_limits_memory_bytes{container=\"cert-manager\"})", + "expr": "avg by (pod) (kube_pod_container_resource_limits_memory_bytes{container=\"cert-manager\" })", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "Limit {{pod}}", + "legendFormat": "Limit { {pod } }", "refId": "B" }, { - "expr": "avg by (pod) (kube_pod_container_resource_requests_memory_bytes{container=\"cert-manager\"})", + "expr": "avg by (pod) (kube_pod_container_resource_requests_memory_bytes{container=\"cert-manager\" })", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "Request {{pod}}", + "legendFormat": "Request { {pod } }", "refId": "C" } ], @@ -1087,7 +1116,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(\n sum without (interface) (\n rate(container_network_receive_bytes_total{namespace=\"cert-manager\"}[$__rate_interval])\n )\n)", + "expr": "avg(\n sum without (interface) (\n rate(container_network_receive_bytes_total{namespace=\"cert-manager\" }[$__rate_interval ])\n )\n)", "format": "time_series", "hide": false, "interval": "", @@ -1096,7 +1125,7 @@ "refId": "A" }, { - "expr": "avg(\n sum without (interface) (\n rate(container_network_transmit_bytes_total{namespace=\"cert-manager\"}[$__rate_interval])\n )\n)", + "expr": "avg(\n sum without (interface) (\n rate(container_network_transmit_bytes_total{namespace=\"cert-manager\" }[$__rate_interval ])\n )\n)", "format": "time_series", "hide": false, "interval": "", @@ -1176,6 +1205,35 @@ "regex": "", "skipUrlSync": false, "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [ ], + "query": { + "query": "label_values(certmanager_certificate_ready_status{ }, cluster)", + "refId": "Prometheus-cluster-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, diff --git a/mixins.json b/mixins.json index 2e7f7df..87a41b4 100644 --- a/mixins.json +++ b/mixins.json @@ -97,7 +97,7 @@ }, { "name": "cert-manager", - "source": "https://gitlab.com/uneeq-oss/cert-manager-mixin.git", + "source": "https://github.com/imusmanmalik/cert-manager-mixin.git", "subdir": "" }, { diff --git a/site/content/cert-manager/_index.md b/site/content/cert-manager/_index.md index 7da853d..ccb8586 100644 --- a/site/content/cert-manager/_index.md +++ b/site/content/cert-manager/_index.md @@ -7,7 +7,7 @@ title: cert-manager {{< panel style="danger" >}} -Jsonnet source code is available at [gitlab.com/uneeq-oss/cert-manager-mixin.git](https://gitlab.com/uneeq-oss/cert-manager-mixin.git) +Jsonnet source code is available at [github.com/imusmanmalik/cert-manager-mixin.git](https://github.com/imusmanmalik/cert-manager-mixin.git) {{< /panel >}} ## Alerts @@ -25,8 +25,8 @@ alert: CertManagerAbsent annotations: description: New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerabsent - summary: Cert Manager has dissapeared from Prometheus service discovery. + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerabsent + summary: Cert Manager has disappeared from Prometheus service discovery. expr: absent(up{job="cert-manager"}) for: 10m labels: @@ -44,7 +44,7 @@ annotations: description: The domain that this cert covers will be unavailable after {{ $value | humanizeDuration }}. Clients using endpoints that this cert protects will start to fail in {{ $value | humanizeDuration }}. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertexpirysoon summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from expiry, it should have renewed over a week ago. expr: | @@ -65,7 +65,7 @@ annotations: description: This certificate has not been ready to serve traffic for at least 10m. If the cert is being renewed or there is another valid cert, the ingress controller _may_ be able to serve that instead. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready summary: The cert `{{ $labels.name }}` is not ready to serve traffic. expr: | max by (name, exported_namespace, namespace, condition) ( @@ -84,7 +84,7 @@ annotations: dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager description: Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits + runbook_url: https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagerhittingratelimits summary: Cert manager hitting LetsEncrypt rate limits. expr: | sum by (host) ( diff --git a/site/static/mixins.json b/site/static/mixins.json index 2e7f7df..87a41b4 100644 --- a/site/static/mixins.json +++ b/site/static/mixins.json @@ -97,7 +97,7 @@ }, { "name": "cert-manager", - "source": "https://gitlab.com/uneeq-oss/cert-manager-mixin.git", + "source": "https://github.com/imusmanmalik/cert-manager-mixin.git", "subdir": "" }, {