From 314aaa6710cda9dc268f1ab91e880a82df425d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Krupa=20=28paulfantom=29?= Date: Fri, 9 Dec 2022 14:20:49 +0100 Subject: [PATCH 1/2] *: add postgres-exporter mixin --- assets/postgres-exporter/alerts.yaml | 105 ++ .../dashboards/postgres-overview.json | 1438 +++++++++++++++++ assets/postgres-exporter/rules.yaml | 1 + mixins.json | 5 + site/content/postgres-exporter/_index.md | 157 ++ site/static/mixins.json | 5 + 6 files changed, 1711 insertions(+) create mode 100644 assets/postgres-exporter/alerts.yaml create mode 100644 assets/postgres-exporter/dashboards/postgres-overview.json create mode 100644 assets/postgres-exporter/rules.yaml create mode 100644 site/content/postgres-exporter/_index.md diff --git a/assets/postgres-exporter/alerts.yaml b/assets/postgres-exporter/alerts.yaml new file mode 100644 index 0000000..0187fca --- /dev/null +++ b/assets/postgres-exporter/alerts.yaml @@ -0,0 +1,105 @@ +groups: +- name: PostgreSQL + rules: + - alert: PostgreSQLMaxConnectionsReached + annotations: + description: '{{ $labels.instance }} is exceeding the currently configured maximum + Postgres connection limit (current value: {{ $value }}s). Services may be + degraded - please take immediate action (you probably need to increase max_connections + in the Docker image and re-deploy.' + summary: '{{ $labels.instance }} has maxed out Postgres connections.' + expr: | + sum by (instance) (pg_stat_activity_count{}) + >= + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) + for: 1m + labels: + severity: warning + - alert: PostgreSQLHighConnections + annotations: + description: '{{ $labels.instance }} is exceeding 80% of the currently configured + maximum Postgres connection limit (current value: {{ $value }}s). 
Please check + utilization graphs and confirm if this is normal service growth, abuse or + an otherwise temporary condition or if new resources need to be provisioned + (or the limits increased, which is most likely).' + summary: '{{ $labels.instance }} is over 80% of max Postgres connections.' + expr: | + sum by (instance) (pg_stat_activity_count{}) + > + ( + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) + ) * 0.8 + for: 10m + labels: + severity: warning + - alert: PostgreSQLDown + annotations: + description: '{{ $labels.instance }} is rejecting query requests from the exporter, + and thus probably not allowing DNS requests to work either. User services + should not be affected provided at least 1 node is still alive.' + summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}' + expr: pg_up{} != 1 + for: 1m + labels: + severity: warning + - alert: PostgreSQLSlowQueries + annotations: + description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for + database {{ $labels.datname }} with a value of {{ $value }} ' + summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database + {{ $labels.datname }} ' + expr: | + avg by (datname) ( + rate ( + pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m] + ) + ) > 2 * 60 + for: 2m + labels: + severity: warning + - alert: PostgreSQLQPS + annotations: + description: PostgreSQL high number of queries per second on {{ $labels.cluster + }} for database {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL high number of queries per second {{ $labels.cluster }} + for database {{ $labels.datname }} + expr: | + avg by (datname) ( + irate( + pg_stat_database_xact_commit{datname!~"template.*",}[5m] + ) + + + irate( + pg_stat_database_xact_rollback{datname!~"template.*",}[5m] + ) + ) > 10000 + for: 5m + labels: + severity: warning + - alert: PostgreSQLCacheHitRatio + annotations: + 
description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database + {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database + {{ $labels.datname }} + expr: | + avg by (datname) ( + rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m]) + / + ( + rate( + pg_stat_database_blks_hit{datname!~"template.*",}[5m] + ) + + + rate( + pg_stat_database_blks_read{datname!~"template.*",}[5m] + ) + ) + ) < 0.98 + for: 5m + labels: + severity: warning diff --git a/assets/postgres-exporter/dashboards/postgres-overview.json b/assets/postgres-exporter/dashboards/postgres-overview.json new file mode 100644 index 0000000..9b08fa4 --- /dev/null +++ b/assets/postgres-exporter/dashboards/postgres-overview.json @@ -0,0 +1,1438 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Performance metrics for Postgres", + "editable": true, + "gnetId": 455, + "graphTooltip": 0, + "id": 1, + "iteration": 1603191461722, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Postgres Overview", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 
5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_fetched{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "fetched", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_returned{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "returned", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_inserted{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + 
"type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "inserted", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_updated{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "updated", + "measurement": "postgresql", + "policy": "default", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_deleted{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deleted", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + 
"operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Rows", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Postgres Overview", + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 0 + }, + "height": "55px", + "id": 11, + "interval": null, + "isNew": true, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "dsType": "prometheus", + "expr": 
"sum(irate(pg_stat_database_xact_commit{datname=~\"$db\",instance=~\"$instance\"}[5m])) + sum(irate(pg_stat_database_xact_rollback{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "xact_commit" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 1800, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": "", + "title": "QPS", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Postgres Overview", + "decimals": 1, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "Buffers Allocated", + 
"dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_alloc_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_alloc", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend_fsync_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend_fsync", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": 
[ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_clean_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_clean", + "measurement": "postgresql", + "policy": "default", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_checkpoint_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_checkpoint", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Buffers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + 
}, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Postgres Overview", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 3, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "conflicts", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_deadlocks{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deadlocks", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "conflicts" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + 
"operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "deadlocks", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_conflicts{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "conflicts", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "deadlocks" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Conflicts/Deadlocks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Postgres Overview", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 12, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + 
"lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.2.1", + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(pg_stat_database_blks_hit{datname=~\"$db\",instance=~\"$instance\"}) / (sum(pg_stat_database_blks_hit{datname=~\"$db\",instance=~\"$instance\"}) + sum(pg_stat_database_blks_read{datname=~\"$db\",instance=~\"$instance\"}))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "cache hit rate", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Cache hit ratio", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Postgres Overview", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "hiddenSeries": false, + "id": 13, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "pg_stat_database_numbackends{datname=~\"$db\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Number of active connections", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 26, + "style": "dark", + "tags": [ + "postgres" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "Postgres Overview", + "definition": "", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ ], + "query": "label_values(up{job=~\"postgres.*\"},instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "Postgres Overview", + "definition": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + 
"hide": 0, + "includeAll": true, + "label": "db", + "multi": false, + "name": "db", + "options": [ ], + "query": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "Postgres Overview", + "value": "Postgres Overview" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "postgres", + "value": "postgres" + }, + "datasource": "$datasource", + "definition": "label_values(pg_up, job)", + "hide": 0, + "includeAll": false, + "label": "job", + "multi": false, + "name": "job", + "options": [ + { + "selected": true, + "text": "postgres", + "value": "postgres" + } + ], + "query": "label_values(pg_up, job)", + "refresh": 0, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Postgres Overview", + "uid": "wGgaPlciz", + "version": 5 +} diff --git a/assets/postgres-exporter/rules.yaml b/assets/postgres-exporter/rules.yaml new file mode 100644 index 0000000..19765bd --- /dev/null +++ b/assets/postgres-exporter/rules.yaml @@ -0,0 +1 @@ +null diff --git a/mixins.json b/mixins.json index 2e7f7df..749641a 100644 --- a/mixins.json +++ 
b/mixins.json @@ -119,6 +119,11 @@ "name": "promscale", "source": "https://github.com/timescale/promscale", "subdir": "docs/mixin" + }, + { + "name": "postgres-exporter", + "source": "https://github.com/prometheus-community/postgres_exporter", + "subdir": "postgres_mixin" } ] } diff --git a/site/content/postgres-exporter/_index.md b/site/content/postgres-exporter/_index.md new file mode 100644 index 0000000..d5113fc --- /dev/null +++ b/site/content/postgres-exporter/_index.md @@ -0,0 +1,157 @@ +--- +title: postgres-exporter +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/prometheus-community/postgres_exporter](https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/alerts.yaml). +{{< /panel >}} + +### PostgreSQL + +##### PostgreSQLMaxConnectionsReached + +{{< code lang="yaml" >}} +alert: PostgreSQLMaxConnectionsReached +annotations: + description: '{{ $labels.instance }} is exceeding the currently configured maximum + Postgres connection limit (current value: {{ $value }}s). Services may be degraded + - please take immediate action (you probably need to increase max_connections + in the Docker image and re-deploy.' + summary: '{{ $labels.instance }} has maxed out Postgres connections.' +expr: | + sum by (instance) (pg_stat_activity_count{}) + >= + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) +for: 1m +labels: + severity: warning +{{< /code >}} + +##### PostgreSQLHighConnections + +{{< code lang="yaml" >}} +alert: PostgreSQLHighConnections +annotations: + description: '{{ $labels.instance }} is exceeding 80% of the currently configured + maximum Postgres connection limit (current value: {{ $value }}s). 
Please check + utilization graphs and confirm if this is normal service growth, abuse or an otherwise + temporary condition or if new resources need to be provisioned (or the limits + increased, which is most likely).' + summary: '{{ $labels.instance }} is over 80% of max Postgres connections.' +expr: | + sum by (instance) (pg_stat_activity_count{}) + > + ( + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) + ) * 0.8 +for: 10m +labels: + severity: warning +{{< /code >}} + +##### PostgreSQLDown + +{{< code lang="yaml" >}} +alert: PostgreSQLDown +annotations: + description: '{{ $labels.instance }} is rejecting query requests from the exporter, + and thus probably not allowing DNS requests to work either. User services should + not be affected provided at least 1 node is still alive.' + summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}' +expr: pg_up{} != 1 +for: 1m +labels: + severity: warning +{{< /code >}} + +##### PostgreSQLSlowQueries + +{{< code lang="yaml" >}} +alert: PostgreSQLSlowQueries +annotations: + description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for database + {{ $labels.datname }} with a value of {{ $value }} ' + summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{ + $labels.datname }} ' +expr: | + avg by (datname) ( + rate ( + pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m] + ) + ) > 2 * 60 +for: 2m +labels: + severity: warning +{{< /code >}} + +##### PostgreSQLQPS + +{{< code lang="yaml" >}} +alert: PostgreSQLQPS +annotations: + description: PostgreSQL high number of queries per second on {{ $labels.cluster + }} for database {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL high number of queries per second {{ $labels.cluster }} for + database {{ $labels.datname }} +expr: | + avg by (datname) ( + irate( + pg_stat_database_xact_commit{datname!~"template.*",}[5m] + ) + 
+ irate( + pg_stat_database_xact_rollback{datname!~"template.*",}[5m] + ) + ) > 10000 +for: 5m +labels: + severity: warning +{{< /code >}} + +##### PostgreSQLCacheHitRatio + +{{< code lang="yaml" >}} +alert: PostgreSQLCacheHitRatio +annotations: + description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database + {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ + $labels.datname }} +expr: | + avg by (datname) ( + rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m]) + / + ( + rate( + pg_stat_database_blks_hit{datname!~"template.*",}[5m] + ) + + + rate( + pg_stat_database_blks_read{datname!~"template.*",}[5m] + ) + ) + ) < 0.98 +for: 5m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [postgres-overview](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/dashboards/postgres-overview.json) diff --git a/site/static/mixins.json b/site/static/mixins.json index 2e7f7df..749641a 100644 --- a/site/static/mixins.json +++ b/site/static/mixins.json @@ -119,6 +119,11 @@ "name": "promscale", "source": "https://github.com/timescale/promscale", "subdir": "docs/mixin" + }, + { + "name": "postgres-exporter", + "source": "https://github.com/prometheus-community/postgres_exporter", + "subdir": "postgres_mixin" } ] } From a51b164bd40de7eff7d12ce3ab9598618304c417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Krupa=20=28paulfantom=29?= Date: Fri, 9 Dec 2022 14:55:02 +0100 Subject: [PATCH 2/2] *: add grafana mixin --- assets/grafana/alerts.yaml | 15 + .../grafana/dashboards/grafana-overview.json | 553 ++++++++++++++++++ assets/grafana/rules.yaml | 6 + mixins.json | 5 + site/content/grafana/_index.md | 58 ++ site/static/mixins.json | 5 + 6 files changed, 642 insertions(+) create mode 100644 assets/grafana/alerts.yaml create mode 100644 
assets/grafana/dashboards/grafana-overview.json create mode 100644 assets/grafana/rules.yaml create mode 100644 site/content/grafana/_index.md diff --git a/assets/grafana/alerts.yaml b/assets/grafana/alerts.yaml new file mode 100644 index 0000000..ac37e43 --- /dev/null +++ b/assets/grafana/alerts.yaml @@ -0,0 +1,15 @@ +groups: +- name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + annotations: + message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is + experiencing {{ $value | humanize }}% errors' + expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > 50 + for: 5m + labels: + severity: warning diff --git a/assets/grafana/dashboards/grafana-overview.json b/assets/grafana/dashboards/grafana-overview.json new file mode 100644 index 0000000..7a9f6d4 --- /dev/null +++ b/assets/grafana/dashboards/grafana-overview.json @@ -0,0 +1,553 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ ], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3085, + "iteration": 1631554945276, + "links": [ ], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ ] + }, + "gridPos": { + "h": 5, + "w": 
6, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Firing Alerts", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ ] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Dashboards", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "displayMode": "auto" + }, + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 10, + "options": { + "showHeader": true + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_build_info{job=~\"$job\", 
instance=~\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Build Info", + "transformations": [ + { + "id": "labelsToFields", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "branch": true, + "container": true, + "goversion": true, + "namespace": true, + "pod": true, + "revision": true + }, + "indexByName": { + "Time": 7, + "Value": 11, + "branch": 4, + "container": 8, + "edition": 2, + "goversion": 6, + "instance": 1, + "job": 0, + "namespace": 9, + "pod": 10, + "revision": 5, + "version": 3 + }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ ] + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ", + "interval": "", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [ ] + }, + "yaxes": [ + { + "$$hashKey": "object:157", + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:158", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ ] + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))", + "interval": "", + 
"legendFormat": "Average", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "$$hashKey": "object:210", + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:211", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [ ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "dev-cortex", + "value": "dev-cortex" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": [ + "default/grafana" + ], + "value": [ + "default/grafana" + ] + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, job)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [ ], + "query": { + "query": "label_values(grafana_build_info, job)", + "refId": "Billing Admin-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": 
"label_values(grafana_build_info, instance)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [ ], + "query": { + "query": "label_values(grafana_build_info, instance)", + "refId": "Billing Admin-instance-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Grafana Overview", + "uid": "6be0s85Mk", + "version": 2 +} diff --git a/assets/grafana/rules.yaml b/assets/grafana/rules.yaml new file mode 100644 index 0000000..8776c2c --- /dev/null +++ b/assets/grafana/rules.yaml @@ -0,0 +1,6 @@ +groups: +- name: grafana_rules + rules: + - expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m diff --git a/mixins.json b/mixins.json index 749641a..8f9813c 100644 --- a/mixins.json +++ b/mixins.json @@ -124,6 +124,11 @@ "name": "postgres-exporter", "source": "https://github.com/prometheus-community/postgres_exporter", "subdir": "postgres_mixin" + }, + { + "name": "grafana", + "source": "https://github.com/grafana/grafana", + "subdir": "grafana-mixin" } ] } diff --git a/site/content/grafana/_index.md b/site/content/grafana/_index.md new file mode 100644 index 0000000..cd9c7f0 --- /dev/null +++ b/site/content/grafana/_index.md @@ -0,0 +1,58 @@ +--- +title: grafana +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/grafana/grafana](https://github.com/grafana/grafana/tree/master/grafana-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} 
+Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/alerts.yaml). +{{< /panel >}} + +### GrafanaAlerts + +##### GrafanaRequestsFailing + +{{< code lang="yaml" >}} +alert: GrafanaRequestsFailing +annotations: + message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing + {{ $value | humanize }}% errors' +expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > 50 +for: 5m +labels: + severity: warning +{{< /code >}} + +## Recording rules + +{{< panel style="warning" >}} +Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/rules.yaml). 
+{{< /panel >}} + +### grafana_rules + +##### namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m + +{{< code lang="yaml" >}} +expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) +record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [grafana-overview](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/dashboards/grafana-overview.json) diff --git a/site/static/mixins.json b/site/static/mixins.json index 749641a..8f9813c 100644 --- a/site/static/mixins.json +++ b/site/static/mixins.json @@ -124,6 +124,11 @@ "name": "postgres-exporter", "source": "https://github.com/prometheus-community/postgres_exporter", "subdir": "postgres_mixin" + }, + { + "name": "grafana", + "source": "https://github.com/grafana/grafana", + "subdir": "grafana-mixin" } ] }