
Merge pull request #25 from paulfantom/postgres-exporter

add postgres-exporter, grafana, and mimir mixins
This commit is contained in:
Paweł Krupa 2023-10-30 09:45:02 +01:00 committed by GitHub
commit c6cb85e987
10 changed files with 2353 additions and 0 deletions


@@ -0,0 +1,15 @@
groups:
- name: GrafanaAlerts
  rules:
  - alert: GrafanaRequestsFailing
    annotations:
      message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is
        experiencing {{ $value | humanize }}% errors'
    expr: |
      100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
      / ignoring (status_code)
      sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
      > 50
    for: 5m
    labels:
      severity: warning


@@ -0,0 +1,553 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 3085,
"iteration": 1631554945276,
"links": [ ],
"panels": [
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"mappings": [ ],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": { },
"textMode": "auto"
},
"pluginVersion": "8.1.3",
"targets": [
{
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Firing Alerts",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": { },
"textMode": "auto"
},
"pluginVersion": "8.1.3",
"targets": [
{
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Dashboards",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {
"align": null,
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 5,
"w": 12,
"x": 12,
"y": 0
},
"id": 10,
"options": {
"showHeader": true
},
"pluginVersion": "8.1.3",
"targets": [
{
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Build Info",
"transformations": [
{
"id": "labelsToFields",
"options": { }
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"branch": true,
"container": true,
"goversion": true,
"namespace": true,
"pod": true,
"revision": true
},
"indexByName": {
"Time": 7,
"Value": 11,
"branch": 4,
"container": 8,
"edition": 2,
"goversion": 6,
"instance": 1,
"job": 0,
"namespace": 9,
"pod": 10,
"revision": 5,
"version": 3
},
"renameByName": { }
}
}
],
"type": "table"
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"links": [ ]
},
"overrides": [ ]
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 5
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.1.3",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ",
"interval": "",
"legendFormat": "{{status_code}}",
"refId": "A"
}
],
"thresholds": [ ],
"timeFrom": null,
"timeRegions": [ ],
"timeShift": null,
"title": "RPS",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"$$hashKey": "object:157",
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:158",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"links": [ ]
},
"overrides": [ ]
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 5
},
"hiddenSeries": false,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.1.3",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
"interval": "",
"legendFormat": "99th Percentile",
"refId": "A"
},
{
"exemplar": true,
"expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
"interval": "",
"legendFormat": "50th Percentile",
"refId": "B"
},
{
"exemplar": true,
"expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))",
"interval": "",
"legendFormat": "Average",
"refId": "C"
}
],
"thresholds": [ ],
"timeFrom": null,
"timeRegions": [ ],
"timeShift": null,
"title": "Request Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"$$hashKey": "object:210",
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:211",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 30,
"style": "dark",
"tags": [ ],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "dev-cortex",
"value": "dev-cortex"
},
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "datasource",
"options": [ ],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": [
"default/grafana"
],
"value": [
"default/grafana"
]
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, job)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "job",
"options": [ ],
"query": {
"query": "label_values(grafana_build_info, job)",
"refId": "Billing Admin-job-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, instance)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "instance",
"options": [ ],
"query": {
"query": "label_values(grafana_build_info, instance)",
"refId": "Billing Admin-instance-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Grafana Overview",
"uid": "6be0s85Mk",
"version": 2
}


@@ -0,0 +1,6 @@
groups:
- name: grafana_rules
  rules:
  - expr: |
      sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
    record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m


@@ -0,0 +1,105 @@
groups:
- name: PostgreSQL
  rules:
  - alert: PostgreSQLMaxConnectionsReached
    annotations:
      description: '{{ $labels.instance }} is exceeding the currently configured maximum
        Postgres connection limit (current value: {{ $value }}). Services may be
        degraded - please take immediate action (you probably need to increase max_connections
        in the Docker image and re-deploy).'
      summary: '{{ $labels.instance }} has maxed out Postgres connections.'
    expr: |
      sum by (instance) (pg_stat_activity_count{})
      >=
      sum by (instance) (pg_settings_max_connections{})
      -
      sum by (instance) (pg_settings_superuser_reserved_connections{})
    for: 1m
    labels:
      severity: warning
  - alert: PostgreSQLHighConnections
    annotations:
      description: '{{ $labels.instance }} is exceeding 80% of the currently configured
        maximum Postgres connection limit (current value: {{ $value }}). Please check
        utilization graphs and confirm whether this is normal service growth, abuse, or
        an otherwise temporary condition, or if new resources need to be provisioned
        (or the limits increased, which is most likely).'
      summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
    expr: |
      sum by (instance) (pg_stat_activity_count{})
      >
      (
        sum by (instance) (pg_settings_max_connections{})
        -
        sum by (instance) (pg_settings_superuser_reserved_connections{})
      ) * 0.8
    for: 10m
    labels:
      severity: warning
  - alert: PostgreSQLDown
    annotations:
      description: '{{ $labels.instance }} is rejecting query requests from the exporter,
        and thus probably not allowing DNS requests to work either. User services
        should not be affected provided at least 1 node is still alive.'
      summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
    expr: pg_up{} != 1
    for: 1m
    labels:
      severity: warning
  - alert: PostgreSQLSlowQueries
    annotations:
      description: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for
        database {{ $labels.datname }} with a value of {{ $value }}'
      summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database
        {{ $labels.datname }}'
    expr: |
      avg by (datname) (
        rate (
          pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m]
        )
      ) > 2 * 60
    for: 2m
    labels:
      severity: warning
  - alert: PostgreSQLQPS
    annotations:
      description: PostgreSQL high number of queries per second on {{ $labels.cluster
        }} for database {{ $labels.datname }} with a value of {{ $value }}
      summary: PostgreSQL high number of queries per second on {{ $labels.cluster }}
        for database {{ $labels.datname }}
    expr: |
      avg by (datname) (
        irate(
          pg_stat_database_xact_commit{datname!~"template.*",}[5m]
        )
        +
        irate(
          pg_stat_database_xact_rollback{datname!~"template.*",}[5m]
        )
      ) > 10000
    for: 5m
    labels:
      severity: warning
  - alert: PostgreSQLCacheHitRatio
    annotations:
      description: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
        {{ $labels.datname }} with a value of {{ $value }}
      summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
        {{ $labels.datname }}
    expr: |
      avg by (datname) (
        rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m])
        /
        (
          rate(
            pg_stat_database_blks_hit{datname!~"template.*",}[5m]
          )
          +
          rate(
            pg_stat_database_blks_read{datname!~"template.*",}[5m]
          )
        )
      ) < 0.98
    for: 5m
    labels:
      severity: warning

File diff suppressed because it is too large.


@@ -0,0 +1 @@
null


@@ -119,6 +119,16 @@
"name": "promscale",
"source": "https://github.com/timescale/promscale",
"subdir": "docs/mixin"
},
{
"name": "postgres-exporter",
"source": "https://github.com/prometheus-community/postgres_exporter",
"subdir": "postgres_mixin"
},
{
"name": "grafana",
"source": "https://github.com/grafana/grafana",
"subdir": "grafana-mixin"
}
]
}


@@ -0,0 +1,58 @@
---
title: grafana
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/grafana/grafana](https://github.com/grafana/grafana/tree/master/grafana-mixin)
{{< /panel >}}
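A minimal consumption sketch, assuming the mixin follows the usual monitoring-mixins interface (`prometheusAlerts`, `prometheusRules`, and `grafanaDashboards` fields) and has been vendored with jsonnet-bundler — the import path below is an assumption based on that setup:
{{< code lang="jsonnet" >}}
// Sketch only: the path assumes `jb install github.com/grafana/grafana/grafana-mixin@main`
// placed the mixin under ./vendor and jsonnet is invoked with -J vendor.
local grafanaMixin = import 'github.com/grafana/grafana/grafana-mixin/mixin.libsonnet';

{
  // Prometheus rule groups, ready to serialize to YAML for Prometheus or a PrometheusRule object.
  prometheusAlerts: grafanaMixin.prometheusAlerts,
  prometheusRules: grafanaMixin.prometheusRules,
  // Map of dashboard file name -> dashboard JSON definition.
  grafanaDashboards: grafanaMixin.grafanaDashboards,
}
{{< /code >}}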
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/alerts.yaml).
{{< /panel >}}
### GrafanaAlerts
##### GrafanaRequestsFailing
{{< code lang="yaml" >}}
alert: GrafanaRequestsFailing
annotations:
  message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing
    {{ $value | humanize }}% errors'
expr: |
  100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
  / ignoring (status_code)
  sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
  > 50
for: 5m
labels:
  severity: warning
{{< /code >}}
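As a concrete reading of the expression above: if a handler serves 30 requests/s with `5xx` status codes out of 50 requests/s total, the left-hand side evaluates to `100 * 30 / 50 = 60`, which exceeds the 50% threshold, so the alert fires once the condition has held for 5 minutes.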
## Recording rules
{{< panel style="warning" >}}
The complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/rules.yaml).
{{< /panel >}}
### grafana_rules
##### namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
{{< code lang="yaml" >}}
expr: |
  sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [grafana-overview](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/dashboards/grafana-overview.json)
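These JSON files are produced by rendering the mixin's dashboards field. A minimal sketch of doing that yourself with Jsonnet multi-file output (assuming the conventional `grafanaDashboards` field and a vendored `mixin.libsonnet`, invoked roughly as `jsonnet -J vendor -S -m dashboards_out render.jsonnet`):
{{< code lang="jsonnet" >}}
// render.jsonnet -- sketch, not part of the mixin itself.
local mixin = import 'github.com/grafana/grafana/grafana-mixin/mixin.libsonnet';

{
  // One output file per dashboard; -S expects string outputs, hence manifestJsonEx.
  [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], '  ')
  for name in std.objectFields(mixin.grafanaDashboards)
}
{{< /code >}}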


@@ -0,0 +1,157 @@
---
title: postgres-exporter
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/prometheus-community/postgres_exporter](https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/alerts.yaml).
{{< /panel >}}
### PostgreSQL
##### PostgreSQLMaxConnectionsReached
{{< code lang="yaml" >}}
alert: PostgreSQLMaxConnectionsReached
annotations:
  description: '{{ $labels.instance }} is exceeding the currently configured maximum
    Postgres connection limit (current value: {{ $value }}). Services may be degraded
    - please take immediate action (you probably need to increase max_connections
    in the Docker image and re-deploy).'
  summary: '{{ $labels.instance }} has maxed out Postgres connections.'
expr: |
  sum by (instance) (pg_stat_activity_count{})
  >=
  sum by (instance) (pg_settings_max_connections{})
  -
  sum by (instance) (pg_settings_superuser_reserved_connections{})
for: 1m
labels:
  severity: warning
{{< /code >}}
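For example, with `pg_settings_max_connections` at 100 and `pg_settings_superuser_reserved_connections` at 3, the right-hand side is 97, so the alert fires once the summed `pg_stat_activity_count` for an instance stays at or above 97 connections for 1 minute.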
##### PostgreSQLHighConnections
{{< code lang="yaml" >}}
alert: PostgreSQLHighConnections
annotations:
  description: '{{ $labels.instance }} is exceeding 80% of the currently configured
    maximum Postgres connection limit (current value: {{ $value }}). Please check
    utilization graphs and confirm whether this is normal service growth, abuse, or an
    otherwise temporary condition, or if new resources need to be provisioned (or the
    limits increased, which is most likely).'
  summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
expr: |
  sum by (instance) (pg_stat_activity_count{})
  >
  (
    sum by (instance) (pg_settings_max_connections{})
    -
    sum by (instance) (pg_settings_superuser_reserved_connections{})
  ) * 0.8
for: 10m
labels:
  severity: warning
{{< /code >}}
##### PostgreSQLDown
{{< code lang="yaml" >}}
alert: PostgreSQLDown
annotations:
  description: '{{ $labels.instance }} is rejecting query requests from the exporter,
    and thus probably not allowing DNS requests to work either. User services should
    not be affected provided at least 1 node is still alive.'
  summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
expr: pg_up{} != 1
for: 1m
labels:
  severity: warning
{{< /code >}}
##### PostgreSQLSlowQueries
{{< code lang="yaml" >}}
alert: PostgreSQLSlowQueries
annotations:
  description: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database
    {{ $labels.datname }} with a value of {{ $value }}'
  summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database
    {{ $labels.datname }}'
expr: |
  avg by (datname) (
    rate (
      pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m]
    )
  ) > 2 * 60
for: 2m
labels:
  severity: warning
{{< /code >}}
##### PostgreSQLQPS
{{< code lang="yaml" >}}
alert: PostgreSQLQPS
annotations:
  description: PostgreSQL high number of queries per second on {{ $labels.cluster
    }} for database {{ $labels.datname }} with a value of {{ $value }}
  summary: PostgreSQL high number of queries per second on {{ $labels.cluster }} for
    database {{ $labels.datname }}
expr: |
  avg by (datname) (
    irate(
      pg_stat_database_xact_commit{datname!~"template.*",}[5m]
    )
    +
    irate(
      pg_stat_database_xact_rollback{datname!~"template.*",}[5m]
    )
  ) > 10000
for: 5m
labels:
  severity: warning
{{< /code >}}
##### PostgreSQLCacheHitRatio
{{< code lang="yaml" >}}
alert: PostgreSQLCacheHitRatio
annotations:
  description: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
    {{ $labels.datname }} with a value of {{ $value }}
  summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{
    $labels.datname }}
expr: |
  avg by (datname) (
    rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m])
    /
    (
      rate(
        pg_stat_database_blks_hit{datname!~"template.*",}[5m]
      )
      +
      rate(
        pg_stat_database_blks_read{datname!~"template.*",}[5m]
      )
    )
  ) < 0.98
for: 5m
labels:
  severity: warning
{{< /code >}}
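For example, if `pg_stat_database_blks_hit` increases at 9,700 blocks/s and `pg_stat_database_blks_read` at 300 blocks/s for a database, the ratio is 9700 / (9700 + 300) = 0.97, below the 0.98 threshold, so the alert fires after 5 minutes.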
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [postgres-overview](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/dashboards/postgres-overview.json)


@@ -119,6 +119,16 @@
"name": "promscale",
"source": "https://github.com/timescale/promscale",
"subdir": "docs/mixin"
},
{
"name": "postgres-exporter",
"source": "https://github.com/prometheus-community/postgres_exporter",
"subdir": "postgres_mixin"
},
{
"name": "grafana",
"source": "https://github.com/grafana/grafana",
"subdir": "grafana-mixin"
}
]
}