mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
Merge pull request #25 from paulfantom/postgres-exporter
add postgres-exporter, grafana, and mimir mixins
This commit is contained in:
commit
c6cb85e987
10 changed files with 2353 additions and 0 deletions
15
assets/grafana/alerts.yaml
Normal file
15
assets/grafana/alerts.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
groups:
|
||||
- name: GrafanaAlerts
|
||||
rules:
|
||||
- alert: GrafanaRequestsFailing
|
||||
annotations:
|
||||
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is
|
||||
experiencing {{ $value | humanize }}% errors'
|
||||
expr: |
|
||||
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
|
||||
/ ignoring (status_code)
|
||||
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
|
||||
> 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
553
assets/grafana/dashboards/grafana-overview.json
Normal file
553
assets/grafana/dashboards/grafana-overview.json
Normal file
|
@ -0,0 +1,553 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [ ],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": 3085,
|
||||
"iteration": 1631554945276,
|
||||
"links": [ ],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [ ],
|
||||
"noValue": "0",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": { },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Firing Alerts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [ ],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"mean"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"text": { },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Dashboards",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": null,
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"mappings": [ ],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "8.1.3",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Build Info",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": { }
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"branch": true,
|
||||
"container": true,
|
||||
"goversion": true,
|
||||
"namespace": true,
|
||||
"pod": true,
|
||||
"revision": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Time": 7,
|
||||
"Value": 11,
|
||||
"branch": 4,
|
||||
"container": 8,
|
||||
"edition": 2,
|
||||
"goversion": 6,
|
||||
"instance": 1,
|
||||
"job": 0,
|
||||
"namespace": 9,
|
||||
"pod": 10,
|
||||
"revision": 5,
|
||||
"version": 3
|
||||
},
|
||||
"renameByName": { }
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": [ ]
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "8.1.3",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ",
|
||||
"interval": "",
|
||||
"legendFormat": "{{status_code}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [ ],
|
||||
"timeShift": null,
|
||||
"title": "RPS",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:157",
|
||||
"format": "reqps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:158",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"links": [ ]
|
||||
},
|
||||
"overrides": [ ]
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "8.1.3",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
|
||||
"interval": "",
|
||||
"legendFormat": "99th Percentile",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1",
|
||||
"interval": "",
|
||||
"legendFormat": "50th Percentile",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"exemplar": true,
|
||||
"expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))",
|
||||
"interval": "",
|
||||
"legendFormat": "Average",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [ ],
|
||||
"timeShift": null,
|
||||
"title": "Request Latency",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:210",
|
||||
"format": "ms",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:211",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 30,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "dev-cortex",
|
||||
"value": "dev-cortex"
|
||||
},
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"queryValue": "",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": [
|
||||
"default/grafana"
|
||||
],
|
||||
"value": [
|
||||
"default/grafana"
|
||||
]
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(grafana_build_info, job)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"options": [ ],
|
||||
"query": {
|
||||
"query": "label_values(grafana_build_info, job)",
|
||||
"refId": "Billing Admin-job-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(grafana_build_info, instance)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": {
|
||||
"query": "label_values(grafana_build_info, instance)",
|
||||
"refId": "Billing Admin-instance-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Grafana Overview",
|
||||
"uid": "6be0s85Mk",
|
||||
"version": 2
|
||||
}
|
6
assets/grafana/rules.yaml
Normal file
6
assets/grafana/rules.yaml
Normal file
|
@ -0,0 +1,6 @@
|
|||
groups:
|
||||
- name: grafana_rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
|
||||
record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
|
105
assets/postgres-exporter/alerts.yaml
Normal file
105
assets/postgres-exporter/alerts.yaml
Normal file
|
@ -0,0 +1,105 @@
|
|||
groups:
|
||||
- name: PostgreSQL
|
||||
rules:
|
||||
- alert: PostgreSQLMaxConnectionsReached
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is exceeding the currently configured maximum
|
||||
Postgres connection limit (current value: {{ $value }}s). Services may be
|
||||
degraded - please take immediate action (you probably need to increase max_connections
|
||||
in the Docker image and re-deploy).'
|
||||
summary: '{{ $labels.instance }} has maxed out Postgres connections.'
|
||||
expr: |
|
||||
sum by (instance) (pg_stat_activity_count{})
|
||||
>=
|
||||
sum by (instance) (pg_settings_max_connections{})
|
||||
-
|
||||
sum by (instance) (pg_settings_superuser_reserved_connections{})
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PostgreSQLHighConnections
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is exceeding 80% of the currently configured
|
||||
maximum Postgres connection limit (current value: {{ $value }}s). Please check
|
||||
utilization graphs and confirm if this is normal service growth, abuse or
|
||||
an otherwise temporary condition or if new resources need to be provisioned
|
||||
(or the limits increased, which is most likely).'
|
||||
summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
|
||||
expr: |
|
||||
sum by (instance) (pg_stat_activity_count{})
|
||||
>
|
||||
(
|
||||
sum by (instance) (pg_settings_max_connections{})
|
||||
-
|
||||
sum by (instance) (pg_settings_superuser_reserved_connections{})
|
||||
) * 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PostgreSQLDown
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is rejecting query requests from the exporter,
|
||||
and thus probably not allowing DNS requests to work either. User services
|
||||
should not be affected provided at least 1 node is still alive.'
|
||||
summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
|
||||
expr: pg_up{} != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PostgreSQLSlowQueries
|
||||
annotations:
|
||||
description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for
|
||||
database {{ $labels.datname }} with a value of {{ $value }} '
|
||||
summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database
|
||||
{{ $labels.datname }} '
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
rate (
|
||||
pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m]
|
||||
)
|
||||
) > 2 * 60
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PostgreSQLQPS
|
||||
annotations:
|
||||
description: PostgreSQL high number of queries per second on {{ $labels.cluster
|
||||
}} for database {{ $labels.datname }} with a value of {{ $value }}
|
||||
summary: PostgreSQL high number of queries per second {{ $labels.cluster }}
|
||||
for database {{ $labels.datname }}
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
irate(
|
||||
pg_stat_database_xact_commit{datname!~"template.*",}[5m]
|
||||
)
|
||||
+
|
||||
irate(
|
||||
pg_stat_database_xact_rollback{datname!~"template.*",}[5m]
|
||||
)
|
||||
) > 10000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PostgreSQLCacheHitRatio
|
||||
annotations:
|
||||
description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database
|
||||
{{ $labels.datname }} with a value of {{ $value }}
|
||||
summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
|
||||
{{ $labels.datname }}
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m])
|
||||
/
|
||||
(
|
||||
rate(
|
||||
pg_stat_database_blks_hit{datname!~"template.*",}[5m]
|
||||
)
|
||||
+
|
||||
rate(
|
||||
pg_stat_database_blks_read{datname!~"template.*",}[5m]
|
||||
)
|
||||
)
|
||||
) < 0.98
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
1438
assets/postgres-exporter/dashboards/postgres-overview.json
Normal file
1438
assets/postgres-exporter/dashboards/postgres-overview.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/postgres-exporter/rules.yaml
Normal file
1
assets/postgres-exporter/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
10
mixins.json
10
mixins.json
|
@ -119,6 +119,16 @@
|
|||
"name": "promscale",
|
||||
"source": "https://github.com/timescale/promscale",
|
||||
"subdir": "docs/mixin"
|
||||
},
|
||||
{
|
||||
"name": "postgres-exporter",
|
||||
"source": "https://github.com/prometheus-community/postgres_exporter",
|
||||
"subdir": "postgres_mixin"
|
||||
},
|
||||
{
|
||||
"name": "grafana",
|
||||
"source": "https://github.com/grafana/grafana",
|
||||
"subdir": "grafana-mixin"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
58
site/content/grafana/_index.md
Normal file
58
site/content/grafana/_index.md
Normal file
|
@ -0,0 +1,58 @@
|
|||
---
|
||||
title: grafana
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/grafana/grafana](https://github.com/grafana/grafana/tree/master/grafana-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### GrafanaAlerts
|
||||
|
||||
##### GrafanaRequestsFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GrafanaRequestsFailing
|
||||
annotations:
|
||||
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing
|
||||
{{ $value | humanize }}% errors'
|
||||
expr: |
|
||||
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
|
||||
/ ignoring (status_code)
|
||||
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
|
||||
> 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### grafana_rules
|
||||
|
||||
##### namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
|
||||
record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [grafana-overview](https://github.com/monitoring-mixins/website/blob/master/assets/grafana/dashboards/grafana-overview.json)
|
157
site/content/postgres-exporter/_index.md
Normal file
157
site/content/postgres-exporter/_index.md
Normal file
|
@ -0,0 +1,157 @@
|
|||
---
|
||||
title: postgres-exporter
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/prometheus-community/postgres_exporter](https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### PostgreSQL
|
||||
|
||||
##### PostgreSQLMaxConnectionsReached
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLMaxConnectionsReached
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is exceeding the currently configured maximum
|
||||
Postgres connection limit (current value: {{ $value }}s). Services may be degraded
|
||||
- please take immediate action (you probably need to increase max_connections
|
||||
in the Docker image and re-deploy).'
|
||||
summary: '{{ $labels.instance }} has maxed out Postgres connections.'
|
||||
expr: |
|
||||
sum by (instance) (pg_stat_activity_count{})
|
||||
>=
|
||||
sum by (instance) (pg_settings_max_connections{})
|
||||
-
|
||||
sum by (instance) (pg_settings_superuser_reserved_connections{})
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PostgreSQLHighConnections
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLHighConnections
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is exceeding 80% of the currently configured
|
||||
maximum Postgres connection limit (current value: {{ $value }}s). Please check
|
||||
utilization graphs and confirm if this is normal service growth, abuse or an otherwise
|
||||
temporary condition or if new resources need to be provisioned (or the limits
|
||||
increased, which is most likely).'
|
||||
summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
|
||||
expr: |
|
||||
sum by (instance) (pg_stat_activity_count{})
|
||||
>
|
||||
(
|
||||
sum by (instance) (pg_settings_max_connections{})
|
||||
-
|
||||
sum by (instance) (pg_settings_superuser_reserved_connections{})
|
||||
) * 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PostgreSQLDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLDown
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} is rejecting query requests from the exporter,
|
||||
and thus probably not allowing DNS requests to work either. User services should
|
||||
not be affected provided at least 1 node is still alive.'
|
||||
summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
|
||||
expr: pg_up{} != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PostgreSQLSlowQueries
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLSlowQueries
|
||||
annotations:
|
||||
description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for database
|
||||
{{ $labels.datname }} with a value of {{ $value }} '
|
||||
summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }} for database {{
|
||||
$labels.datname }} '
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
rate (
|
||||
pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m]
|
||||
)
|
||||
) > 2 * 60
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PostgreSQLQPS
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLQPS
|
||||
annotations:
|
||||
description: PostgreSQL high number of queries per second on {{ $labels.cluster
|
||||
}} for database {{ $labels.datname }} with a value of {{ $value }}
|
||||
summary: PostgreSQL high number of queries per second {{ $labels.cluster }} for
|
||||
database {{ $labels.datname }}
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
irate(
|
||||
pg_stat_database_xact_commit{datname!~"template.*",}[5m]
|
||||
)
|
||||
+
|
||||
irate(
|
||||
pg_stat_database_xact_rollback{datname!~"template.*",}[5m]
|
||||
)
|
||||
) > 10000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PostgreSQLCacheHitRatio
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PostgreSQLCacheHitRatio
|
||||
annotations:
|
||||
description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database
|
||||
{{ $labels.datname }} with a value of {{ $value }}
|
||||
summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{
|
||||
$labels.datname }}
|
||||
expr: |
|
||||
avg by (datname) (
|
||||
rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m])
|
||||
/
|
||||
(
|
||||
rate(
|
||||
pg_stat_database_blks_hit{datname!~"template.*",}[5m]
|
||||
)
|
||||
+
|
||||
rate(
|
||||
pg_stat_database_blks_read{datname!~"template.*",}[5m]
|
||||
)
|
||||
)
|
||||
) < 0.98
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [postgres-overview](https://github.com/monitoring-mixins/website/blob/master/assets/postgres-exporter/dashboards/postgres-overview.json)
|
|
@ -119,6 +119,16 @@
|
|||
"name": "promscale",
|
||||
"source": "https://github.com/timescale/promscale",
|
||||
"subdir": "docs/mixin"
|
||||
},
|
||||
{
|
||||
"name": "postgres-exporter",
|
||||
"source": "https://github.com/prometheus-community/postgres_exporter",
|
||||
"subdir": "postgres_mixin"
|
||||
},
|
||||
{
|
||||
"name": "grafana",
|
||||
"source": "https://github.com/grafana/grafana",
|
||||
"subdir": "grafana-mixin"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue