1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00

*:regenerate

This commit is contained in:
Paweł Krupa (paulfantom) 2022-05-02 10:59:36 +02:00
parent b6d1cb395b
commit 3ef8b4ac35
20 changed files with 5271 additions and 123 deletions

View file

@ -810,7 +810,7 @@
},
{
"collapse": false,
"height": "250px",
"collapsed": false,
"panels": [
{
"aliasColors": { },
@ -819,6 +819,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 10,
"legend": {
"avg": false,
@ -845,7 +846,7 @@
}
],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -907,6 +908,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 11,
"legend": {
"avg": false,
@ -933,7 +935,7 @@
}
],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -995,6 +997,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 12,
"legend": {
"avg": false,
@ -1015,7 +1018,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1061,19 +1064,7 @@
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Querier",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
},
{
"aliasColors": { },
"bars": false,
@ -1081,6 +1072,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 13,
"legend": {
"avg": false,
@ -1101,7 +1093,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -1157,6 +1149,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 14,
"legend": {
"avg": false,
@ -1177,7 +1170,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -1233,6 +1226,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 15,
"legend": {
"avg": false,
@ -1253,7 +1247,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1307,12 +1301,13 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "",
"titleSize": "h6"
"title": "Querier",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
"height": "250px",
"collapsed": false,
"panels": [
{
"aliasColors": { },
@ -1321,6 +1316,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 16,
"legend": {
"avg": false,
@ -1347,7 +1343,7 @@
}
],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1409,6 +1405,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 17,
"legend": {
"avg": false,
@ -1435,7 +1432,7 @@
}
],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1497,6 +1494,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 18,
"legend": {
"avg": false,
@ -1517,7 +1515,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1563,19 +1561,7 @@
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Index Gateway",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
},
{
"aliasColors": { },
"bars": false,
@ -1583,6 +1569,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 19,
"legend": {
"avg": false,
@ -1603,7 +1590,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -1659,6 +1646,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 20,
"legend": {
"avg": false,
@ -1679,7 +1667,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -1735,6 +1723,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 21,
"legend": {
"avg": false,
@ -1755,7 +1744,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1811,6 +1800,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 22,
"legend": {
"avg": false,
@ -1831,7 +1821,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1885,8 +1875,9 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "",
"titleSize": "h6"
"title": "Index Gateway",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
@ -2152,7 +2143,7 @@
},
{
"collapse": false,
"height": "250px",
"collapsed": false,
"panels": [
{
"aliasColors": { },
@ -2161,6 +2152,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 26,
"legend": {
"avg": false,
@ -2237,6 +2229,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 27,
"legend": {
"avg": false,
@ -2317,19 +2310,7 @@
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Ruler",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
},
{
"aliasColors": { },
"bars": false,
@ -2337,6 +2318,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 28,
"legend": {
"avg": false,
@ -2425,6 +2407,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 29,
"legend": {
"avg": false,
@ -2497,8 +2480,9 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "",
"titleSize": "h6"
"title": "Ruler",
"titleSize": "h6",
"type": "row"
}
],
"schemaVersion": 14,

View file

@ -548,7 +548,7 @@
},
{
"collapse": false,
"height": "250px",
"collapsed": false,
"panels": [
{
"aliasColors": { },
@ -557,6 +557,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 7,
"legend": {
"avg": false,
@ -631,6 +632,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 8,
"legend": {
"avg": false,
@ -711,19 +713,7 @@
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Ingester",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
},
{
"aliasColors": { },
"bars": false,
@ -731,6 +721,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 9,
"legend": {
"avg": false,
@ -819,6 +810,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 10,
"legend": {
"avg": false,
@ -885,19 +877,7 @@
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
},
{
"aliasColors": { },
"bars": false,
@ -905,6 +885,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 11,
"legend": {
"avg": false,
@ -925,7 +906,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -981,6 +962,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"gridPos": { },
"id": 12,
"legend": {
"avg": false,
@ -1001,7 +983,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
@ -1057,6 +1039,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": { },
"id": 13,
"legend": {
"avg": false,
@ -1077,7 +1060,7 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
@ -1131,8 +1114,9 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "",
"titleSize": "h6"
"title": "Ingester",
"titleSize": "h6",
"type": "row"
}
],
"schemaVersion": 14,

View file

@ -0,0 +1,321 @@
groups:
- name: promscale-general
rules:
- alert: PromscaleDown
annotations:
description: No Promscale instance was found.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDown.md
summary: Promscale is down
expr: absent(up{job=~".*promscale.*"})
labels:
severity: critical
- name: promscale-ingest
rules:
- alert: PromscaleIngestHighErrorRate
annotations:
description: Promscale ingestion is having a {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
summary: High error rate in Promscale ingestion
expr: |
(
sum by (job, instance, type) (
rate(promscale_ingest_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_ingest_requests_total[5m])
)
) > 0.05
labels:
severity: warning
- alert: PromscaleIngestHighErrorRate
annotations:
description: Promscale ingestion is having a {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
summary: High error rate in Promscale ingestion
expr: |
(
sum by (job, instance, type) (
rate(promscale_ingest_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_ingest_requests_total[5m])
)
) > 0.1
labels:
severity: critical
- alert: PromscaleIngestHighLatency
annotations:
description: Slowest 10% of ingestion batch took more than {{ $value }} seconds
to ingest.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
summary: Slow Promscale ingestion
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 10
and
sum by (job, instance, type) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 0
for: 5m
labels:
severity: warning
- alert: PromscaleIngestHighLatency
annotations:
description: Slowest 10% of ingestion batch took more than {{ $value }} seconds
to ingest.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
summary: Slow Promscale ingestion
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 30
and
sum by (job, instance, type) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 0
for: 5m
labels:
severity: critical
- name: promscale-query
rules:
- alert: PromscaleQueryHighErrorRate
annotations:
description: Evaluating queries via Promscale has {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
summary: High error rate in querying Promscale
expr: |
(
sum by (job, instance, type) (
rate(promscale_query_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_query_requests_total[5m])
)
) > 0.05
labels:
severity: warning
- alert: PromscaleQueryHighErrorRate
annotations:
description: Evaluating queries via Promscale has {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
summary: High error rate in querying Promscale
expr: |
(
sum by (job, instance, type) (
rate(promscale_query_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_query_requests_total[5m])
)
) > 0.1
labels:
severity: critical
- alert: PromscaleQueryHighLatency
annotations:
description: Slowest 10% of the queries took more than {{ $value }} seconds
to evaluate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
summary: Slow Promscale querying
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_query_duration_seconds_bucket[5m])
)
) > 5
and
sum by (job, instance, type) (
rate(promscale_query_duration_seconds_bucket[5m])
) > 0
)
for: 5m
labels:
severity: warning
- alert: PromscaleQueryHighLatency
annotations:
description: Slowest 10% of the queries took {{ $value }} seconds to evaluate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
summary: Slow Promscale querying
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_query_duration_seconds_bucket[5m])
)
) > 10
and
sum by (job, instance, type) (
rate(promscale_query_duration_seconds_bucket[5m])
) > 0
)
for: 5m
labels:
severity: critical
- name: promscale-cache
rules:
- alert: PromscaleCacheHighNumberOfEvictions
annotations:
description: Promscale {{ $labels.name }} is evicting at {{ $value }} entries
a second.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheHighNumberOfEvictions.md
summary: High cache eviction in Promscale
expr: |
(
sum by (job, instance, name, type) (
rate(promscale_cache_evictions_total[5m])
)
/
sum by (job, instance, name, type) (
promscale_cache_capacity_elements
)
) > 0.2
labels:
severity: warning
- alert: PromscaleCacheTooSmall
annotations:
description: Promscale {{ $labels.name }} has a hit ratio of {{ $value | humanizePercentage
}}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheTooSmall.md
summary: Promscale cache is too small
expr: |
(
sum by (job, instance, type, name) (
rate(promscale_cache_query_hits_total[5m])
)
/
sum by (job, instance, type, name) (
rate(promscale_cache_queries_total[5m])
)
) < 0.9
labels:
severity: warning
- name: promscale-database-connection
rules:
- alert: PromscaleStorageHighErrorRate
annotations:
description: Promscale connection with the database has an error rate of {{ $value
| humanizePercentage }}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighErrorRate.md
summary: Promscale experiences a high error rate when connecting to the database
expr: |
(
sum by (job) (
# Error counter exists for query, query_row & exec, and not for send_batch.
rate(promscale_database_request_errors_total{method=~"query.*|exec"}[5m])
)
/
sum by (job) (
rate(promscale_database_requests_total{method=~"query.*|exec"}[5m])
)
) > 0.05
labels:
severity: warning
- alert: PromscaleStorageHighLatency
annotations:
description: Slowest 10% of database requests are taking more than {{ $value
}} seconds to respond.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighLatency.md
summary: Slow database response
expr: |
(
histogram_quantile(0.9,
sum by (le, job, type) (
rate(promscale_database_requests_duration_seconds_bucket[5m])
)
) > 5
and
sum by (job, type) (
rate(promscale_database_requests_duration_seconds_count[5m])
) > 0
)
labels:
severity: warning
- name: promscale-database
rules:
- alert: PromscaleStorageUnhealthy
annotations:
description: Promscale database health checks are failing at a rate of {{ $value
| humanizePercentage }}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageUnhealthy.md
summary: Promscale database is unhealthy
expr: |
(
sum by (job) (
rate(promscale_sql_database_health_check_errors_total[5m])
)
/
sum by (job) (
rate(promscale_sql_database_health_check_total[5m])
)
) > 0.05
labels:
severity: warning
- alert: PromscaleMaintenanceJobRunningTooLong
annotations:
description: Promscale maintenance job has been running for {{ $value }} seconds
without completing.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
summary: Promscale maintenance jobs taking too long to complete
expr: |
(
(
(
time()
-
promscale_sql_database_worker_maintenance_job_start_timestamp_seconds
)
>
30 * 60 * 2 # 30 mins (we launch maintenance jobs scheduled at 30 mins) * 60 (to seconds) * 2 (wait max for 2 complete scans before firing alert).
)
and
promscale_sql_database_worker_maintenance_job_start_timestamp_seconds > 0
)
labels:
severity: warning
- alert: PromscaleMaintenanceJobFailures
annotations:
description: Promscale maintenance job failed to successfully execute.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobFailures.md
summary: Promscale maintenance job failed
expr: promscale_sql_database_worker_maintenance_job_failed == 1
labels:
severity: warning
- alert: PromscaleCompressionLow
annotations:
description: High uncompressed data in Promscale, on average, {{ $value }} uncompressed
chunks per metric.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCompressionLow.md
summary: High uncompressed data
expr: |
(
(
(promscale_sql_database_chunks_count - promscale_sql_database_chunks_compressed_count) # Number of uncompressed chunks.
/
promscale_sql_database_metric_count
) > 4 # If total number of average uncompressed chunk per metric is more than 4 chunks at maximum, we should alert.
and
promscale_sql_database_compression_status == 1
)
labels:
severity: warning

View file

@ -0,0 +1,308 @@
{
"__inputs": [
{
"description": "",
"label": "TimescaleDB / PostgreSQL data source",
"name": "DS_TIMESCALEDB",
"pluginId": "postgres",
"pluginName": "PostgreSQL",
"type": "datasource"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 11,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"promscale",
"apm"
],
"targetBlank": false,
"title": "Menu",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Total exec time"
},
"properties": [
{
"id": "unit",
"value": "ms"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Avg exec time"
},
"properties": [
{
"id": "unit",
"value": "ms"
},
{
"id": "decimals",
"value": 2
}
]
},
{
"matcher": {
"id": "byName",
"options": "Source"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Show service overview",
"url": "/d/YWfN6wL7z/?var-service=${__value.raw}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "Target"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Show service overview",
"url": "/d/YWfN6wL7z/?var-service=${__value.raw}"
}
]
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n p.service_name as \"Source\",\n k.service_name as \"Target\",\n k.span_name as \"Operation\",\n count(*) as \"Calls\",\n sum(k.duration_ms) as \"Total exec time\",\n avg(k.duration_ms) as \"Avg exec time\"\nFROM ps_trace.span p\nINNER JOIN ps_trace.span k\nON (p.trace_id = k.trace_id\nAND p.span_id = k.parent_span_id\nAND p.service_name != k.service_name)\nWHERE p.start_time >= NOW() - INTERVAL '10 minutes'\nAND k.start_time >= NOW() - INTERVAL '10 minutes'\nGROUP BY 1, 2, 3\nORDER BY 5 DESC",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Service Dependencies (last 10 minutes)",
"type": "table"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"description": "This map shows all services sending traces and the interactions between them.\nEach arrow represents a service calling a specific operation in another service. The legend in the arrows includes the requests per second for that interaction. If you are downsampling your traces before sending them to Promscale, then the number of requests per second will not be accurate but you'll be able to see how it compares to other operations.",
"gridPos": {
"h": 19,
"w": 24,
"x": 0,
"y": 8
},
"id": 2,
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT \n service_name as id,\n service_name as title\nFROM ps_trace.span\nWHERE start_time >= NOW() - INTERVAL '10 minutes'\nGROUP BY service_name",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"hide": false,
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n p.service_name || '->' || k.service_name || ':' || k.span_name as id,\n p.service_name as source,\n k.service_name as target,\n k.span_name as \"mainStat\",\n count(*) as \"secondaryStat\"\nFROM ps_trace.span p\nINNER JOIN ps_trace.span k\nON (p.trace_id = k.trace_id\nAND p.span_id = k.parent_span_id\nAND p.service_name != k.service_name)\nWHERE p.start_time >= NOW() - INTERVAL '10 minutes'\nAND k.start_time >= NOW() - INTERVAL '10 minutes'\nGROUP BY 1, 2, 3, 4",
"refId": "B",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Service Map (last 10 minutes)",
"type": "nodeGraph"
}
],
"schemaVersion": 34,
"style": "dark",
"tags": [
"promscale",
"apm"
],
"templating": {
"list": [ ]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"hidden": true
},
"timezone": "",
"title": "[3] Service Map",
"uid": "K03UKvPnz",
"version": 13,
"weekStart": ""
}

View file

@ -0,0 +1,531 @@
{
"__inputs": [
{
"description": "",
"label": "TimescaleDB / PostgreSQL data source",
"name": "DS_TIMESCALEDB",
"pluginId": "postgres",
"pluginName": "PostgreSQL",
"type": "datasource"
},
{
"description": "",
"label": "Promscale Jaeger Tracing data source",
"name": "DS_PROMSCALE_JAEGER",
"pluginId": "jaeger",
"pluginName": "Jaeger",
"type": "datasource"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 10,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"promscale",
"apm"
],
"targetBlank": false,
"title": "Menu",
"tooltip": "",
"type": "dashboards",
"url": ""
},
{
"asDropdown": false,
"icon": "doc",
"includeVars": false,
"keepTime": false,
"tags": [ ],
"targetBlank": true,
"title": "Documentation",
"tooltip": "",
"type": "link",
"url": "https://docs.timescale.com/promscale/latest/"
}
],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Requests"
},
"properties": [
{
"id": "unit",
"value": "reqps"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Avg Duration"
},
"properties": [
{
"id": "unit",
"value": "ms"
}
]
},
{
"matcher": {
"id": "byName",
"options": "p90 Duration"
},
"properties": [
{
"id": "unit",
"value": "ms"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Error rate"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Service"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Show service overview",
"url": "/d/YWfN6wL7z/?var-service=${__value.raw}"
}
]
}
]
}
]
},
"gridPos": {
"h": 24,
"w": 13,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [
{
"desc": false,
"displayName": "Avg Duration"
}
]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"hide": false,
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n service_name AS \"Service\",\n COUNT(*)::numeric / (30 * 60) AS \"Requests\",\n AVG(duration_ms) AS \"Avg Duration\",\n ROUND(approx_percentile(0.90, percentile_agg(duration_ms))::numeric, 3) AS \"p90 Duration\",\n (count(*) filter (where status_code = 'STATUS_CODE_ERROR')::numeric / count(*)) AS \"Error rate\"\nFROM ps_trace.span s\nWHERE start_time > NOW() - INTERVAL '30m'\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nGROUP BY 1\nORDER BY 2",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Services (Last 30 minutes)",
"type": "table"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Duration"
},
"properties": [
{
"id": "unit",
"value": "ms"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.width",
"value": 143
}
]
},
{
"matcher": {
"id": "byName",
"options": "Trace ID"
},
"properties": [
{
"id": "custom.width",
"value": 282
},
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "View trace details",
"url": "/explore?left=%5B%22${__from}%22,%22${__to}%22,%22${DS_PROMSCALE_JAEGER}%22,%7B\"query\":\"${__value.raw}\"%7D%5D"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "start_time"
},
"properties": [
{
"id": "custom.width",
"value": 182
}
]
},
{
"matcher": {
"id": "byName",
"options": "Trace ID"
},
"properties": [
{
"id": "custom.width",
"value": 94
}
]
},
{
"matcher": {
"id": "byName",
"options": "Service"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Show service overview",
"url": "/d/YWfN6wL7z/?var-service=${__value.raw}"
}
]
}
]
}
]
},
"gridPos": {
"h": 12,
"w": 11,
"x": 13,
"y": 0
},
"id": 4,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [ ]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n replace(trace_id::text, '-'::text, ''::text) as \"Trace ID\",\n service_name as \"Service\",\n span_name as \"Operation\",\n start_time as \"Time\",\n duration_ms as \"Duration\" \nFROM ps_trace.span\nWHERE start_time > NOW() - INTERVAL '30m'\nAND parent_span_id is null\nORDER BY duration_ms DESC\nLIMIT 50\n;",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Slowest Requests (last 30 minutes)",
"type": "table"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Service"
},
"properties": [
{
"id": "links",
"value": [
{
"title": "Show service overview",
"url": "/d/YWfN6wL7z/?var-service=${__value.raw}"
}
]
}
]
}
]
},
"gridPos": {
"h": 12,
"w": 11,
"x": 13,
"y": 12
},
"id": 5,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [ ]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n status_message as \"Error\",\n service_name as \"Service\",\n count(*) as \"Occurrences\" \nFROM ps_trace.span\nWHERE start_time > NOW() - INTERVAL '30m'\nAND status_code = 'STATUS_CODE_ERROR'\nGROUP BY 1, 2\nORDER BY 3\n;",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Most Common Errors (last 30 minutes)",
"type": "table"
}
],
"schemaVersion": 34,
"style": "dark",
"tags": [
"promscale",
"apm"
],
"templating": {
"list": [ ]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"hidden": true
},
"timezone": "",
"title": "[1] Overview",
"uid": "vBhEewLnk",
"version": 35,
"weekStart": ""
}

View file

@ -0,0 +1,202 @@
{
"__inputs": [
{
"description": "",
"label": "TimescaleDB / PostgreSQL data source",
"name": "DS_TIMESCALEDB",
"pluginId": "postgres",
"pluginName": "PostgreSQL",
"type": "datasource"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 13,
"iteration": 1647423383157,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"promscale",
"apm"
],
"targetBlank": false,
"title": "Menu",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"description": "A.K.A. \"Who do I call?\"",
"gridPos": {
"h": 20,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "WITH RECURSIVE x AS\n(\n SELECT\n trace_id,\n span_id,\n parent_span_id,\n service_name,\n span_name\n FROM ps_trace.span\n WHERE start_time > NOW() - INTERVAL '10 minutes'\n AND service_name = '${service}'\n AND span_name = '${operation}'\n UNION ALL\n SELECT\n s.trace_id,\n s.span_id,\n s.parent_span_id,\n s.service_name,\n s.span_name\n FROM x\n INNER JOIN ps_trace.span s\n ON (x.trace_id = s.trace_id\n AND x.span_id = s.parent_span_id)\n AND s.start_time > NOW() - INTERVAL '10 minutes'\n)\nSELECT\n md5(service_name || '-' || span_name) as id,\n span_name as title,\n service_name as \"subTitle\",\n count(*) as \"mainStat\"\nFROM x\nGROUP BY service_name, span_name",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"hide": false,
"metricColumn": "none",
"rawQuery": true,
"rawSql": "WITH RECURSIVE x AS\n(\n SELECT\n trace_id,\n span_id,\n parent_span_id,\n service_name,\n span_name,\n null::text as id,\n null::text as source,\n null::text as target\n FROM ps_trace.span\n WHERE start_time > NOW() - INTERVAL '10 minutes'\n AND service_name = '${service}'\n AND span_name = '${operation}'\n UNION ALL\n SELECT\n s.trace_id,\n s.span_id,\n s.parent_span_id,\n s.service_name,\n s.span_name,\n md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' || x.span_name) as id,\n md5(x.service_name || '-' || x.span_name) as source,\n md5(s.service_name || '-' || s.span_name) as target\n FROM x\n INNER JOIN ps_trace.span s\n ON (x.trace_id = s.trace_id\n AND x.span_id = s.parent_span_id)\n AND s.start_time > NOW() - INTERVAL '10 minutes'\n)\nSELECT DISTINCT\n x.id,\n x.source,\n x.target \nFROM x\nWHERE id is not null",
"refId": "B",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Map of Downstream Dependencies (last 10 minutes)",
"transformations": [ ],
"type": "nodeGraph"
}
],
"schemaVersion": 34,
"style": "dark",
"tags": [
"promscale",
"apm"
],
"templating": {
"list": [
{
"allValue": "ALL",
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"definition": "SELECT DISTINCT service_name FROM ps_trace.span WHERE start_time > NOW() - INTERVAL '10 minutes'\n",
"hide": 0,
"includeAll": false,
"label": "Service",
"multi": false,
"name": "service",
"options": [ ],
"query": "SELECT DISTINCT service_name FROM ps_trace.span WHERE start_time > NOW() - INTERVAL '10 minutes'\n",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "ALL",
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"definition": "SELECT DISTINCT span_name FROM ps_trace.span WHERE service_name = ${service:sqlstring} AND start_time > NOW() - INTERVAL '10 minutes'\n)",
"hide": 0,
"includeAll": false,
"label": "Operation",
"multi": false,
"name": "operation",
"options": [ ],
"query": "SELECT DISTINCT span_name FROM ps_trace.span WHERE service_name = ${service:sqlstring} AND start_time > NOW() - INTERVAL '10 minutes'\n",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"hidden": true
},
"timezone": "",
"title": "[4] Downstream Dependencies",
"uid": "SDJmJvPnz",
"version": 7,
"weekStart": ""
}

View file

@ -0,0 +1,203 @@
{
"__inputs": [
{
"description": "",
"label": "TimescaleDB / PostgreSQL data source",
"name": "DS_TIMESCALEDB",
"pluginId": "postgres",
"pluginName": "PostgreSQL",
"type": "datasource"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Map of upstream service dependencies for a specific service and operation",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 12,
"iteration": 1647519937731,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"promscale",
"apm"
],
"targetBlank": false,
"title": "Menu",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"description": "A.K.A. \"Who called me?\"",
"gridPos": {
"h": 26,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "WITH RECURSIVE x AS\n(\n SELECT\n trace_id,\n span_id,\n parent_span_id,\n service_name,\n span_name\n FROM ps_trace.span\n WHERE start_time > NOW() - INTERVAL '10 minutes'\n AND service_name = '${service}'\n AND span_name = '${operation}'\n UNION ALL\n SELECT\n s.trace_id,\n s.span_id,\n s.parent_span_id,\n s.service_name,\n s.span_name\n FROM x\n INNER JOIN ps_trace.span s\n ON (x.trace_id = s.trace_id\n AND x.parent_span_id = s.span_id)\n AND s.start_time > NOW() - INTERVAL '10 minutes'\n)\nSELECT\n md5(service_name || '-' || span_name) as id,\n span_name as title,\n service_name as \"subTitle\",\n count(*) as \"mainStat\"\nFROM x\nGROUP BY service_name, span_name",
"refId": "A",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"hide": false,
"metricColumn": "none",
"rawQuery": true,
"rawSql": "WITH RECURSIVE x AS\n(\n SELECT\n trace_id,\n span_id,\n parent_span_id,\n service_name,\n span_name,\n null::text as id,\n null::text as target,\n null::text as source\n FROM ps_trace.span\n WHERE start_time > NOW() - INTERVAL '10 minutes'\n AND service_name = '${service}'\n AND span_name = '${operation}'\n UNION ALL\n SELECT\n s.trace_id,\n s.span_id,\n s.parent_span_id,\n s.service_name,\n s.span_name,\n md5(s.service_name || '-' || s.span_name || '-' || x.service_name || '-' || x.span_name) as id,\n md5(x.service_name || '-' || x.span_name) as target,\n md5(s.service_name || '-' || s.span_name) as source\n FROM x\n INNER JOIN ps_trace.span s\n ON (x.trace_id = s.trace_id\n AND x.parent_span_id = s.span_id)\n AND s.start_time > NOW() - INTERVAL '10 minutes'\n)\nSELECT DISTINCT\n x.id,\n x.target,\n x.source \nFROM x\nWHERE id is not null",
"refId": "B",
"select": [
[
{
"params": [
"span_duration_ms"
],
"type": "column"
}
]
],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Map of Upstream Depencies (last 10 minutes)",
"transformations": [ ],
"type": "nodeGraph"
}
],
"schemaVersion": 34,
"style": "dark",
"tags": [
"promscale",
"apm"
],
"templating": {
"list": [
{
"allValue": "ALL",
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"definition": "SELECT DISTINCT service_name FROM ps_trace.span WHERE start_time > NOW() - INTERVAL '10 minutes'\n",
"hide": 0,
"includeAll": false,
"label": "Service",
"multi": false,
"name": "service",
"options": [ ],
"query": "SELECT DISTINCT service_name FROM ps_trace.span WHERE start_time > NOW() - INTERVAL '10 minutes'\n",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "ALL",
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"definition": "SELECT DISTINCT span_name FROM ps_trace.span WHERE service_name = ${service:sqlstring} AND start_time > NOW() - INTERVAL '10 minutes'\n)",
"hide": 0,
"includeAll": false,
"label": "Operation",
"multi": false,
"name": "operation",
"options": [ ],
"query": "SELECT DISTINCT span_name FROM ps_trace.span WHERE service_name = ${service:sqlstring} AND start_time > NOW() - INTERVAL '10 minutes'\n",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"hidden": true
},
"timezone": "",
"title": "[5] Upstream Dependencies",
"uid": "o4PPTDPnz",
"version": 14,
"weekStart": ""
}

View file

@ -0,0 +1,751 @@
{
"__inputs": [
{
"description": "",
"label": "TimescaleDB / PostgreSQL data source",
"name": "DS_TIMESCALEDB",
"pluginId": "postgres",
"pluginName": "PostgreSQL",
"type": "datasource"
},
{
"description": "",
"label": "Promscale Jaeger Tracing data source",
"name": "DS_PROMSCALE_JAEGER",
"pluginId": "jaeger",
"pluginName": "Jaeger",
"type": "datasource"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 2,
"id": 9,
"iteration": 1647523274899,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"promscale",
"apm"
],
"targetBlank": false,
"title": "Menu",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "reqps"
},
"overrides": [ ]
},
"gridPos": {
"h": 11,
"w": 8,
"x": 0,
"y": 0
},
"id": 2,
"interval": "1s",
"maxDataPoints": 300,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "time_series",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n time_bucket_gapfill('$__interval', start_time) AS time,\n coalesce(count(*)::numeric / (EXTRACT(epoch FROM '$__interval'::interval)), 0) AS \"Requests\"\nFROM ps_trace.span s\nWHERE $__timeFilter(start_time)\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nAND service_name = '${service}'\nGROUP BY 1\nORDER BY 1",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": [ ]
},
"gridPos": {
"h": 11,
"w": 8,
"x": 8,
"y": 0
},
"id": 3,
"interval": "1s",
"maxDataPoints": 300,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "time_series",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n time_bucket_gapfill('$__interval', start_time) AS time,\n COALESCE(ROUND(approx_percentile(0.99, percentile_agg(duration_ms))::numeric, 3), 0) as \"p99\",\n COALESCE(ROUND(approx_percentile(0.90, percentile_agg(duration_ms))::numeric, 3), 0) as \"p90\",\n COALESCE(ROUND(approx_percentile(0.50, percentile_agg(duration_ms))::numeric, 3), 0) as \"p50\",\n COALESCE(AVG(duration_ms), 0) as \"Average\"\nFROM ps_trace.span s\nWHERE $__timeFilter(start_time)\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nAND service_name = '${service}'\nGROUP BY 1\nORDER BY 1",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"axisSoftMax": 1,
"axisSoftMin": 0,
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "percentunit"
},
"overrides": [ ]
},
"gridPos": {
"h": 11,
"w": 8,
"x": 16,
"y": 0
},
"id": 4,
"interval": "1s",
"maxDataPoints": 300,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "time_series",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n time_bucket('$__interval', start_time) as time,\n coalesce(count(*) filter (where status_code = 'STATUS_CODE_ERROR')::numeric / count(*), 0) as \"Error rate\"\nFROM ps_trace.span s\nWHERE $__timeFilter(start_time)\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nAND service_name = '${service}'\nGROUP BY 1\nORDER BY 1",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Error Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Requests"
},
"properties": [
{
"id": "unit",
"value": "reqps"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Avg Duration"
},
"properties": [
{
"id": "unit",
"value": "ms"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Error rate"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
}
]
},
"gridPos": {
"h": 12,
"w": 8,
"x": 0,
"y": 11
},
"id": 10,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [ ]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n span_name as \"Operation\",\n count(*)::numeric / (${__to:date:seconds} - ${__from:date:seconds}) AS \"Requests\",\n sum(duration_ms) / count(*)::numeric as \"Avg Duration\",\n coalesce((count(*) filter (where status_code = 'STATUS_CODE_ERROR')::numeric / count(*)), 0) as \"Error rate\"\nFROM ps_trace.span s\nWHERE $__timeFilter(start_time)\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nAND service_name = '${service}'\nGROUP BY 1\nORDER BY 1",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Statistics by Operation",
"type": "table"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Duration"
},
"properties": [
{
"id": "unit",
"value": "ms"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.width",
"value": 143
}
]
},
{
"matcher": {
"id": "byName",
"options": "Trace ID"
},
"properties": [
{
"id": "custom.width",
"value": 282
},
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "View trace details",
"url": "/explore?left=%5B%22${__from}%22,%22${__to}%22,%22${DS_PROMSCALE_JAEGER}%22,%7B\"query\":\"${__value.raw}\"%7D%5D"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "start_time"
},
"properties": [
{
"id": "custom.width",
"value": 182
}
]
},
{
"matcher": {
"id": "byName",
"options": "Trace ID"
},
"properties": [
{
"id": "custom.width",
"value": 94
}
]
}
]
},
"gridPos": {
"h": 12,
"w": 8,
"x": 8,
"y": 11
},
"id": 7,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [ ]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n replace(trace_id::text, '-'::text, ''::text) as \"Trace ID\",\n span_name as \"Operation\",\n start_time as \"Time\",\n duration_ms as \"Duration\"\nFROM ps_trace.span\nWHERE $__timeFilter(start_time)\nAND (span_kind = 'SPAN_KIND_SERVER' OR parent_span_id is NULL)\nAND service_name = '${service}'\nORDER BY duration_ms DESC\nLIMIT 50\n;",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Slowest Operation Executions",
"type": "table"
},
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 12,
"w": 8,
"x": 16,
"y": 11
},
"id": 9,
"options": {
"footer": {
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": true,
"sortBy": [ ]
},
"pluginVersion": "8.3.3",
"targets": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"format": "table",
"group": [ ],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT\n status_message as \"Error\",\n count(*) as \"Occurrences\"\nFROM ps_trace.span\nWHERE $__timeFilter(start_time) AND\nstatus_code = 'STATUS_CODE_ERROR' AND\nservice_name = '${service}'\nGROUP BY 1\nORDER BY 2 DESC\n;",
"refId": "A",
"select": [ ],
"table": "event",
"timeColumn": "\"time\"",
"timeColumnType": "timestamp",
"where": [
{
"name": "$__timeFilter",
"params": [ ],
"type": "macro"
}
]
}
],
"title": "Most Common Errors",
"type": "table"
}
],
"refresh": "",
"schemaVersion": 34,
"style": "dark",
"tags": [
"promscale",
"apm"
],
"templating": {
"list": [
{
"datasource": {
"type": "postgres",
"uid": "${DS_TIMESCALEDB}"
},
"definition": "SELECT \n distinct(service_name)\nFROM ps_trace.span\nWHERE $__timeFilter(start_time)\n",
"hide": 0,
"includeAll": false,
"label": "Service",
"multi": false,
"name": "service",
"options": [ ],
"query": "SELECT \n distinct(service_name)\nFROM ps_trace.span\nWHERE $__timeFilter(start_time)\n",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": { },
"timezone": "",
"title": "[2] Service Details",
"uid": "YWfN6wL7z",
"version": 36,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
null

View file

@ -463,7 +463,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-bucket-replicate.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -1771,7 +1771,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-compact.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -2142,29 +2142,6 @@
"query": "5m,10m,30m,1h,6h,12h",
"refresh": 2,
"type": "interval"
},
{
"allValue": null,
"current": {
"text": "all",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": "job",
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},

View file

@ -1069,7 +1069,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-query-frontend.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -1830,7 +1830,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-query.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -2174,7 +2174,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-receive.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -1822,7 +1822,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-rule.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -1459,7 +1459,7 @@
"multi": false,
"name": "job",
"options": [ ],
"query": "label_values(up{job=~\".*thanos-store.*\"}, job)",
"query": "label_values(up{job=~\".*thanos-sidecar.*\"}, job)",
"refresh": 1,
"regex": "",
"sort": 2,

View file

@ -0,0 +1,432 @@
---
title: promscale
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/timescale/promscale](https://github.com/timescale/promscale/tree/master/docs/mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/alerts.yaml).
{{< /panel >}}
### promscale-general
##### PromscaleDown
{{< code lang="yaml" >}}
alert: PromscaleDown
annotations:
description: No Promscale instance was found.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDown.md
summary: Promscale is down
expr: absent(up{job=~".*promscale.*"})
labels:
severity: critical
{{< /code >}}
### promscale-ingest
##### PromscaleIngestHighErrorRate
{{< code lang="yaml" >}}
alert: PromscaleIngestHighErrorRate
annotations:
description: Promscale ingestion is having a {{ $value | humanizePercentage }} error
rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
summary: High error rate in Promscale ingestion
expr: |
(
sum by (job, instance, type) (
rate(promscale_ingest_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_ingest_requests_total[5m])
)
) > 0.05
labels:
severity: warning
{{< /code >}}
##### PromscaleIngestHighErrorRate
{{< code lang="yaml" >}}
alert: PromscaleIngestHighErrorRate
annotations:
description: Promscale ingestion is having a {{ $value | humanizePercentage }} error
rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
summary: High error rate in Promscale ingestion
expr: |
(
sum by (job, instance, type) (
rate(promscale_ingest_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_ingest_requests_total[5m])
)
) > 0.1
labels:
severity: critical
{{< /code >}}
##### PromscaleIngestHighLatency
{{< code lang="yaml" >}}
alert: PromscaleIngestHighLatency
annotations:
description: Slowest 10% of ingestion batch took more than {{ $value }} seconds
to ingest.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
summary: Slow Promscale ingestion
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 10
and
sum by (job, instance, type) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 0
for: 5m
labels:
severity: warning
{{< /code >}}
##### PromscaleIngestHighLatency
{{< code lang="yaml" >}}
alert: PromscaleIngestHighLatency
annotations:
description: Slowest 10% of ingestion batch took more than {{ $value }} seconds
to ingest.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
summary: Slow Promscale ingestion
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 30
and
sum by (job, instance, type) (
rate(promscale_ingest_duration_seconds_bucket[5m])
)
) > 0
for: 5m
labels:
severity: critical
{{< /code >}}
### promscale-query
##### PromscaleQueryHighErrorRate
{{< code lang="yaml" >}}
alert: PromscaleQueryHighErrorRate
annotations:
description: Evaluating queries via Promscale has {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
summary: High error rate in querying Promscale
expr: |
(
sum by (job, instance, type) (
rate(promscale_query_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_query_requests_total[5m])
)
) > 0.05
labels:
severity: warning
{{< /code >}}
##### PromscaleQueryHighErrorRate
{{< code lang="yaml" >}}
alert: PromscaleQueryHighErrorRate
annotations:
description: Evaluating queries via Promscale had {{ $value | humanizePercentage
}} error rate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
summary: High error rate in querying Promscale
expr: |
(
sum by (job, instance, type) (
rate(promscale_query_requests_total{code=~"5.."}[5m])
)
/
sum by (job, instance, type) (
rate(promscale_query_requests_total[5m])
)
) > 0.1
labels:
severity: critical
{{< /code >}}
##### PromscaleQueryHighLatency
{{< code lang="yaml" >}}
alert: PromscaleQueryHighLatency
annotations:
description: Slowest 10% of the queries took more than {{ $value }} seconds to evaluate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
summary: Slow Promscale querying
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_query_duration_seconds_bucket[5m])
)
) > 5
and
sum by (job, instance, type) (
rate(promscale_query_duration_seconds_bucket[5m])
) > 0
)
for: 5m
labels:
severity: warning
{{< /code >}}
##### PromscaleQueryHighLatency
{{< code lang="yaml" >}}
alert: PromscaleQueryHighLatency
annotations:
description: Slowest 10% of the queries took {{ $value }} seconds to evaluate.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
summary: Slow Promscale querying
expr: |
(
histogram_quantile(
0.90,
sum by (job, instance, type, le) (
rate(promscale_query_duration_seconds_bucket[5m])
)
) > 10
and
sum by (job, instance, type) (
rate(promscale_query_duration_seconds_bucket[5m])
) > 0
)
for: 5m
labels:
severity: critical
{{< /code >}}
### promscale-cache
##### PromscaleCacheHighNumberOfEvictions
{{< code lang="yaml" >}}
alert: PromscaleCacheHighNumberOfEvictions
annotations:
description: Promscale {{ $labels.name }} is evicting at {{ $value }} entries a
second.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheHighNumberOfEvictions.md
summary: High cache eviction in Promscale
expr: |
(
sum by (job, instance, name, type) (
rate(promscale_cache_evictions_total[5m])
)
/
sum by (job, instance, name, type) (
promscale_cache_capacity_elements
)
) > 0.2
labels:
severity: warning
{{< /code >}}
##### PromscaleCacheTooSmall
{{< code lang="yaml" >}}
alert: PromscaleCacheTooSmall
annotations:
description: Promscale {{ $labels.name }} has a hit ratio of {{ $value | humanizePercentage
}}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheTooSmall.md
summary: Low cache hit ratio in Promscale
expr: |
(
sum by (job, instance, type, name) (
rate(promscale_cache_query_hits_total[5m])
)
/
sum by (job, instance, type, name) (
rate(promscale_cache_queries_total[5m])
)
) < 0.9
labels:
severity: warning
{{< /code >}}
### promscale-database-connection
##### PromscaleStorageHighErrorRate
{{< code lang="yaml" >}}
alert: PromscaleStorageHighErrorRate
annotations:
description: Promscale connection with the database has an error rate of {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighErrorRate.md
summary: Promscale experiences a high error rate when connecting to the database
expr: |
(
sum by (job) (
# Error counter exists for query, query_row & exec, and not for send_batch.
rate(promscale_database_request_errors_total{method=~"query.*|exec"}[5m])
)
/
sum by (job) (
rate(promscale_database_requests_total{method=~"query.*|exec"}[5m])
)
) > 0.05
labels:
severity: warning
{{< /code >}}
##### PromscaleStorageHighLatency
{{< code lang="yaml" >}}
alert: PromscaleStorageHighLatency
annotations:
description: Slowest 10% of database requests are taking more than {{ $value }}
seconds to respond.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighLatency.md
summary: Slow database response
expr: |
(
histogram_quantile(0.9,
sum by (le, job, type) (
rate(promscale_database_requests_duration_seconds_bucket[5m])
)
) > 5
and
sum by (job, type) (
rate(promscale_database_requests_duration_seconds_count[5m])
) > 0
)
labels:
severity: warning
{{< /code >}}
### promscale-database
##### PromscaleStorageUnhealthy
{{< code lang="yaml" >}}
alert: PromscaleStorageUnhealthy
annotations:
description: Promscale database health checks have an error rate of {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageUnhealthy.md
summary: Promscale database is unhealthy
expr: |
(
sum by (job) (
rate(promscale_sql_database_health_check_errors_total[5m])
)
/
sum by (job) (
rate(promscale_sql_database_health_check_total[5m])
)
) > 0.05
labels:
severity: warning
{{< /code >}}
##### PromscaleMaintenanceJobRunningTooLong
{{< code lang="yaml" >}}
alert: PromscaleMaintenanceJobRunningTooLong
annotations:
description: The most recent Promscale maintenance job was started {{ $value }}
seconds ago.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
summary: Promscale maintenance jobs taking too long to complete
expr: |
(
(
(
time()
-
promscale_sql_database_worker_maintenance_job_start_timestamp_seconds
)
>
30 * 60 * 2 # 30 mins (we launch maintenance jobs scheduled at 30 mins) * 60 (to seconds) * 2 (wait max for 2 complete scans before firing alert).
)
and
promscale_sql_database_worker_maintenance_job_start_timestamp_seconds > 0
)
labels:
severity: warning
{{< /code >}}
##### PromscaleMaintenanceJobFailures
{{< code lang="yaml" >}}
alert: PromscaleMaintenanceJobFailures
annotations:
description: Promscale maintenance job failed to successfully execute.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobFailures.md
summary: Promscale maintenance job failed
expr: promscale_sql_database_worker_maintenance_job_failed == 1
labels:
severity: warning
{{< /code >}}
##### PromscaleCompressionLow
{{< code lang="yaml" >}}
alert: PromscaleCompressionLow
annotations:
description: High uncompressed data in Promscale, on average, {{ $value }} uncompressed
chunks per metric.
runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCompressionLow.md
summary: High uncompressed data
expr: |
(
(
(promscale_sql_database_chunks_count - promscale_sql_database_chunks_compressed_count) # Number of uncompressed chunks.
/
promscale_sql_database_metric_count
) > 4 # If total number of average uncompressed chunk per metric is more than 4 chunks at maximum, we should alert.
and
promscale_sql_database_compression_status == 1
)
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [apm-dependencies](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/apm-dependencies.json)
- [apm-home](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/apm-home.json)
- [apm-service-dependencies-downstream](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/apm-service-dependencies-downstream.json)
- [apm-service-dependencies-upstream](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/apm-service-dependencies-upstream.json)
- [apm-service-overview](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/apm-service-overview.json)
- [promscale](https://github.com/monitoring-mixins/website/blob/master/assets/promscale/dashboards/promscale.json)

View file

@ -114,6 +114,11 @@
"name": "promtail",
"source": "https://github.com/grafana/loki",
"subdir": "production/promtail-mixin"
},
{
"name": "promscale",
"source": "https://github.com/timescale/promscale",
"subdir": "docs/mixin"
}
]
}