Mirror of https://github.com/monitoring-mixins/website.git (synced 2024-12-14 11:37:31 +00:00)

Commit 313c81e897 (parent 53269c316f): assets,site/content: daily assets regeneration

19 changed files with 21291 additions and 4 deletions
@@ -1 +1,41 @@
null
groups:
- name: loki_alerts
  rules:
  - alert: LokiRequestErrors
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
    expr: |
      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
      /
      sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
      > 10
    for: 15m
    labels:
      severity: critical
  - alert: LokiRequestPanics
    annotations:
      message: |
        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
    expr: |
      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
    labels:
      severity: critical
  - alert: LokiRequestLatency
    annotations:
      message: |
        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
    expr: |
      namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
    for: 15m
    labels:
      severity: critical
  - alert: LokiTooManyCompactorsRunning
    annotations:
      message: |
        {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
    expr: |
      sum(loki_boltdb_shipper_compactor_running) by (namespace) > 1
    for: 5m
    labels:
      severity: warning
1336  assets/loki/dashboards/loki-chunks.json  (Normal file; file diff suppressed because it is too large)
544   assets/loki/dashboards/loki-deletion.json  (Normal file)
@@ -0,0 +1,544 @@
{
  "annotations": {
    "list": [ ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "hideControls": false,
  "links": [
    {
      "asDropdown": true,
      "icon": "external link",
      "includeVars": true,
      "keepTime": true,
      "tags": [
        "loki"
      ],
      "targetBlank": false,
      "title": "Loki Dashboards",
      "type": "dashboards"
    }
  ],
  "refresh": "10s",
  "rows": [
    {
      "collapse": false,
      "height": "100px",
      "panels": [
        {
          "aliasColors": { },
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "$datasource",
          "fill": 1,
          "format": "none",
          "id": 1,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [ ],
          "nullPointMode": "null as zero",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [ ],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(loki_compactor_pending_delete_requests_count{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
              "format": "time_series",
              "instant": true,
              "intervalFactor": 2,
              "refId": "A"
            }
          ],
          "thresholds": "70,80",
          "timeFrom": null,
          "timeShift": null,
          "title": "Number of Pending Requests",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "singlestat",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [ ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": false
            }
          ]
        },
        {
          "aliasColors": { },
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "$datasource",
          "fill": 1,
          "format": "dtdurations",
          "id": 2,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [ ],
          "nullPointMode": "null as zero",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [ ],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "max(loki_compactor_oldest_pending_delete_request_age_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
              "format": "time_series",
              "instant": true,
              "intervalFactor": 2,
              "refId": "A"
            }
          ],
          "thresholds": "70,80",
          "timeFrom": null,
          "timeShift": null,
          "title": "Oldest Pending Request Age",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "singlestat",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [ ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": false
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": false,
      "title": "Headlines",
      "titleSize": "h6"
    },
    {
      "collapse": false,
      "height": "250px",
      "panels": [
        {
          "aliasColors": { },
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "$datasource",
          "fill": 1,
          "id": 3,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [ ],
          "nullPointMode": "null as zero",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [ ],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "received",
              "legendLink": null,
              "step": 10
            }
          ],
          "thresholds": [ ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Delete Requests Received / Day",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [ ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": false
            }
          ]
        },
        {
          "aliasColors": { },
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "$datasource",
          "fill": 1,
          "id": 4,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [ ],
          "nullPointMode": "null as zero",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [ ],
          "spaceLength": 10,
          "span": 6,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(increase(loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "processed",
              "legendLink": null,
              "step": 10
            }
          ],
          "thresholds": [ ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Delete Requests Processed / Day",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [ ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": false
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": true,
      "title": "Churn",
      "titleSize": "h6"
    },
    {
      "collapse": false,
      "height": "250px",
      "panels": [
        {
          "aliasColors": { },
          "bars": false,
          "dashLength": 10,
          "dashes": false,
          "datasource": "$datasource",
          "fill": 1,
          "id": 5,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [ ],
          "nullPointMode": "null as zero",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [ ],
          "spaceLength": 10,
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "expr": "sum(increase(loki_compactor_load_pending_requests_attempts_total{status=\"fail\", cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]))",
              "format": "time_series",
              "intervalFactor": 2,
              "legendFormat": "failures",
              "legendLink": null,
              "step": 10
            }
          ],
          "thresholds": [ ],
          "timeFrom": null,
          "timeShift": null,
          "title": "Failures in Loading Delete Requests / Hour",
          "tooltip": {
            "shared": true,
            "sort": 2,
            "value_type": "individual"
          },
          "type": "graph",
          "xaxis": {
            "buckets": null,
            "mode": "time",
            "name": null,
            "show": true,
            "values": [ ]
          },
          "yaxes": [
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": 0,
              "show": true
            },
            {
              "format": "short",
              "label": null,
              "logBase": 1,
              "max": null,
              "min": null,
              "show": false
            }
          ]
        }
      ],
      "repeat": null,
      "repeatIteration": null,
      "repeatRowId": null,
      "showTitle": true,
      "title": "Failures",
      "titleSize": "h6"
    }
  ],
  "schemaVersion": 14,
  "style": "dark",
  "tags": [
    "loki"
  ],
  "templating": {
    "list": [
      {
        "current": {
          "text": "default",
          "value": "default"
        },
        "hide": 0,
        "label": "Data Source",
        "name": "datasource",
        "options": [ ],
        "query": "prometheus",
        "refresh": 1,
        "regex": "",
        "type": "datasource"
      },
      {
        "allValue": null,
        "current": {
          "text": "prod",
          "value": "prod"
        },
        "datasource": "$datasource",
        "hide": 0,
        "includeAll": false,
        "label": "cluster",
        "multi": false,
        "name": "cluster",
        "options": [ ],
        "query": "label_values(loki_build_info, cluster)",
        "refresh": 1,
        "regex": "",
        "sort": 2,
        "tagValuesQuery": "",
        "tags": [ ],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": null,
        "current": {
          "text": "prod",
          "value": "prod"
        },
        "datasource": "$datasource",
        "hide": 0,
        "includeAll": false,
        "label": "namespace",
        "multi": false,
        "name": "namespace",
        "options": [ ],
        "query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)",
        "refresh": 1,
        "regex": "",
        "sort": 2,
        "tagValuesQuery": "",
        "tags": [ ],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "utc",
  "title": "Loki / Deletion",
  "uid": "deletion",
  "version": 0
}
1073  assets/loki/dashboards/loki-logs.json  (Normal file; file diff suppressed because it is too large)
657   assets/loki/dashboards/loki-mixin-recording-rules.json  (Normal file)
@@ -0,0 +1,657 @@
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [ ],
          "type": "dashboard"
        },
        "type": "dashboard"
      },
      {
        "datasource": "${datasource}",
        "enable": false,
        "expr": "sum by (tenant) (changes(loki_ruler_wal_prometheus_tsdb_wal_truncations_total{tenant=~\"${tenant}\"}[$__rate_interval]))",
        "iconColor": "red",
        "name": "WAL Truncations",
        "target": {
          "queryType": "Azure Monitor",
          "refId": "Anno"
        },
        "titleFormat": "{{tenant}}"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "gnetId": null,
  "graphTooltip": 0,
  "iteration": 1635347545534,
  "links": [ ],
  "liveNow": false,
  "panels": [
    {
      "datasource": "${datasource}",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [ ],
          "noValue": "0",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 2,
        "x": 0,
        "y": 0
      },
      "id": 2,
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "auto"
      },
      "pluginVersion": "8.3.0-38205pre",
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": false,
          "expr": "sum(loki_ruler_wal_appender_ready) by (pod, tenant) == 0",
          "instant": true,
          "interval": "",
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "Appenders Not Ready",
      "type": "stat"
    },
    {
      "datasource": "${datasource}",
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 11,
        "x": 2,
        "y": 0
      },
      "id": 4,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "sum(rate(loki_ruler_wal_samples_appended_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "Samples Appended to WAL per Second",
      "type": "timeseries"
    },
    {
      "datasource": "${datasource}",
      "description": "Series are unique combinations of labels",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 11,
        "x": 13,
        "y": 0
      },
      "id": 5,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "sum(rate(loki_ruler_wal_storage_created_series_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "Series Created per Second",
      "type": "timeseries"
    },
    {
      "datasource": "${datasource}",
      "description": "Difference between highest timestamp appended to WAL and highest timestamp successfully written to remote storage",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 0,
        "y": 10
      },
      "id": 6,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds{tenant=~\"${tenant}\"}\n- on (tenant)\n (\n loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds{tenant=~\"${tenant}\"}\n or vector(0)\n )",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "Write Behind",
      "type": "timeseries"
    },
    {
      "datasource": "${datasource}",
      "description": "",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 12,
        "y": 10
      },
      "id": 7,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "sum(rate(loki_ruler_wal_prometheus_remote_storage_samples_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "Samples Sent per Second",
      "type": "timeseries"
    },
    {
      "datasource": "${datasource}",
      "description": "\n",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "bytes"
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 0,
        "y": 20
      },
      "id": 8,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "sum by (tenant) (loki_ruler_wal_disk_size{tenant=~\"${tenant}\"})",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "WAL Disk Size",
      "type": "timeseries"
    },
    {
      "datasource": "${datasource}",
      "description": "Some number of pending samples is expected, but if remote-write is failing this value will remain high",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [ ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": [ ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 12,
        "y": 20
      },
      "id": 9,
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list",
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "single"
        }
      },
      "targets": [
        {
          "datasource": "${datasource}",
          "exemplar": true,
          "expr": "max(loki_ruler_wal_prometheus_remote_storage_samples_pending{tenant=~\"${tenant}\"}) by (tenant,pod) > 0",
          "interval": "",
          "legendFormat": "{{tenant}}",
          "refId": "A"
        }
      ],
      "title": "Pending Samples",
      "type": "timeseries"
    }
  ],
  "schemaVersion": 31,
  "style": "dark",
  "tags": [ ],
  "templating": {
    "list": [
      {
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": false,
        "label": "Datasource",
        "multi": false,
        "name": "datasource",
        "options": [ ],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      },
      {
        "allValue": null,
        "datasource": "${datasource}",
        "definition": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": true,
        "label": "Tenant",
        "multi": true,
        "name": "tenant",
        "options": [ ],
        "query": {
          "query": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": { },
  "timezone": "",
  "title": "Recording Rules",
  "uid": "2xKA_ZK7k",
  "version": 9,
  "weekStart": ""
}
6675  assets/loki/dashboards/loki-operational.json  (Normal file; file diff suppressed because it is too large)
2268  assets/loki/dashboards/loki-reads-resources.json  (Normal file; file diff suppressed because it is too large)
1256  assets/loki/dashboards/loki-reads.json  (Normal file; file diff suppressed because it is too large)
1537  assets/loki/dashboards/loki-retention.json  (Normal file; file diff suppressed because it is too large)
1240  assets/loki/dashboards/loki-writes-resources.json  (Normal file; file diff suppressed because it is too large)
1068  assets/loki/dashboards/loki-writes.json  (Normal file; file diff suppressed because it is too large)
@@ -1 +1,49 @@
null
groups:
- name: loki_rules
  rules:
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, job))
    record: job:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, job))
    record: job:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
      by (job)
    record: job:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
    record: job:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
    record: job:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
    record: job:loki_request_duration_seconds_count:sum_rate
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, job, route))
    record: job_route:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, job, route))
    record: job_route:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
      by (job, route)
    record: job_route:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
    record: job_route:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
    record: job_route:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
    record: job_route:loki_request_duration_seconds_count:sum_rate
  - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, namespace, job, route))
    record: namespace_job_route:loki_request_duration_seconds:99quantile
  - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
      by (le, namespace, job, route))
    record: namespace_job_route:loki_request_duration_seconds:50quantile
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
      / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
    record: namespace_job_route:loki_request_duration_seconds:avg
  - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
      route)
    record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
    record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
  - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
    record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
@@ -1 +1,87 @@
null
groups:
- name: prometheus-operator
  rules:
  - alert: PrometheusOperatorListErrors
    annotations:
      description: Errors while performing List operations in controller {{$labels.controller}}
        in {{$labels.namespace}} namespace.
      summary: Errors while performing list operations in controller.
    expr: |
      (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusOperatorWatchErrors
    annotations:
      description: Errors while performing watch operations in controller {{$labels.controller}}
        in {{$labels.namespace}} namespace.
      summary: Errors while performing watch operations in controller.
    expr: |
      (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusOperatorSyncFailed
    annotations:
      description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
        namespace fails to reconcile {{ $value }} objects.
      summary: Last controller reconciliation failed
    expr: |
      min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOperatorReconcileErrors
    annotations:
      description: '{{ $value | humanizePercentage }} of reconciling operations failed
        for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
      summary: Errors while reconciling controller.
    expr: |
      (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOperatorNodeLookupErrors
    annotations:
      description: Errors while reconciling Prometheus in {{ $labels.namespace }}
        Namespace.
      summary: Errors while reconciling Prometheus.
    expr: |
      rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOperatorNotReady
    annotations:
      description: Prometheus operator in {{ $labels.namespace }} namespace isn't
        ready to reconcile {{ $labels.controller }} resources.
      summary: Prometheus operator not ready
    expr: |
      min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
    for: 5m
    labels:
      severity: warning
  - alert: PrometheusOperatorRejectedResources
    annotations:
      description: Prometheus operator in {{ $labels.namespace }} namespace rejected
        {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }}
        resources.
      summary: Resources rejected by Prometheus operator
    expr: |
      min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
    for: 5m
    labels:
      severity: warning
- name: config-reloaders
  rules:
  - alert: ConfigReloaderSidecarErrors
    annotations:
      description: |-
        Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
        As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
      summary: config-reloader sidecar has not had a successful reload for 10m
    expr: |
      max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
    for: 10m
    labels:
      severity: warning
@@ -1 +1,241 @@
null
groups:
- name: prometheus
  rules:
  - alert: PrometheusBadConfig
    annotations:
      description: Prometheus {{$labels.instance}} has failed to reload its configuration.
      summary: Failed Prometheus configuration reload.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
    for: 10m
    labels:
      severity: critical
  - alert: PrometheusNotificationQueueRunningFull
    annotations:
      description: Alert notification queue of Prometheus {{$labels.instance}} is
        running full.
      summary: Prometheus alert notification queue predicted to run full in less than
        30m.
    expr: |
      # Without min_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
      predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
      >
      min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
    annotations:
      description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
        {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
      summary: Prometheus has encountered more than 1% errors sending alerts to a
        specific Alertmanager.
    expr: |
      (
      rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
      /
      rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusNotConnectedToAlertmanagers
    annotations:
      description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
      summary: Prometheus is not connected to any Alertmanagers.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusTSDBReloadsFailing
    annotations:
      description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
        reload failures over the last 3h.
      summary: Prometheus has issues reloading blocks from disk.
    expr: |
      increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  - alert: PrometheusTSDBCompactionsFailing
    annotations:
      description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
        compaction failures over the last 3h.
      summary: Prometheus has issues compacting blocks.
    expr: |
      increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  - alert: PrometheusNotIngestingSamples
    annotations:
      description: Prometheus {{$labels.instance}} is not ingesting samples.
      summary: Prometheus is not ingesting samples.
    expr: |
      (
      rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
      and
      (
      sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
      or
      sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
      )
      )
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusDuplicateTimestamps
    annotations:
      description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
        samples/s with different values but duplicated timestamp.
      summary: Prometheus is dropping samples with duplicate timestamps.
    expr: |
      rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOutOfOrderTimestamps
    annotations:
      description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
        samples/s with timestamps arriving out of order.
      summary: Prometheus drops samples with out-of-order timestamps.
    expr: |
      rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusRemoteStorageFailures
    annotations:
      description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
        $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
      summary: Prometheus fails to send samples to remote storage.
    expr: |
      (
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
      /
      (
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
      +
      (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
      )
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: critical
  - alert: PrometheusRemoteWriteBehind
    annotations:
      description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
        $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
      summary: Prometheus remote write is behind.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
      max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
      - ignoring(remote_name, url) group_right
      max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
      )
      > 120
    for: 15m
    labels:
      severity: critical
  - alert: PrometheusRemoteWriteDesiredShards
    annotations:
      description: Prometheus {{$labels.instance}} remote write desired shards calculation
        wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
        }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
        $labels.instance | query | first | value }}.
      summary: Prometheus remote write desired shards calculation wants to run more
        than configured max shards.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
      max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
      >
      max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusRuleFailures
    annotations:
      description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
        "%.0f" $value }} rules in the last 5m.
      summary: Prometheus is failing rule evaluations.
    expr: |
      increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
    for: 15m
    labels:
      severity: critical
  - alert: PrometheusMissingRuleEvaluations
    annotations:
      description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
        }} rule group evaluations in the last 5m.
      summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
    expr: |
      increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusTargetLimitHit
    annotations:
      description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
        }} targets because the number of targets exceeded the configured target_limit.
      summary: Prometheus has dropped targets because some scrape configs have exceeded
        the targets limit.
    expr: |
      increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusLabelLimitHit
    annotations:
      description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
        }} targets because some samples exceeded the configured label_limit, label_name_length_limit
        or label_value_length_limit.
      summary: Prometheus has dropped targets because some scrape configs have exceeded
        the labels limit.
    expr: |
      increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusTargetSyncFailure
    annotations:
      description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
        have failed to sync because invalid configuration was supplied.'
      summary: Prometheus has failed to sync targets.
    expr: |
      increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
    for: 5m
    labels:
      severity: critical
  - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
    annotations:
      description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
        from Prometheus {{$labels.instance}} to any Alertmanager.'
      summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
    expr: |
      min without (alertmanager) (
      rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
      /
      rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
      )
      * 100
      > 3
    for: 15m
    labels:
      severity: critical
1443  assets/prometheus/dashboards/prometheus-remote-write.json  (Normal file; file diff suppressed because it is too large)
1074  assets/prometheus/dashboards/prometheus.json  (Normal file; file diff suppressed because it is too large)
|
@ -10,3 +10,227 @@ title: loki
|
|||
Jsonnet source code is available at [github.com/grafana/loki](https://github.com/grafana/loki/tree/master/production/loki-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/loki/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### loki_alerts
|
||||
|
||||
##### LokiRequestErrors
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: LokiRequestErrors
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|
||||
expr: |
|
||||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
|
||||
/
|
||||
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
|
||||
> 10
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### LokiRequestPanics
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: LokiRequestPanics
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
|
||||
expr: |
|
||||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### LokiRequestLatency
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: LokiRequestLatency
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|
||||
expr: |
|
||||
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### LokiTooManyCompactorsRunning
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: LokiTooManyCompactorsRunning
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
|
||||
expr: |
|
||||
sum(loki_boltdb_shipper_compactor_running) by (namespace) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/loki/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### loki_rules
|
||||
|
||||
##### job:loki_request_duration_seconds:99quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:loki_request_duration_seconds:99quantile
|
||||
{{< /code >}}
|
||||
|
||||
##### job:loki_request_duration_seconds:50quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:loki_request_duration_seconds:50quantile
|
||||
{{< /code >}}
|
||||
|
||||
##### job:loki_request_duration_seconds:avg
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
|
||||
by (job)
|
||||
record: job:loki_request_duration_seconds:avg
|
||||
{{< /code >}}
|
||||
|
||||
##### job:loki_request_duration_seconds_bucket:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
|
||||
record: job:loki_request_duration_seconds_bucket:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### job:loki_request_duration_seconds_sum:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
|
||||
record: job:loki_request_duration_seconds_sum:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### job:loki_request_duration_seconds_count:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
|
||||
record: job:loki_request_duration_seconds_count:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### job_route:loki_request_duration_seconds:99quantile

{{< code lang="yaml" >}}
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
{{< /code >}}

##### job_route:loki_request_duration_seconds:50quantile

{{< code lang="yaml" >}}
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
{{< /code >}}

##### job_route:loki_request_duration_seconds:avg

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
  by (job, route)
record: job_route:loki_request_duration_seconds:avg
{{< /code >}}

##### job_route:loki_request_duration_seconds_bucket:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
{{< /code >}}

##### job_route:loki_request_duration_seconds_sum:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
{{< /code >}}

##### job_route:loki_request_duration_seconds_count:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds:99quantile

{{< code lang="yaml" >}}
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds:50quantile

{{< code lang="yaml" >}}
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds:avg

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
  / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds_bucket:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
  route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds_sum:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
{{< /code >}}

##### namespace_job_route:loki_request_duration_seconds_count:sum_rate

{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
{{< /code >}}
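
These recording rules only become queryable series once the pregenerated rules file is loaded by a Prometheus server. A minimal sketch of that wiring, assuming the rules have been saved locally as `/etc/prometheus/rules/loki-rules.yaml` (the path is a placeholder, not part of the mixin):

{{< code lang="yaml" >}}
# prometheus.yml (sketch): load the mixin's recording rules so that series
# such as namespace_job_route:loki_request_duration_seconds:99quantile exist.
rule_files:
  - /etc/prometheus/rules/loki-rules.yaml   # placeholder path
{{< /code >}}

Once evaluated, dashboards and alerts can reference the recorded series directly instead of re-aggregating the underlying histogram buckets on every query.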
## Dashboards

The following dashboards are generated from mixins and hosted on GitHub:

- [loki-chunks](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-chunks.json)
- [loki-deletion](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-deletion.json)
- [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json)
- [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json)
- [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json)
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
- [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json)
- [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json)
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)
- [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json)
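
The dashboard JSON files above can be loaded into Grafana with file-based provisioning. A minimal sketch, assuming the JSON files have been copied to `/var/lib/grafana/dashboards/loki` (provider name and directory are placeholders):

{{< code lang="yaml" >}}
# /etc/grafana/provisioning/dashboards/loki.yaml (sketch)
apiVersion: 1
providers:
  - name: loki-mixin                          # placeholder provider name
    type: file                                # read dashboards from local JSON files
    disableDeletion: false
    options:
      path: /var/lib/grafana/dashboards/loki  # placeholder directory
{{< /code >}}

With a file provider, re-copying the regenerated JSON into that directory is enough for Grafana to pick up dashboard updates.
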
@ -10,3 +10,133 @@ title: prometheus-operator
Jsonnet source code is available at [github.com/prometheus-operator/prometheus-operator](https://github.com/prometheus-operator/prometheus-operator/tree/master/jsonnet/mixin).
{{< /panel >}}

## Alerts

{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus-operator/alerts.yaml).
{{< /panel >}}

### prometheus-operator

##### PrometheusOperatorListErrors

{{< code lang="yaml" >}}
alert: PrometheusOperatorListErrors
annotations:
  description: Errors while performing List operations in controller {{$labels.controller}}
    in {{$labels.namespace}} namespace.
  summary: Errors while performing list operations in controller.
expr: |
  (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorWatchErrors

{{< code lang="yaml" >}}
alert: PrometheusOperatorWatchErrors
annotations:
  description: Errors while performing watch operations in controller {{$labels.controller}}
    in {{$labels.namespace}} namespace.
  summary: Errors while performing watch operations in controller.
expr: |
  (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorSyncFailed

{{< code lang="yaml" >}}
alert: PrometheusOperatorSyncFailed
annotations:
  description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace
    fails to reconcile {{ $value }} objects.
  summary: Last controller reconciliation failed
expr: |
  min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorReconcileErrors

{{< code lang="yaml" >}}
alert: PrometheusOperatorReconcileErrors
annotations:
  description: '{{ $value | humanizePercentage }} of reconciling operations failed
    for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
  summary: Errors while reconciling controller.
expr: |
  (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorNodeLookupErrors

{{< code lang="yaml" >}}
alert: PrometheusOperatorNodeLookupErrors
annotations:
  description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
  summary: Errors while reconciling Prometheus.
expr: |
  rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorNotReady

{{< code lang="yaml" >}}
alert: PrometheusOperatorNotReady
annotations:
  description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready
    to reconcile {{ $labels.controller }} resources.
  summary: Prometheus operator not ready
expr: |
  min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
for: 5m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOperatorRejectedResources

{{< code lang="yaml" >}}
alert: PrometheusOperatorRejectedResources
annotations:
  description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{
    printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
  summary: Resources rejected by Prometheus operator
expr: |
  min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
for: 5m
labels:
  severity: warning
{{< /code >}}

### config-reloaders

##### ConfigReloaderSidecarErrors

{{< code lang="yaml" >}}
alert: ConfigReloaderSidecarErrors
annotations:
  description: |-
    Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
    As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
  summary: config-reloader sidecar has not had a successful reload for 10m
expr: |
  max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
labels:
  severity: warning
{{< /code >}}
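
On Kubernetes, these alerts are usually delivered not as a plain rules file but wrapped in a `PrometheusRule` object that the operator itself reconciles. A minimal sketch with one alert from the group above pasted in; the metadata name, namespace, and labels are placeholders and must match the `ruleSelector` of your Prometheus custom resource:

{{< code lang="yaml" >}}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-operator-mixin   # placeholder name
  namespace: monitoring             # placeholder namespace
  labels:
    role: alert-rules               # must match the Prometheus CR's ruleSelector
spec:
  groups:
    - name: prometheus-operator
      rules:
        - alert: PrometheusOperatorNotReady
          expr: |
            min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
          for: 5m
          labels:
            severity: warning
{{< /code >}}
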
@ -10,3 +10,351 @@ The Prometheus Mixin is a set of configurable, reusable, and extensible alerts a
Jsonnet source code is available at [github.com/prometheus/prometheus](https://github.com/prometheus/prometheus/tree/master/documentation/prometheus-mixin).
{{< /panel >}}

## Alerts

{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/alerts.yaml).
{{< /panel >}}

### prometheus

##### PrometheusBadConfig

{{< code lang="yaml" >}}
alert: PrometheusBadConfig
annotations:
  description: Prometheus {{$labels.instance}} has failed to reload its configuration.
  summary: Failed Prometheus configuration reload.
expr: |
  # Without max_over_time, failed scrapes could create false negatives, see
  # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
  max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
for: 10m
labels:
  severity: critical
{{< /code >}}
##### PrometheusNotificationQueueRunningFull

Prometheus alert notification queue predicted to run full in less than 30m.

{{< code lang="yaml" >}}
alert: PrometheusNotificationQueueRunningFull
annotations:
  description: Alert notification queue of Prometheus {{$labels.instance}} is running
    full.
  summary: Prometheus alert notification queue predicted to run full in less than
    30m.
expr: |
  # Without min_over_time, failed scrapes could create false negatives, see
  # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
  (
    predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
  >
    min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
  )
for: 15m
labels:
  severity: warning
{{< /code >}}
##### PrometheusErrorSendingAlertsToSomeAlertmanagers

{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.

Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.

{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
  description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
    {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
  summary: Prometheus has encountered more than 1% errors sending alerts to a specific
    Alertmanager.
expr: |
  (
    rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
  /
    rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
  )
  * 100
  > 1
for: 15m
labels:
  severity: warning
{{< /code >}}
##### PrometheusNotConnectedToAlertmanagers

{{< code lang="yaml" >}}
alert: PrometheusNotConnectedToAlertmanagers
annotations:
  description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
  summary: Prometheus is not connected to any Alertmanagers.
expr: |
  # Without max_over_time, failed scrapes could create false negatives, see
  # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
  max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusTSDBReloadsFailing

{{< code lang="yaml" >}}
alert: PrometheusTSDBReloadsFailing
annotations:
  description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
    reload failures over the last 3h.
  summary: Prometheus has issues reloading blocks from disk.
expr: |
  increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
  severity: warning
{{< /code >}}

##### PrometheusTSDBCompactionsFailing

{{< code lang="yaml" >}}
alert: PrometheusTSDBCompactionsFailing
annotations:
  description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
    compaction failures over the last 3h.
  summary: Prometheus has issues compacting blocks.
expr: |
  increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
  severity: warning
{{< /code >}}

##### PrometheusNotIngestingSamples

{{< code lang="yaml" >}}
alert: PrometheusNotIngestingSamples
annotations:
  description: Prometheus {{$labels.instance}} is not ingesting samples.
  summary: Prometheus is not ingesting samples.
expr: |
  (
    rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
  and
    (
      sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
    or
      sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
    )
  )
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusDuplicateTimestamps

{{< code lang="yaml" >}}
alert: PrometheusDuplicateTimestamps
annotations:
  description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
    samples/s with different values but duplicated timestamp.
  summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
  rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusOutOfOrderTimestamps

{{< code lang="yaml" >}}
alert: PrometheusOutOfOrderTimestamps
annotations:
  description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
    samples/s with timestamps arriving out of order.
  summary: Prometheus drops samples with out-of-order timestamps.
expr: |
  rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
  severity: warning
{{< /code >}}

##### PrometheusRemoteStorageFailures

{{< code lang="yaml" >}}
alert: PrometheusRemoteStorageFailures
annotations:
  description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value
    }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
  summary: Prometheus fails to send samples to remote storage.
expr: |
  (
    (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
  /
    (
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
    +
      (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
    )
  )
  * 100
  > 1
for: 15m
labels:
  severity: critical
{{< /code >}}

##### PrometheusRemoteWriteBehind

{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteBehind
annotations:
  description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value
    }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
  summary: Prometheus remote write is behind.
expr: |
  # Without max_over_time, failed scrapes could create false negatives, see
  # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
  (
    max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
  - ignoring(remote_name, url) group_right
    max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
  )
  > 120
for: 15m
labels:
  severity: critical
{{< /code >}}

##### PrometheusRemoteWriteDesiredShards

{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteDesiredShards
annotations:
  description: Prometheus {{$labels.instance}} remote write desired shards calculation
    wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
    }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
    $labels.instance | query | first | value }}.
  summary: Prometheus remote write desired shards calculation wants to run more than
    configured max shards.
expr: |
  # Without max_over_time, failed scrapes could create false negatives, see
  # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
  (
    max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
  >
    max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
  )
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusRuleFailures

{{< code lang="yaml" >}}
alert: PrometheusRuleFailures
annotations:
  description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f"
    $value }} rules in the last 5m.
  summary: Prometheus is failing rule evaluations.
expr: |
  increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
  severity: critical
{{< /code >}}

##### PrometheusMissingRuleEvaluations

{{< code lang="yaml" >}}
alert: PrometheusMissingRuleEvaluations
annotations:
  description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
    }} rule group evaluations in the last 5m.
  summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
  increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusTargetLimitHit

{{< code lang="yaml" >}}
alert: PrometheusTargetLimitHit
annotations:
  description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
    }} targets because the number of targets exceeded the configured target_limit.
  summary: Prometheus has dropped targets because some scrape configs have exceeded
    the targets limit.
expr: |
  increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusLabelLimitHit

{{< code lang="yaml" >}}
alert: PrometheusLabelLimitHit
annotations:
  description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
    }} targets because some samples exceeded the configured label_limit, label_name_length_limit
    or label_value_length_limit.
  summary: Prometheus has dropped targets because some scrape configs have exceeded
    the labels limit.
expr: |
  increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
  severity: warning
{{< /code >}}

##### PrometheusTargetSyncFailure

{{< code lang="yaml" >}}
alert: PrometheusTargetSyncFailure
annotations:
  description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
    have failed to sync because invalid configuration was supplied.'
  summary: Prometheus has failed to sync targets.
expr: |
  increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
for: 5m
labels:
  severity: critical
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager

{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.

Prometheus encounters more than 3% errors sending alerts to any Alertmanager.

{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
    Prometheus {{$labels.instance}} to any Alertmanager.'
  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
  min without (alertmanager) (
    rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
  /
    rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
  )
  * 100
  > 3
for: 15m
labels:
  severity: critical
{{< /code >}}
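
Firing alerts still need a notification path. A minimal Alertmanager routing sketch keyed on the `severity` label used by the alerts on this page; the receiver names are placeholders and carry no notification configuration here:

{{< code lang="yaml" >}}
# alertmanager.yml (sketch): page on critical, keep warnings on a default channel.
route:
  receiver: team-default
  routes:
    - matchers:
        - severity="critical"
      receiver: team-pager
receivers:
  - name: team-default               # placeholder receiver
  - name: team-pager                 # placeholder receiver
{{< /code >}}
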
## Dashboards

The following dashboards are generated from mixins and hosted on GitHub:

- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)