
assets,site/content: daily assets regeneration

github-actions[bot] 2021-12-28 03:25:10 +00:00
parent 53269c316f
commit 313c81e897
19 changed files with 21291 additions and 4 deletions


@@ -1 +1,41 @@
null
groups:
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: 15m
labels:
severity: critical
- alert: LokiTooManyCompactorsRunning
annotations:
message: |
{{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace) > 1
for: 5m
labels:
severity: warning

File diff suppressed because it is too large


@@ -0,0 +1,544 @@
{
"annotations": {
"list": [ ]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"links": [
{
"asDropdown": true,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"loki"
],
"targetBlank": false,
"title": "Loki Dashboards",
"type": "dashboards"
}
],
"refresh": "10s",
"rows": [
{
"collapse": false,
"height": "100px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"format": "none",
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(loki_compactor_pending_delete_requests_count{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "70,80",
"timeFrom": null,
"timeShift": null,
"title": "Number of Pending Requests",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "singlestat",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"format": "dtdurations",
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "max(loki_compactor_oldest_pending_delete_request_age_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "70,80",
"timeFrom": null,
"timeShift": null,
"title": "Oldest Pending Request Age",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "singlestat",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Headlines",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(loki_compactor_delete_requests_received_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "received",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Delete Requests Received / Day",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(loki_compactor_delete_requests_processed_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[1d]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "processed",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Delete Requests Processed / Day",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Churn",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 5,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(loki_compactor_load_pending_requests_attempts_total{status=\"fail\", cluster=~\"$cluster\", namespace=~\"$namespace\"}[1h]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "failures",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Failures in Loading Delete Requests / Hour",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Failures",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"loki"
],
"templating": {
"list": [
{
"current": {
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data Source",
"name": "datasource",
"options": [ ],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"current": {
"text": "prod",
"value": "prod"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [ ],
"query": "label_values(loki_build_info, cluster)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "prod",
"value": "prod"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [ ],
"query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)",
"refresh": 1,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
"tags": [ ],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "utc",
"title": "Loki / Deletion",
"uid": "deletion",
"version": 0
}

File diff suppressed because it is too large


@@ -0,0 +1,657 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [ ],
"type": "dashboard"
},
"type": "dashboard"
},
{
"datasource": "${datasource}",
"enable": false,
"expr": "sum by (tenant) (changes(loki_ruler_wal_prometheus_tsdb_wal_truncations_total{tenant=~\"${tenant}\"}[$__rate_interval]))",
"iconColor": "red",
"name": "WAL Truncations",
"target": {
"queryType": "Azure Monitor",
"refId": "Anno"
},
"titleFormat": "{{tenant}}"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1635347545534,
"links": [ ],
"liveNow": false,
"panels": [
{
"datasource": "${datasource}",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [ ],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 2,
"x": 0,
"y": 0
},
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.0-38205pre",
"targets": [
{
"datasource": "${datasource}",
"exemplar": false,
"expr": "sum(loki_ruler_wal_appender_ready) by (pod, tenant) == 0",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "Appenders Not Ready",
"type": "stat"
},
{
"datasource": "${datasource}",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 11,
"x": 2,
"y": 0
},
"id": 4,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "sum(rate(loki_ruler_wal_samples_appended_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "Samples Appended to WAL per Second",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Series are unique combinations of labels",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 11,
"x": 13,
"y": 0
},
"id": 5,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "sum(rate(loki_ruler_wal_storage_created_series_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "Series Created per Second",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Difference between highest timestamp appended to WAL and highest timestamp successfully written to remote storage",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 10
},
"id": 6,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds{tenant=~\"${tenant}\"}\n- on (tenant)\n (\n loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds{tenant=~\"${tenant}\"}\n or vector(0)\n )",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "Write Behind",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 10
},
"id": 7,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "sum(rate(loki_ruler_wal_prometheus_remote_storage_samples_total{tenant=~\"${tenant}\"}[$__rate_interval])) by (tenant) > 0",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "Samples Sent per Second",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "\n",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 20
},
"id": 8,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "sum by (tenant) (loki_ruler_wal_disk_size{tenant=~\"${tenant}\"})",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "WAL Disk Size",
"type": "timeseries"
},
{
"datasource": "${datasource}",
"description": "Some number of pending samples is expected, but if remote-write is failing this value will remain high",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [ ],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [ ]
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 20
},
"id": 9,
"options": {
"legend": {
"calcs": [ ],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": "${datasource}",
"exemplar": true,
"expr": "max(loki_ruler_wal_prometheus_remote_storage_samples_pending{tenant=~\"${tenant}\"}) by (tenant,pod) > 0",
"interval": "",
"legendFormat": "{{tenant}}",
"refId": "A"
}
],
"title": "Pending Samples",
"type": "timeseries"
}
],
"schemaVersion": 31,
"style": "dark",
"tags": [ ],
"templating": {
"list": [
{
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [ ],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": null,
"datasource": "${datasource}",
"definition": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": "Tenant",
"multi": true,
"name": "tenant",
"options": [ ],
"query": {
"query": "label_values(loki_ruler_wal_samples_appended_total, tenant)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": { },
"timezone": "",
"title": "Recording Rules",
"uid": "2xKA_ZK7k",
"version": 9,
"weekStart": ""
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1 +1,49 @@
null
groups:
- name: loki_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job)
record: job:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
record: job:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
record: job:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job, route)
record: job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate


@@ -1 +1,87 @@
null
groups:
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed
for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
for: 5m
labels:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected
{{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }}
resources.
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
for: 5m
labels:
severity: warning
- name: config-reloaders
rules:
- alert: ConfigReloaderSidecarErrors
annotations:
description: |-
Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
summary: config-reloader sidecar has not had a successful reload for 10m
expr: |
max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
labels:
severity: warning


@@ -1 +1,241 @@
null
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.instance}} has failed to reload its configuration.
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is
running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a
specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.instance}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f"
$value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f"
$value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf
"%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because some samples exceeded the configured label_limit, label_name_length_limit
or label_value_length_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
have failed to sync because invalid configuration was supplied.'
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
for: 5m
labels:
severity: critical
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -10,3 +10,227 @@ title: loki
Jsonnet source code is available at [github.com/grafana/loki](https://github.com/grafana/loki/tree/master/production/loki-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/loki/alerts.yaml).
{{< /panel >}}
### loki_alerts
##### LokiRequestErrors
{{< code lang="yaml" >}}
alert: LokiRequestErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
{{< /code >}}
##### LokiRequestPanics
{{< code lang="yaml" >}}
alert: LokiRequestPanics
annotations:
message: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
{{< /code >}}
##### LokiRequestLatency
{{< code lang="yaml" >}}
alert: LokiRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: 15m
labels:
severity: critical
{{< /code >}}
##### LokiTooManyCompactorsRunning
{{< code lang="yaml" >}}
alert: LokiTooManyCompactorsRunning
annotations:
message: |
{{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace) > 1
for: 5m
labels:
severity: warning
{{< /code >}}
## Recording rules
{{< panel style="warning" >}}
The complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/loki/rules.yaml).
{{< /panel >}}
### loki_rules
##### job:loki_request_duration_seconds:99quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:99quantile
{{< /code >}}
##### job:loki_request_duration_seconds:50quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:50quantile
{{< /code >}}
##### job:loki_request_duration_seconds:avg
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job)
record: job:loki_request_duration_seconds:avg
{{< /code >}}
##### job:loki_request_duration_seconds_bucket:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
record: job:loki_request_duration_seconds_bucket:sum_rate
{{< /code >}}
##### job:loki_request_duration_seconds_sum:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
record: job:loki_request_duration_seconds_sum:sum_rate
{{< /code >}}
##### job:loki_request_duration_seconds_count:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds_count:sum_rate
{{< /code >}}
##### job_route:loki_request_duration_seconds:99quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
{{< /code >}}
##### job_route:loki_request_duration_seconds:50quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
{{< /code >}}
##### job_route:loki_request_duration_seconds:avg
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job, route)
record: job_route:loki_request_duration_seconds:avg
{{< /code >}}
##### job_route:loki_request_duration_seconds_bucket:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
{{< /code >}}
##### job_route:loki_request_duration_seconds_sum:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
{{< /code >}}
##### job_route:loki_request_duration_seconds_count:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds:99quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds:50quantile
{{< code lang="yaml" >}}
expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds:avg
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds_sum:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
{{< /code >}}
##### namespace_job_route:loki_request_duration_seconds_count:sum_rate
{{< code lang="yaml" >}}
expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
{{< /code >}}
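Both of the files above are plain Prometheus rule files. Note that the `LokiRequestLatency` alert evaluates the `namespace_job_route:loki_request_duration_seconds:99quantile` series, so the alerts only fire correctly when the recording rules are loaded alongside them. A minimal sketch of wiring both into a Prometheus server (the file paths are illustrative, not part of the mixin):
{{< code lang="yaml" >}}
# Illustrative prometheus.yml fragment; the /etc/prometheus/loki/ paths are assumptions.
rule_files:
  - /etc/prometheus/loki/rules.yaml   # recording rules, e.g. namespace_job_route:loki_request_duration_seconds:99quantile
  - /etc/prometheus/loki/alerts.yaml  # alert rules such as LokiRequestErrors and LokiRequestLatency
{{< /code >}}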
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [loki-chunks](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-chunks.json)
- [loki-deletion](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-deletion.json)
- [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json)
- [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json)
- [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json)
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
- [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json)
- [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json)
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)
- [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json)
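These dashboards are plain Grafana JSON. One way to load them is a file-based provisioning provider pointed at a directory containing the JSON files; a minimal sketch (the provider name and paths are assumptions):
{{< code lang="yaml" >}}
# Illustrative Grafana provisioning file, e.g. provisioning/dashboards/loki.yaml;
# the provider name and dashboard directory are assumptions.
apiVersion: 1
providers:
  - name: loki-mixin
    folder: Loki
    type: file
    options:
      # Directory holding the JSON dashboards listed above.
      path: /var/lib/grafana/dashboards/loki
{{< /code >}}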


@@ -10,3 +10,133 @@ title: prometheus-operator
Jsonnet source code is available at [github.com/prometheus-operator/prometheus-operator](https://github.com/prometheus-operator/prometheus-operator/tree/master/jsonnet/mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus-operator/alerts.yaml).
{{< /panel >}}
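Since these alerts target the Prometheus Operator, they are often loaded as a `PrometheusRule` custom resource rather than a raw rule file. A minimal sketch wrapping the first alert below (the resource name and labels are assumptions; the expression is taken verbatim from the generated file):
{{< code lang="yaml" >}}
# Hypothetical PrometheusRule manifest; metadata.name and labels are assumptions.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-operator-mixin
  labels:
    role: alert-rules
spec:
  groups:
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorListErrors
      annotations:
        summary: Errors while performing list operations in controller.
      expr: |
        (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
      for: 15m
      labels:
        severity: warning
{{< /code >}}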
### prometheus-operator
##### PrometheusOperatorListErrors
{{< code lang="yaml" >}}
alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorWatchErrors
{{< code lang="yaml" >}}
alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorSyncFailed
{{< code lang="yaml" >}}
alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace
fails to reconcile {{ $value }} objects.
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorReconcileErrors
{{< code lang="yaml" >}}
alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations failed
for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorNodeLookupErrors
{{< code lang="yaml" >}}
alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorNotReady
{{< code lang="yaml" >}}
alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready
to reconcile {{ $labels.controller }} resources.
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
for: 5m
labels:
severity: warning
{{< /code >}}
##### PrometheusOperatorRejectedResources
{{< code lang="yaml" >}}
alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{
printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
for: 5m
labels:
severity: warning
{{< /code >}}
### config-reloaders
##### ConfigReloaderSidecarErrors
{{< code lang="yaml" >}}
alert: ConfigReloaderSidecarErrors
annotations:
description: |-
Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
summary: config-reloader sidecar has not had a successful reload for 10m
expr: |
max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
labels:
severity: warning
{{< /code >}}


@@ -10,3 +10,351 @@ The Prometheus Mixin is a set of configurable, reusable, and extensible alerts a
Jsonnet source code is available at [github.com/prometheus/prometheus](https://github.com/prometheus/prometheus/tree/master/documentation/prometheus-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/alerts.yaml).
{{< /panel >}}
### prometheus
##### PrometheusBadConfig
{{< code lang="yaml" >}}
alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.instance}} has failed to reload its configuration.
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
for: 10m
labels:
severity: critical
{{< /code >}}
##### PrometheusNotificationQueueRunningFull
{{< code lang="yaml" >}}
alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.instance}} is running
full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusErrorSendingAlertsToSomeAlertmanagers
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
Prometheus has encountered more than 1% errors sending alerts to a specific
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
{{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusNotConnectedToAlertmanagers
{{< code lang="yaml" >}}
alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusTSDBReloadsFailing
{{< code lang="yaml" >}}
alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
{{< /code >}}
##### PrometheusTSDBCompactionsFailing
{{< code lang="yaml" >}}
alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}}
compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
for: 4h
labels:
severity: warning
{{< /code >}}
##### PrometheusNotIngestingSamples
{{< code lang="yaml" >}}
alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.instance}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
)
)
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusDuplicateTimestamps
{{< code lang="yaml" >}}
alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusOutOfOrderTimestamps
{{< code lang="yaml" >}}
alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }}
samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
for: 10m
labels:
severity: warning
{{< /code >}}
##### PrometheusRemoteStorageFailures
{{< code lang="yaml" >}}
alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value
}}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
{{< /code >}}
##### PrometheusRemoteWriteBehind
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value
}}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
)
> 120
for: 15m
labels:
severity: critical
{{< /code >}}
##### PrometheusRemoteWriteDesiredShards
{{< code lang="yaml" >}}
alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.instance}} remote write desired shards calculation
wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url
}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than
configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
)
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusRuleFailures
{{< code lang="yaml" >}}
alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f"
$value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: critical
{{< /code >}}
##### PrometheusMissingRuleEvaluations
{{< code lang="yaml" >}}
alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value
}} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusTargetLimitHit
{{< code lang="yaml" >}}
alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because the number of targets exceeded the configured target_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusLabelLimitHit
{{< code lang="yaml" >}}
alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value
}} targets because some samples exceeded the configured label_limit, label_name_length_limit
or label_value_length_limit.
summary: Prometheus has dropped targets because some scrape configs have exceeded
the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### PrometheusTargetSyncFailure
{{< code lang="yaml" >}}
alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}}
have failed to sync because invalid configuration was supplied.'
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
for: 5m
labels:
severity: critical
{{< /code >}}
##### PrometheusErrorSendingAlertsToAnyAlertmanager
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
{{< code lang="yaml" >}}
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from
Prometheus {{$labels.instance}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
{{< /code >}}
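Every alert above carries a `severity` label of either `warning` or `critical`. A minimal Alertmanager routing sketch that pages only on the critical ones (receiver names are assumptions):
{{< code lang="yaml" >}}
# Illustrative Alertmanager route; receiver names are assumptions.
route:
  receiver: team-default
  group_by: ['alertname', 'instance']
  routes:
    - matchers:
        - severity = critical
      receiver: team-pager
receivers:
  - name: team-default
  - name: team-pager
{{< /code >}}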
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)