1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
adinhodovic 2024-10-31 20:33:07 +01:00
parent 526200ba6e
commit a672deef1c
38 changed files with 14343 additions and 7 deletions

View file

@ -0,0 +1,107 @@
groups:
- name: argo-cd
rules:
# Fires when an Argo CD Application keeps reporting a sync status other than
# "Synced".
- alert: ArgoCdAppOutOfSync
  # Sums argocd_app_info per (job, dest_server, project, name, sync_status)
  # for every series whose sync_status is not "Synced"; any group with a
  # value above zero is a candidate.
  expr: |
    sum(
    argocd_app_info{
    job=~".*",
    sync_status!="Synced"
    }
    ) by (job, dest_server, project, name, sync_status)
    > 0
  # The condition has to hold for 15 minutes before the alert fires.
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: An ArgoCD Application is Out Of Sync.
    description: >-
      The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ $labels.name }}
      is out of sync with the sync status {{ $labels.sync_status }}
      for the past 15m.
    # Deep link into the application overview dashboard, pre-filtered by the
    # alert's dest_server, project and name labels.
    dashboard_url: 'https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name }}'
# Fires when an Argo CD Application reports a health status that is neither
# "Healthy" nor "Progressing".
- alert: ArgoCdAppUnhealthy
  # Groups argocd_app_info by (job, dest_server, project, name,
  # health_status) after dropping the Healthy and Progressing states.
  expr: |
    sum(
    argocd_app_info{
    job=~".*",
    health_status!~"Healthy|Progressing"
    }
    ) by (job, dest_server, project, name, health_status)
    > 0
  # The condition has to hold for 15 minutes before the alert fires.
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: An ArgoCD Application is Unhealthy.
    description: >-
      The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ $labels.name }}
      is unhealthy with the health status {{ $labels.health_status }}
      for the past 15m.
    # Deep link into the application overview dashboard, pre-filtered by the
    # alert's dest_server, project and name labels.
    dashboard_url: 'https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name }}'
# Fires when an Argo CD Application has had automatic sync turned off for
# two hours.
- alert: ArgoCdAppAutoSyncDisabled
  # Selects argocd_app_info series whose autosync_enabled label is not
  # "true" and whose name label is non-empty, grouped per application.
  expr: |
    sum(
    argocd_app_info{
    job=~".*",
    autosync_enabled!="true",
    name!~""
    }
    ) by (job, dest_server, project, name, autosync_enabled)
    > 0
  # The condition has to hold for two hours before the alert fires.
  for: 2h
  labels:
    severity: warning
  annotations:
    summary: An ArgoCD Application has AutoSync Disabled.
    description: >-
      The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ $labels.name }}
      has autosync disabled for the past 2h.
    # Deep link into the application overview dashboard, pre-filtered by the
    # alert's dest_server, project and name labels.
    dashboard_url: 'https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name }}'
# Fires when Argo CD has counted at least one sync whose phase is not
# "Succeeded" for an Application within a 10-minute window.
- alert: ArgoCdAppSyncFailed
  # increase() over 10m of argocd_app_sync_total, restricted to phases other
  # than "Succeeded", rounded to a whole number and summed per
  # (job, dest_server, project, name, phase).
  expr: |
    sum(
    round(
    increase(
    argocd_app_sync_total{
    job=~".*",
    phase!="Succeeded"
    }[10m]
    )
    )
    ) by (job, dest_server, project, name, phase) > 0
  # The condition has to hold for one minute before the alert fires.
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: An ArgoCD Application has Failed to Sync.
    description: >-
      The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ $labels.name }}
      has failed to sync with the status {{ $labels.phase }}
      in the past 10m.
    # Deep link into the application overview dashboard, pre-filtered by the
    # alert's dest_server, project and name labels.
    dashboard_url: 'https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name }}'
# Fires when Argo CD notifications has counted at least one delivery whose
# succeeded label is not "true" within a 10-minute window.
- alert: ArgoCdNotificationDeliveryFailed
  # increase() over 10m of argocd_notifications_deliveries_total for
  # non-successful deliveries, rounded and summed per
  # (job, exported_service, succeeded).
  expr: |
    sum(
    round(
    increase(
    argocd_notifications_deliveries_total{
    job=~".*",
    succeeded!="true"
    }[10m]
    )
    )
    ) by (job, exported_service, succeeded) > 0
  # The condition has to hold for one minute before the alert fires.
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: ArgoCD Notification Delivery Failed.
    description: >-
      The notification job {{ $labels.job }} has failed to deliver to
      {{ $labels.exported_service }} for the past 10m.
    # Deep link into the notifications overview dashboard, pre-filtered by
    # the alert's job and exported_service labels.
    dashboard_url: 'https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{ $labels.job }}&var-exported_service={{ $labels.exported_service }}'

View file

@ -0,0 +1,945 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors ArgoCD with a focus on Application status. It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin). Requires custom configuration to add application badges. Please refer to the mixin.",
"editable": true,
"links": [
{
"tags": [
"ci/cd",
"argo-cd"
],
"targetBlank": true,
"title": "ArgoCD Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary by Cluster, Project",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 5,
"w": 9,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, health_status)\n",
"legendFormat": "{{ dest_server }}/{{ project }} - {{ health_status }}"
}
],
"title": "Application Health Status",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 5,
"w": 9,
"x": 9,
"y": 1
},
"id": 3,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, sync_status)\n",
"legendFormat": "{{ dest_server }}/{{ project }} - {{ sync_status }}"
}
],
"title": "Application Sync Status",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 5,
"w": 9,
"x": 0,
"y": 6
},
"id": 4,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }[$__rate_interval]\n )\n )\n) by (job, dest_server, project, phase)\n",
"legendFormat": "{{ dest_server }}/{{ project }} - {{ phase }}"
}
],
"title": "Application Syncs",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 5,
"w": 9,
"x": 9,
"y": 6
},
"id": 5,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, autosync_enabled)\n",
"legendFormat": "{{ dest_server }}/{{ project }} - {{ autosync_enabled }}"
}
],
"title": "Application Auto Sync Enabled",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"gridPos": {
"h": 10,
"w": 6,
"x": 18,
"y": 1
},
"id": 6,
"options": {
"content": "No applications defined",
"mode": "markdown"
},
"pluginVersion": "v11.1.0",
"title": "Application Badges",
"type": "text"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 18,
"x": 0,
"y": 11
},
"id": 7,
"title": "Applications (Unhealthy/OutOfSync/AutoSyncDisabled) Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "name"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Application",
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "health_status"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.displayMode",
"value": "color-background"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 12
},
"id": 8,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Application"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n health_status!~\"Healthy|Progressing\"\n }\n) by (job, dest_server, project, name, health_status)\n",
"format": "table",
"instant": true
}
],
"title": "Applications Unhealthy",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"dest_server": true,
"job": true
},
"indexByName": {
"health_status": 2,
"name": 0,
"project": 1
},
"renameByName": {
"dest_server": "Cluster",
"health_status": "Health Status",
"job": "Job",
"name": "Application",
"project": "Project"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "name"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Application",
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "sync_status"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.displayMode",
"value": "color-background"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 12
},
"id": 9,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Application"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n sync_status!=\"Synced\"\n }\n) by (job, dest_server, project, name, sync_status) > 0\n",
"format": "table",
"instant": true
}
],
"title": "Applications Out Of Sync",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"dest_server": true,
"job": true
},
"indexByName": {
"name": 0,
"project": 1,
"sync_status": 2
},
"renameByName": {
"dest_server": "Cluster",
"job": "Job",
"name": "Application",
"project": "Project",
"sync_status": "Sync Status"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "name"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Application",
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "Value"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.displayMode",
"value": "color-background"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 18
},
"id": 10,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Application"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n phase!=\"Succeeded\"\n }[7d]\n )\n )\n) by (job, dest_server, project, name, phase) > 0\n",
"format": "table",
"instant": true
}
],
"title": "Applications That Failed to Sync[7d]",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"dest_server": true,
"job": true
},
"indexByName": {
"name": 0,
"phase": 2,
"project": 1
},
"renameByName": {
"Value": "Count",
"dest_server": "Cluster",
"job": "Job",
"name": "Application",
"phase": "Phase",
"project": "Project"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "name"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Application",
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "autosync_enabled"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.displayMode",
"value": "color-background"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 18
},
"id": 11,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Application"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n autosync_enabled!=\"true\"\n }\n) by (job, dest_server, project, name, autosync_enabled) > 0\n",
"format": "table",
"instant": true
}
],
"title": "Applications With Auto Sync Disabled",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"dest_server": true,
"job": true
},
"indexByName": {
"autosync_enabled": 2,
"name": 0,
"project": 1
},
"renameByName": {
"autosync_enabled": "Auto Sync Enabled",
"dest_server": "Cluster",
"job": "Job",
"name": "Application",
"project": "Project"
}
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 23
},
"id": 12,
"title": "Application ($application)",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 24
},
"id": 13,
"interval": "5m",
"options": {
"legend": {
"calcs": [
"last"
],
"displayMode": "table",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, health_status)\n",
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ health_status }}"
}
],
"title": "Application Health Status",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 24
},
"id": 14,
"interval": "5m",
"options": {
"legend": {
"calcs": [
"last"
],
"displayMode": "table",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, sync_status)\n",
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ sync_status }}"
}
],
"title": "Application Sync Status",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 24
},
"id": 15,
"interval": "5m",
"options": {
"legend": {
"calcs": [
"last"
],
"displayMode": "table",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }[$__rate_interval]\n )\n )\n) by (namespace, job, dest_server, project, name, phase)\n",
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ phase }}"
}
],
"title": "Application Sync Result",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"ci/cd",
"argo-cd"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(argocd_app_info{}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(argocd_app_info{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Cluster",
"multi": true,
"name": "cluster",
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\"}, dest_server)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Project",
"multi": true,
"name": "project",
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\"}, project)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Application",
"multi": true,
"name": "application",
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\", project=~\"$project\"}, name)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "ArgoCD / Application / Overview",
"uid": "argo-cd-application-overview-kask"
}

View file

@ -0,0 +1,198 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors ArgoCD notifications. It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin).",
"editable": true,
"links": [
{
"tags": [
"ci/cd",
"argo-cd"
],
"targetBlank": true,
"title": "ArgoCD Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n round(\n increase(\n argocd_notifications_deliveries_total{\n namespace=~'$namespace',\njob=~'$job',\n\n exported_service=~\"$exported_service\",\n }[$__rate_interval]\n )\n )\n) by (job, exported_service, succeeded)\n",
"legendFormat": "{{ exported_service }} - Succeeded: {{ succeeded }}"
}
],
"title": "Notification Deliveries",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 1
},
"id": 3,
"options": {
"legend": {
"calcs": [
"last",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n round(\n increase(\n argocd_notifications_trigger_eval_total{\n namespace=~'$namespace',\njob=~'$job',\n\n }[$__rate_interval]\n )\n )\n) by (job, name, triggered)\n",
"legendFormat": "{{ name }} - Triggered: {{ triggered }}"
}
],
"title": "Trigger Evaluations",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"ci/cd",
"argo-cd"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(argocd_notifications_deliveries_total{}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"allValue": ".*",
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Notifications Service",
"multi": true,
"name": "exported_service",
"query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\", job=~\"$job\"}, exported_service)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-2d",
"to": "now"
},
"timezone": "utc",
"title": "ArgoCD / Notifications / Overview",
"uid": "argo-cd-notifications-overview-kask"
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
null

76
assets/celery/alerts.yaml Normal file
View file

@ -0,0 +1,76 @@
groups:
- name: celery
rules:
# Fires when more than 5% of a Celery task's finished executions (failed +
# succeeded) over the last 10 minutes were failures.
- alert: CeleryTaskHighFailRate
  # failed / (failed + succeeded) * 100, where each term is the 10m
  # increase() summed per (job, namespace, queue_name, name). Series whose
  # queue_name or name label is "None" are excluded from every term.
  expr: |
    sum(
    increase(
    celery_task_failed_total{
    job=~".*celery.*",
    queue_name!~"None",
    name!~"None"
    }[10m]
    )
    ) by (job, namespace, queue_name, name)
    /
    (
    sum(
    increase(
    celery_task_failed_total{
    job=~".*celery.*",
    queue_name!~"None",
    name!~"None"
    }[10m]
    )
    ) by (job, namespace, queue_name, name)
    +
    sum(
    increase(
    celery_task_succeeded_total{
    job=~".*celery.*",
    queue_name!~"None",
    name!~"None"
    }[10m]
    )
    ) by (job, namespace, queue_name, name)
    )
    * 100 > 5
  # The condition has to hold for one minute before the alert fires.
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Celery high task fail rate.
    description: >-
      More than 5% of tasks failed for the task
      {{ $labels.job }}/{{ $labels.queue_name }}/{{ $labels.name }}
      in the past 10m.
    # Deep link into the per-task dashboard, pre-filtered by the alert's
    # job, queue_name and name labels.
    dashboard_url: 'https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name }}'
# Fires when a Celery queue holds more than 100 tasks for 20 minutes.
- alert: CeleryHighQueueLength
  # celery_queue_length summed per (job, namespace, queue_name), skipping
  # series whose queue_name label is "None".
  expr: |
    sum(
    celery_queue_length{
    job=~".*celery.*",
    queue_name!~"None"
    }
    ) by (job, namespace, queue_name)
    > 100
  # The condition has to hold for 20 minutes before the alert fires.
  for: 20m
  labels:
    severity: warning
  annotations:
    summary: Celery high queue length.
    description: >-
      More than 100 tasks in the queue
      {{ $labels.job }}/{{ $labels.queue_name }}
      for the past 20m.
    # Deep link into the tasks overview dashboard, pre-filtered by the
    # alert's job and queue_name labels. The query string starts directly
    # with the first parameter (no empty leading "&" segment).
    dashboard_url: 'https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}'
# Fires when a Celery worker's celery_worker_up gauge has read 0 for
# 15 minutes.
- alert: CeleryWorkerDown
  # No aggregation here, so every label of the matching series (including
  # the hostname used in the description) is carried onto the alert.
  expr: |
    celery_worker_up{job=~".*celery.*"} == 0
  # The condition has to hold for 15 minutes before the alert fires.
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: A Celery worker is offline.
    description: 'The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.'
    # Deep link into the tasks overview dashboard, pre-filtered by the
    # alert's job label. The query string starts directly with the first
    # parameter (no empty leading "&" segment).
    dashboard_url: 'https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?var-job={{ $labels.job }}'

View file

@ -0,0 +1,590 @@
{
"description": "A dashboard that monitors Celery. It is created using the Celery-mixin for the [Celery-exporter](https://github.com/danihodovic/celery-exporter)",
"editable": true,
"links": [
{
"tags": [
"celery",
"celery-mixin"
],
"targetBlank": true,
"title": "Celery Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Tasks",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"noValue": 0,
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Success Rate"
},
"properties": [
{
"id": "unit",
"value": "percentunit"
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 16,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"desc": true,
"displayName": "Succeeded"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n/(sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n+sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n) > -1\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
"format": "table",
"instant": true
}
],
"title": "Task Stats",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"indexByName": {
"Value #A": 1,
"Value #B": 2,
"Value #C": 3,
"Value #D": 4,
"Value #E": 5,
"Value #F": 6,
"Value #G": 7,
"Value #H": 8,
"name": 0
},
"renameByName": {
"Value #A": "Success Rate",
"Value #B": "Succeeded",
"Value #C": "Failed",
"Value #D": "Sent",
"Value #E": "Received",
"Value #F": "Rejected",
"Value #G": "Retried",
"Value #H": "Revoked",
"name": "Name"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"id": 3,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"desc": true,
"displayName": "Value"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n ) by (name, exception) > 0\n)\n",
"format": "table",
"instant": true
}
],
"title": "Task Exceptions",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value": 2,
"exception": 1,
"name": 0
},
"renameByName": {
"exception": "Exception",
"name": "Task"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 9
},
"id": 4,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Succeeded - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Failed - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Sent - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Received - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Retried - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Revoked - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
"legendFormat": "Rejected - {{ name }}"
}
],
"title": "Tasks Completed",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 17
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name, exception) > 0\n",
"legendFormat": "{{ name }}/{{ exception }}"
}
],
"title": "Task Exceptions",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "P50"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "P95"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "P99"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 25
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
"legendFormat": "P50 - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
"legendFormat": "P95 - {{ name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
"legendFormat": "P99 - {{ name }}"
}
],
"title": "Tasks Runtime",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"celery",
"celery-mixin"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"query": "label_values(celery_worker_up{}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Job",
"multi": false,
"name": "job",
"query": "label_values(celery_worker_up{namespace=\"$namespace\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Queue Name",
"multi": false,
"name": "queue_name",
"query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", name!~\"None\"}, queue_name)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Task",
"multi": true,
"name": "task",
"query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", queue_name=~\"$queue_name\", name!~\"None\"}, name)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-2d",
"to": "now"
},
"timezone": "utc",
"title": "Celery / Tasks / By Task",
"uid": "celery-tasks-by-task-32s3"
}

File diff suppressed because it is too large Load diff

1
assets/celery/rules.yaml Normal file
View file

@ -0,0 +1 @@
null

97
assets/django/alerts.yaml Normal file
View file

@ -0,0 +1,97 @@
groups:
- name: django
rules:
- alert: DjangoMigrationsUnapplied
annotations:
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}
description: The job {{ $labels.job }} has unapplied migrations.
summary: Django has unapplied migrations.
expr: |
sum(
django_migrations_unapplied_total{
job=~"django"
}
) by (namespace, job)
> 0
for: 15m
labels:
severity: warning
- alert: DjangoDatabaseException
annotations:
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}
description: The job {{ $labels.job }} has hit the database exception {{ $labels.type
}}.
summary: Django database exception.
expr: |
sum (
increase(
django_db_errors_total{
job=~"django"
}[10m]
)
) by (type, namespace, job)
> 0
labels:
severity: info
- alert: DjangoHighHttp4xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
description: More than 5% of HTTP requests had status 4xx for {{ $labels.job }}/{{
$labels.view }} in the past 5m.
summary: Django high HTTP 4xx error rate.
expr: |
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
status=~"^4.*",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
/
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
* 100 > 5
for: 1m
labels:
severity: warning
- alert: DjangoHighHttp5xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
description: More than 5% of HTTP requests had status 5xx for {{ $labels.job }}/{{
$labels.view }} in the past 5m.
summary: Django high HTTP 5xx error rate.
expr: |
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
status=~"^5.*",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
/
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
* 100 > 5
for: 1m
labels:
severity: warning

View file

@ -0,0 +1,741 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Django which focuses on giving an overview of the system (requests, db, cache). It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).",
"editable": true,
"links": [
{
"tags": [
"django",
"django-mixin"
],
"targetBlank": true,
"title": "Django Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "reqps"
}
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n }[$__rate_interval]\n )\n ), 0.001\n)\n"
}
],
"title": "Request Volume",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "ops"
}
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 1
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n rate (\n django_db_execute_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n )\n) by (namespace, job)\n"
}
],
"title": "Database Ops",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 1
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n rate (\n django_cache_get_hits_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n/\nsum (\n rate (\n django_cache_get_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n"
}
],
"title": "Cache Hitrate [30m]",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "percent"
}
},
"unit": "reqps"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "2xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "3xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "blue",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "4xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "5xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 5
},
"id": 5,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "2xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "3xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "4xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "5xx"
}
],
"title": "Responses",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 11
},
"id": 6,
"title": "Database",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 0,
"y": 12
},
"id": 7,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max (\n django_migrations_applied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n"
}
],
"title": "Migrations Applied",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 6,
"y": 12
},
"id": 8,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max (\n django_migrations_unapplied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n"
}
],
"title": "Migrations Unapplied",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 15
},
"id": 9,
"options": {
"sortBy": [
{
"displayName": "Type"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n topk(10,\n sum by (type) (\n increase(\n django_db_errors_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[1w]\n ) > 0\n )\n )\n)\n",
"format": "table",
"instant": true
}
],
"title": "Top Database Errors (1w)",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"indexByName": {
"job": 1,
"namespace": 0,
"type": 2
},
"renameByName": {
"job": "Job",
"namespace": "Namespace",
"type": "Type"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 12
},
"id": 10,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n django_db_new_connections_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, vendor)\n)\n",
"legendFormat": "{{ vendor }}"
}
],
"title": "Database Connections",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 18
},
"id": 11,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
"legendFormat": "50 - {{ vendor }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
"legendFormat": "95 - {{ vendor }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
"legendFormat": "99 - {{ vendor }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.999,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
"legendFormat": "99.9 - {{ vendor }}"
}
],
"title": "Database Latency",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 24
},
"id": 12,
"title": "Cache",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "percent"
}
},
"unit": "ops"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 25
},
"id": 13,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n django_cache_get_hits_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n",
"legendFormat": "Hit - {{ backend }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n django_cache_get_misses_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n",
"legendFormat": "Miss - {{ backend }}"
}
],
"title": "Cache Get",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"django",
"django-mixin"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Job",
"multi": false,
"name": "job",
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "Django / Overview",
"uid": "django-overview-jkwq"
}

View file

@ -0,0 +1,673 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Django which focuses on breaking down requests by view. It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).",
"editable": true,
"links": [
{
"tags": [
"django",
"django-mixin"
],
"targetBlank": true,
"title": "Django Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0.90000000000000002
},
{
"color": "yellow",
"value": 0.94999999999999996
},
{
"color": "green",
"value": 0.98999999999999999
}
]
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n status!~\"[4-5].*\"\n }[1w]\n )\n) /\nsum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[1w]\n )\n)\n"
}
],
"title": "Success Rate (non 4xx-5xx responses) [1w]",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 1
},
{
"color": "yellow",
"value": 10
},
{
"color": "red",
"value": 100
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 1
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (view) (\n increase(\n django_http_exceptions_total_by_view_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n }[1w]\n ) > 0\n)\n"
}
],
"title": "HTTP Exceptions [1w]",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 1000
},
{
"color": "red",
"value": 2000
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 1
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n"
}
],
"title": "Average Request Latency (P50) [1w]",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 2500
},
{
"color": "red",
"value": 5000
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 1
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n"
}
],
"title": "Average Request Latency (P95) [1w]",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 5
},
"id": 6,
"title": "Request & Responses",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
},
"id": 7,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\"\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "reqps"
}
],
"title": "Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "percent"
}
},
"unit": "reqps"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "2xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "3xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "blue",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "4xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "5xx"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 6
},
"id": 8,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "2xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "3xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "4xx"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
"legendFormat": "5xx"
}
],
"title": "Responses",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 14
},
"id": 9,
"title": "Latency & Status Codes",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 15
},
"id": 10,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, status, method), 0.001\n)\n",
"legendFormat": "{{ view }} / {{ status }} / {{ method }}"
}
],
"title": "Responses Status Codes",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 15
},
"id": 11,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
"legendFormat": "50 - {{ view }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
"legendFormat": "95 - {{ view }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
"legendFormat": "99 - {{ view }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.999,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
"legendFormat": "99.9 - {{ view }}"
}
],
"title": "Request Latency",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"django",
"django-mixin"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Namespace",
"multi": false,
"name": "namespace",
"query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Job",
"multi": false,
"name": "job",
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "View",
"multi": false,
"name": "view",
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\"}, view)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Method",
"multi": true,
"name": "method",
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view=~\"$view\"}, method)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "Django / Requests / By View",
"uid": "django-requests-by-view-jkwq"
}

File diff suppressed because it is too large Load diff

1
assets/django/rules.yaml Normal file
View file

@ -0,0 +1 @@
null

View file

@ -0,0 +1,42 @@
groups:
- name: nginx.rules
rules:
- alert: NginxConfigReloadFailed
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{
$labels.job }}&var-controller_class={{ $labels.controller_class }}
description: Nginx config reload failed for the controller with the class {{
$labels.controller_class }}.
summary: Nginx config reload failed.
expr: |
sum(
nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"}
) by (job, controller_class)
== 0
for: 5m
labels:
severity: warning
- alert: NginxHighHttp4xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
description: More than 5% of HTTP requests had status 4xx for {{ $labels.exported_namespace
}}/{{ $labels.ingress }} in the past 5m.
summary: Nginx high HTTP 4xx error rate.
expr: |
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
for: 1m
labels:
severity: info
- alert: NginxHighHttp5xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
description: More than 5% of HTTP requests had status 5xx for {{ $labels.exported_namespace
}}/{{ $labels.ingress }} in the past 5m.
summary: Nginx high HTTP 5xx error rate.
expr: |
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
for: 1m
labels:
severity: warning

View file

@ -0,0 +1,812 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Ingress-nginx. It is created using the (Ingress-Nginx-mixin)[https://github.com/adinhodovic/ingress-nginx-mixin]",
"editable": true,
"links": [
{
"tags": [
"ingress-nginx",
"ingress-nginx-mixin"
],
"targetBlank": true,
"title": "Ingress Nginx Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Controller",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.001
}
]
},
"unit": "reqps"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n namespace=~\"$namespace\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\"\n }[$__rate_interval]\n )\n ), 0.001\n)\n"
}
],
"title": "Controller Request Volume",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 1
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n avg_over_time(\n nginx_ingress_controller_nginx_process_connections{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n"
}
],
"title": "Controller Connections",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "yellow",
"value": 0.94999999999999996
},
{
"color": "green",
"value": 0.98999999999999999
}
]
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 1
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n exported_namespace=~\"$exported_namespace\",\n namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n"
}
],
"title": "Controller Success Rate (non $error_codes-xx responses)",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 4,
"w": 3,
"x": 18,
"y": 1
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "avg(\n irate(\n nginx_ingress_controller_success{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n) * 60\n"
}
],
"title": "Config Reloads",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 1
}
]
},
"unit": "bool"
}
},
"gridPos": {
"h": 4,
"w": 3,
"x": 21,
"y": 1
},
"id": 6,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "count(\n nginx_ingress_controller_config_last_reload_successful{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_namespace=~\"$namespace\"\n } == 0\n) OR vector(0)\n"
}
],
"title": "Last Config Failed",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 5
},
"id": 7,
"title": "Ingress",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n ingress=~\"$ingress\",\n exported_namespace=~\"$exported_namespace\"\n }[$__rate_interval]\n )\n ) by (ingress, exported_namespace), 0.001\n)\n",
"legendFormat": "{{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Ingress Request Volume",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 6
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by (ingress, exported_namespace)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (ingress, exported_namespace)\n",
"legendFormat": "{{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Ingress Success Rate (non $error_codes-xx responses)",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "dtdurations"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Ingress"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Ingress",
"type": "dashboard",
"url": "/d/ingress-nginx-request-handling-jqkw/ingress-nginx-overview?var-exported_namespace=${__data.fields.Namespace}&var-ingress=${__data.fields.Ingress}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "IN"
},
"properties": [
{
"id": "unit",
"value": "binBps"
}
]
},
{
"matcher": {
"id": "byName",
"options": "OUT"
},
"properties": [
{
"id": "unit",
"value": "binBps"
}
]
}
]
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 14
},
"id": 10,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"desc": true,
"displayName": "P50 Latency"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.50, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.90, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.99, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n irate(\n nginx_ingress_controller_request_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n irate(\n nginx_ingress_controller_response_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n",
"format": "table",
"instant": true
}
],
"title": "Ingress Percentile Response Times and Transfer Rates",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value #A": 2,
"Value #B": 3,
"Value #C": 4,
"Value #D": 5,
"Value #E": 6,
"exported_namespace": 0,
"ingress": 1
},
"renameByName": {
"Value #A": "P50 Latency",
"Value #B": "P95 Latency",
"Value #C": "P99 Latency",
"Value #D": "IN",
"Value #E": "OUT",
"exported_namespace": "Namespace",
"ingress": "Ingress",
"job": "Job"
}
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 24
},
"id": 11,
"title": "Certificates",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Host"
},
"properties": [
{
"id": "links",
"value": [
{
"targetBlank": true,
"title": "Go To Site",
"type": "link",
"url": "https://${__data.fields.Host}"
}
]
}
]
},
{
"matcher": {
"id": "byName",
"options": "TTL"
},
"properties": [
{
"id": "custom.cellOptions",
"value": {
"type": "color-text"
}
},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1814400
}
]
}
}
]
}
]
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 25
},
"id": 12,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"desc": false,
"displayName": "TTL"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "avg(\n nginx_ingress_controller_ssl_expire_time_seconds{\n job=~\"$job\",\n pod=~\"$controller\"\n }\n) by (host) - time()\n",
"format": "table",
"instant": true
}
],
"title": "Ingress Certificate Expiry",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true
},
"indexByName": {
"Value": 1,
"host": 0
},
"renameByName": {
"Value": "TTL",
"host": "Host"
}
}
}
],
"type": "table"
}
],
"schemaVersion": 39,
"tags": [
"ingress-nginx",
"ingress-nginx-mixin"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Job",
"multi": false,
"name": "job",
"query": "label_values(nginx_ingress_controller_config_hash{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Controller Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\"}, controller_namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Controller Class",
"multi": true,
"name": "controller_class",
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\"}, controller_class)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Controller",
"multi": true,
"name": "controller",
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\", controller_class=~\"$controller_class\"}, controller_pod)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Ingress Namespace",
"multi": true,
"name": "exported_namespace",
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\"}, exported_namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Ingress",
"multi": true,
"name": "ingress",
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\"}, ingress)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"allValue": "4-5",
"current": {
"selected": false,
"text": [
"All"
],
"value": [
"$__all"
]
},
"description": "4 represents all 4xx codes, 5 represents all 5xx codes",
"includeAll": true,
"label": "Error Codes",
"multi": true,
"name": "error_codes",
"options": [
{
"selected": true,
"text": "4",
"value": "4"
},
{
"selected": false,
"text": "5",
"value": "5"
}
],
"query": "4 : 4,5 : 5",
"type": "custom"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Ingress Nginx / Overview",
"uid": "ingress-nginx-overview-12mk"
}

View file

@ -0,0 +1,594 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Ingress-nginx. It is created using the (Ingress-Nginx-mixin)[https://github.com/adinhodovic/ingress-nginx-mixin]",
"editable": true,
"links": [
{
"tags": [
"ingress-nginx",
"ingress-nginx-mixin"
],
"targetBlank": true,
"title": "Ingress Nginx Dashboards",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Ingress Response Times",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Total Request Time",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 1
},
"id": 3,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Upstream Response Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 7
},
"id": 4,
"title": "Ingress Paths",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "reqps"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Request Volume",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 8
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(\n .5,\n sum by (le, path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Median upstream response time",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 14
},
"id": 7,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Response error rate",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 14
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_response_duration_seconds_sum{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Upstream time consumed",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "reqps"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 20
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by(path, ingress, exported_namespace, status)\n",
"legendFormat": "{{ status }} {{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Response error volume",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "decbytes"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 20
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n/\nsum (\n rate(\n nginx_ingress_controller_response_size_count {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n",
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
}
],
"title": "Average response size",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"ingress-nginx",
"ingress-nginx-mixin"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Job",
"multi": false,
"name": "job",
"query": "label_values(nginx_ingress_controller_config_hash{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Ingress Namespace",
"multi": true,
"name": "exported_namespace",
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\"}, exported_namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": false,
"label": "Ingress",
"multi": true,
"name": "ingress",
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", exported_namespace=~\"$exported_namespace\"}, ingress)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"allValue": "4-5",
"current": {
"selected": false,
"text": [
"All"
],
"value": [
"$__all"
]
},
"description": "4 represents all 4xx codes, 5 represents all 5xx codes",
"includeAll": true,
"label": "Error Codes",
"multi": true,
"name": "error_codes",
"options": [
{
"selected": true,
"text": "4",
"value": "4"
},
{
"selected": false,
"text": "5",
"value": "5"
}
],
"query": "4 : 4,5 : 5",
"type": "custom"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Ingress Nginx / Request Handling Performance",
"uid": "ingress-nginx-request-handling-jqkw"
}

View file

@ -0,0 +1 @@
# Empty rule file: the list of rule groups contains no entries.
groups: []

View file

@ -0,0 +1,73 @@
groups:
- name: karpenter
rules:
# Warns when the Karpenter cloud provider error counter increased within the
# 5m window, grouped by namespace, job, provider, controller and method, and
# the condition held for 5 minutes.
- alert: KarpenterCloudProviderErrors
  expr: |
    sum(
      increase(
        karpenter_cloudprovider_errors_total{
          job=~"karpenter"
        }[5m]
      )
    ) by (namespace, job, provider, controller, method) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Karpenter has Cloud Provider Errors.
    description: >-
      The Karpenter provider {{ $labels.provider }} with the controller
      {{ $labels.controller }} has errors with the method
      {{ $labels.method }}.
    dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance
# Warns when a node pool's usage of a resource type exceeds 75% of its
# configured limit for 15 minutes.
- alert: KarpenterNodepoolNearCapacity
  # usage / limit * 100 is evaluated left to right, i.e. (usage / limit) * 100.
  expr: |
    sum (
      karpenter_nodepools_usage{job=~"karpenter"}
    ) by (namespace, job, nodepool, resource_type)
    /
    sum (
      karpenter_nodepools_limit{job=~"karpenter"}
    ) by (namespace, job, nodepool, resource_type)
    * 100 > 75
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Karpenter Nodepool near capacity.
    description: >-
      The resource {{ $labels.resource_type }} in the Karpenter node pool
      {{ $labels.nodepool }} is nearing its limit.
      Consider scaling or adding resources.
    dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview
- name: cluster-autoscaler
rules:
# Warns when the cluster autoscaler's node count exceeds 75% of its maximum
# node count (per namespace and job) for 15 minutes.
- alert: ClusterAutoscalerNodeCountNearCapacity
  # count / max * 100 is evaluated left to right, i.e. (count / max) * 100.
  expr: |
    sum (
      cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"}
    ) by (namespace, job)
    /
    sum (
      cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"}
    ) by (namespace, job)
    * 100 > 75
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Cluster Autoscaler Node Count near Capacity.
    description: >-
      The node count for the cluster autoscaler job {{ $labels.job }}
      is reaching max limit. Consider scaling node groups.
    dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
# Warns when the cluster autoscaler reports a non-zero unschedulable pod count
# (per namespace and job) for 15 minutes.
- alert: ClusterAutoscalerUnschedulablePods
  expr: |
    sum (
      cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"}
    ) by (namespace, job)
    > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Pods Pending Scheduling - Cluster Node Group Scaling Required
    description: >-
      The cluster currently has unschedulable pods, indicating resource
      shortages. Consider adding more nodes or increasing node group
      capacity.
    dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler

View file

@ -0,0 +1,643 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview of the cluster autoscaler. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"cluster-autoscaler"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling / Cluster Autoscaler",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n"
}
],
"title": "Total Nodes",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 3,
"y": 1
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_max_nodes_count{\n job=~\"$job\"\n }\n )\n)\n"
}
],
"title": "Max Nodes",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 6,
"y": 1
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_node_groups_count{\n job=~\"$job\"\n }\n )\n)\n"
}
],
"title": "Node Groups",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "percent"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 9,
"y": 1
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\",\n state=\"ready\"\n }\n ) /\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) * 100\n)\n"
}
],
"title": "Healthy Nodes",
"type": "gauge"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "No"
},
"1": {
"color": "green",
"text": "Yes"
}
},
"type": "value"
}
],
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 12,
"y": 1
},
"id": 6,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_cluster_safe_to_autoscale{\n job=~\"$job\"\n }\n )\n)\n"
}
],
"title": "Safe To Scale",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 15,
"y": 1
},
"id": 7,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }\n )\n)\n"
}
],
"title": "Unscheduled Pods",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 18,
"y": 1
},
"id": 8,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleDown\"\n }\n)\n"
}
],
"title": "Last Scale Down",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 3,
"w": 3,
"x": 21,
"y": 1
},
"id": 9,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleUp\"\n }\n)\n"
}
],
"title": "Last Scale Up",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"id": 10,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }[2m]\n )\n ) by (type)\n)\n",
"legendFormat": "{{ type }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_evicted_pods_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
"legendFormat": "Evicted Pods"
}
],
"title": "Pod Activity",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"id": 11,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) by (state)\n)\n",
"legendFormat": "{{ state }}"
}
],
"title": "Node Activity",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 12
},
"id": 12,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n",
"legendFormat": "Total Nodes"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n cluster_autoscaler_unneeded_nodes_count{\n job=~\"$job\"\n }\n )\n)\n",
"legendFormat": "Unneeded Nodes"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_up_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
"legendFormat": "Scaled Up Nodes"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_down_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
"legendFormat": "Scaled Down Nodes"
}
],
"title": "Autoscaling Activity",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"cluster-autoscaler"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(cluster_autoscaler_last_activity{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Cluster Autoscaler",
"uid": "kubernetes-autoscaling-mixin-ca-jkwq"
}

View file

@ -0,0 +1,507 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview of horizontal pod autoscalers. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
}
],
"title": "Desired Replicas",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 6,
"y": 1
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
}
],
"title": "Current Replicas",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 12,
"y": 1
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
}
],
"title": "Min Replicas",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 18,
"y": 1
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
}
],
"title": "Max Replicas",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 6
},
"id": 6,
"options": {
"sortBy": [
{
"displayName": "Horizontal Pod Autoscaler"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
"format": "table",
"instant": true
}
],
"title": "Metric Targets",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value #A": 4,
"horizontalpodautoscaler": 1,
"metric_name": 2,
"metric_target_type": 3,
"namespace": 0
},
"renameByName": {
"Value #A": "Threshold",
"horizontalpodautoscaler": "Horizontal Pod Autoscaler",
"metric_name": "Metric Name",
"metric_target_type": "Metric Target Type",
"namespace": "Namespace"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 12
},
"id": 7,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
"legendFormat": "Utilization / {{ metric_name }}"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
"legendFormat": "Threshold / {{ metric_name }}"
}
],
"title": "Utilization & Threshold",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 18
},
"id": 8,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
"legendFormat": "Desired Replicas"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
"legendFormat": "Current Replicas"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
"legendFormat": "Min Replicas"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
"legendFormat": "Max Replicas"
}
],
"title": "Replicas",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{job=~\"$job\"}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Horizontal Pod Autoscaler",
"name": "hpa",
"query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\"},horizontalpodautoscaler)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Metric Name",
"multi": true,
"name": "metric_name",
"query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\", horizontalpodautoscaler=\"$hpa\"}, metric_name)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Horizontal Pod Autoscaler",
"uid": "kubernetes-autoscaling-mixin-hpa-jkwq"
}

View file

@ -0,0 +1,482 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"karpenter"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling / Karpenter",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Node Pool Activity",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_nodes_created_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n",
"interval": "1m",
"legendFormat": "{{ nodepool }}"
}
],
"title": "Nodes Created by Node Pool",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 1
},
"id": 3,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_nodes_terminated_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n",
"interval": "1m",
"legendFormat": "{{ nodepool }}"
}
],
"title": "Nodes Terminated by Node Pool",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 7
},
"id": 4,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_decisions_total{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (decision, reason, consolidation_type)\n)\n",
"interval": "1m",
"legendFormat": "{{ decision }} - {{ reason }} - {{ consolidation_type }}"
}
],
"title": "Node Disruption Decisions by Reason, Decision, and Consolidation Type",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 7
},
"id": 5,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_eligible_nodes{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (reason)\n)\n",
"interval": "1m",
"legendFormat": "{{ reason }}"
}
],
"title": "Nodes Eligible for Disruption by Reason",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 13
},
"id": 6,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_nodeclaims_disrupted_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool, capacity_type, reason)\n)\n",
"interval": "1m",
"legendFormat": "{{ nodepool }} - {{ capacity_type }} - {{ reason }}"
}
],
"title": "Nodes Disrupted by Node Pool",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 19
},
"id": 7,
"title": "Pod Activity",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 20
},
"id": 8,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n karpenter_pods_state{\n job=~\"$job\"\n }\n ) by (phase)\n)\n",
"interval": "1m",
"legendFormat": "{{ phase }}"
}
],
"title": "Pods by Phase",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 20
},
"id": 9,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P99"
}
],
"title": "Pods Startup Duration",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"karpenter"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(karpenter_nodes_allocatable{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Node Pool",
"multi": true,
"name": "nodepool",
"query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Karpenter / Activity",
"uid": "kubernetes-autoscaling-mixin-kact-jkwq"
}

View file

@ -0,0 +1,839 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Karpenter and focuses on Karpenter performance. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"karpenter"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling / Karpenter",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "No"
},
"1": {
"color": "green",
"text": "Yes"
}
},
"type": "value"
}
],
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n karpenter_cluster_state_synced{\n job=~\"$job\",\n }\n) by (job)\n"
}
],
"title": "Cluster State Synced",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 0,
"y": 4
},
"id": 3,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n karpenter_cluster_state_node_count{\n job=~\"$job\",\n }\n) by (job)\n"
}
],
"title": "Cluster State Node Count",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 18,
"x": 6,
"y": 1
},
"id": 4,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n increase(\n karpenter_cloudprovider_errors_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n ) by (job, provider, controller, method)\n)\n",
"interval": "1m",
"legendFormat": "{{ provider }} - {{ controller }} - {{ method }}"
}
],
"title": "Cloud Provider Errors",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 7
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P99"
}
],
"title": "Node Termination Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 7
},
"id": 6,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
"interval": "1m",
"legendFormat": "P99"
}
],
"title": "Pods Startup Duration",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 13
},
"id": 7,
"title": "Interruption Queue",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 14
},
"id": 8,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n increase(\n karpenter_interruption_received_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, message_type)\n",
"legendFormat": "{{ message_type }}"
}
],
"title": "Interruption Received Messages",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 14
},
"id": 9,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n increase(\n karpenter_interruption_deleted_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job)\n",
"legendFormat": "Deleted Messages"
}
],
"title": "Interruption Deleted Messages",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 14
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P99"
}
],
"title": "Interruption Duration",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 20
},
"id": 11,
"title": "Work Queue",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 21
},
"id": 12,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n karpenter_workqueue_depth{\n job=~\"$job\"\n }\n) by (job)\n",
"legendFormat": "Queue Depth"
}
],
"title": "Work Queue Depth",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 21
},
"id": 13,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P99"
}
],
"title": "Work Queue In Queue Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 10,
"spanNulls": false
},
"unit": "s"
}
},
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 21
},
"id": 14,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P50"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P95"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
"legendFormat": "P99"
}
],
"title": "Work Queue Work Duration",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 27
},
"id": 15,
"title": "Controller",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"fillOpacity": 100,
"spanNulls": false,
"stacking": {
"mode": "value"
}
},
"unit": "reqps"
}
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 28
},
"id": 16,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n rate(\n controller_runtime_reconcile_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, controller) > 0\n",
"legendFormat": "{{ controller }}"
}
],
"title": "Controller Reconcile",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"karpenter"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(karpenter_nodes_allocatable{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Node Pool",
"multi": true,
"name": "nodepool",
"query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Karpenter / Performance",
"uid": "kubernetes-autoscaling-mixin-kperf-jkwq"
}

View file

@ -0,0 +1,568 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Kubernetes and focuses on giving a overview for pod disruption budgets. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "$namespace Namespace Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Disruptions Allowed"
},
"properties": [
{
"id": "custom.cellOptions",
"value": {
"type": "color-text"
}
},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
}
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"sortBy": [
{
"displayName": "Pod Disruption Budget"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
"format": "table",
"instant": true
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
"format": "table",
"instant": true
}
],
"title": "Summary",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value #A": 2,
"Value #B": 3,
"Value #C": 4,
"Value #D": 5,
"namespace": 0,
"poddisruptionbudget": 1
},
"renameByName": {
"Value #A": "Disruptions Allowed",
"Value #B": "Desired Healthy",
"Value #C": "Currently Healthy",
"Value #D": "Expected Pods",
"namespace": "Namespace",
"poddisruptionbudget": "Pod Disruption Budget"
}
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 8
},
"id": 3,
"title": "$pdb Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 0,
"y": 9
},
"id": 4,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
}
],
"title": "Disruptions Allowed",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 6,
"y": 9
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
}
],
"title": "Desired Healthy",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 12,
"y": 9
},
"id": 6,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
}
],
"title": "Currently Healthy",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 0.10000000000000001
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 3,
"w": 6,
"x": 18,
"y": 9
},
"id": 7,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
}
],
"title": "Expected Pods",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Currently Healthy"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Disruptions Allowed"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Desired Healthy"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Expected Pods"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "blue",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 12
},
"id": 8,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Last *",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
"legendFormat": "Disruptions Allowed"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
"legendFormat": "Desired Healthy"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
"legendFormat": "Currently Healthy"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
"legendFormat": "Expected Pods"
}
],
"title": "Status",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\"}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Pod Disruption Budget",
"name": "pdb",
"query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\", namespace=~\"$namespace\"}, poddisruptionbudget)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Pod Disruption Budget",
"uid": "kubernetes-autoscaling-mixin-pdb-jkwq"
}

View file

@ -0,0 +1,895 @@
{
"__inputs": [ ],
"__requires": [ ],
"description": "A dashboard that monitors Kubernetes and focuses on giving a overview for vertical pod autoscalers. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
"editable": true,
"links": [
{
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"targetBlank": true,
"title": "Kubernetes / Autoscaling",
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"title": "$namespace Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "CPU Lower Bound"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "dark-red",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "CPU Target"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "CPU Upper Bound"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 1
},
"id": 2,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Vertical Pod Autoscaler"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "CPU Lower Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "CPU Target"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "CPU Upper Bound"
}
],
"title": "CPU Resource Recommendations",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value #A": 4,
"Value #B": 5,
"Value #C": 6,
"container": 2,
"namespace": 0,
"resource": 3,
"verticalpodautoscaler": 1
},
"renameByName": {
"Value #A": "CPU Lower Bound",
"Value #B": "CPU Target",
"Value #C": "CPU Upper Bound",
"container": "Container",
"namespace": "Namespace",
"resource": "Resource",
"verticalpodautoscaler": "Vertical Pod Autoscaler"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Memory Lower Bound"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "dark-red",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Target"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "yellow",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Upper Bound"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
},
{
"id": "custom.cellOptions",
"value": {
"mode": "basic",
"type": "color-background"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 9
},
"id": 3,
"options": {
"footer": {
"enablePagination": true
},
"sortBy": [
{
"displayName": "Vertical Pod Autoscaler"
}
]
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "Memory Lower Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "Memory Target"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"format": "table",
"instant": true,
"legendFormat": "Memory Upper Bound"
}
],
"title": "Memory Resource Recommendations",
"transformations": [
{
"id": "merge"
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"job": true
},
"indexByName": {
"Value #A": 4,
"Value #B": 5,
"Value #C": 6,
"container": 2,
"namespace": 0,
"resource": 3,
"verticalpodautoscaler": 1
},
"renameByName": {
"Value #A": "Memory Lower Bound",
"Value #B": "Memory Target",
"Value #C": "Memory Upper Bound",
"container": "Container",
"namespace": "Namespace",
"resource": "Resource",
"verticalpodautoscaler": "Vertical Pod Autoscaler"
}
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 17
},
"id": 4,
"repeat": "container",
"title": "$vpa / $container Summary",
"type": "row"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "yellow",
"value": 0
}
]
},
"unit": "short"
}
},
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 18
},
"id": 5,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "CPU Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "CPU Limits"
}
],
"title": "CPU Guaranteed QoS",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "CPU Requests"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "CPU Limits"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 18
},
"id": 6,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "CPU Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "CPU Limits"
}
],
"title": "CPU Burstable QoS",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{
"color": "yellow",
"value": 0
}
]
},
"unit": "bytes"
}
},
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 18
},
"id": 7,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "Memory Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "Memory Limits"
}
],
"title": "Memory Guaranteed QoS",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Memory Requests"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "red",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Memory Limits"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 18
},
"id": 8,
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "Memory Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "Memory Limits"
}
],
"title": "Memory Burstable QoS",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "short"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 26
},
"id": 9,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Lower Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Target"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Upper Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Usage"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Limits"
}
],
"title": "VPA CPU Recommendations Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"custom": {
"spanNulls": false
},
"unit": "bytes"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 26
},
"id": 10,
"options": {
"legend": {
"calcs": [
"lastNotNull",
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "v11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Lower Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Target"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
"legendFormat": "{{ container }} - Upper Bound"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n container_memory_working_set_bytes{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa(.+)\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Usage"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Requests"
},
{
"datasource": {
"type": "prometheus",
"uid": "$datasource"
},
"expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n",
"legendFormat": "{{ container }} - Limits"
}
],
"title": "VPA Memory Recommendations Over Time",
"type": "timeseries"
}
],
"schemaVersion": 39,
"tags": [
"kubernetes",
"autoscaling",
"kubernetes-autoscaling-mixin",
"kubernetes-core"
],
"templating": {
"list": [
{
"label": "Data source",
"name": "datasource",
"query": "prometheus",
"type": "datasource"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Job",
"name": "job",
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "Namespace",
"multi": true,
"name": "namespace",
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\"}, namespace)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"label": "VPA Pod Autoscaler",
"name": "vpa",
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\", namespace=~\"$namespace\"}, verticalpodautoscaler)",
"refresh": 2,
"sort": 1,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"includeAll": true,
"label": "Container",
"multi": true,
"name": "container",
"query": "label_values(kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{job=~\"$job\", namespace=~\"$namespace\", verticalpodautoscaler=~\"$vpa\"}, container)",
"refresh": 2,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timezone": "utc",
"title": "Kubernetes / Autoscaling / Vertical Pod Autoscaler",
"uid": "kubernetes-autoscaling-mixin-vpa-jkwq"
}

View file

@ -0,0 +1 @@
null

View file

@ -516,6 +516,31 @@
"name": "gitea",
"source": "https://github.com/go-gitea/gitea",
"subdir": "contrib/gitea-monitoring-mixin"
},
{
"name": "django",
"source": "https://github.com/adinhodovic/django-mixin",
"subdir": ""
},
{
"name": "celery",
"source": "https://github.com/danihodovic/celery-exporter",
"subdir": "celery-mixin"
},
{
"name": "argo-cd-2",
"source": "https://github.com/adinhodovic/argo-cd-mixin",
"subdir": ""
},
{
"name": "ingress-nginx-mixin",
"source": "https://github.com/adinhodovic/ingress-nginx-mixin",
"subdir": ""
},
{
"name": "kubernetes-autoscaling",
"source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin",
"subdir": ""
}
]
}

View file

@ -0,0 +1,156 @@
---
title: argo-cd-2
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/adinhodovic/argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/alerts.yaml).
{{< /panel >}}
### argo-cd
##### ArgoCdAppOutOfSync
{{< code lang="yaml" >}}
alert: ArgoCdAppOutOfSync
annotations:
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
}}
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
$labels.name }} is out of sync with the sync status {{ $labels.sync_status }}
for the past 15m.
summary: An ArgoCD Application is Out Of Sync.
expr: |
sum(
argocd_app_info{
job=~".*",
sync_status!="Synced"
}
) by (job, dest_server, project, name, sync_status)
> 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### ArgoCdAppUnhealthy
{{< code lang="yaml" >}}
alert: ArgoCdAppUnhealthy
annotations:
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
}}
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
$labels.name }} is unhealthy with the health status {{ $labels.health_status }}
for the past 15m.
summary: An ArgoCD Application is Unhealthy.
expr: |
sum(
argocd_app_info{
job=~".*",
health_status!~"Healthy|Progressing"
}
) by (job, dest_server, project, name, health_status)
> 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### ArgoCdAppAutoSyncDisabled
{{< code lang="yaml" >}}
alert: ArgoCdAppAutoSyncDisabled
annotations:
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
}}
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
$labels.name }} has autosync disabled for the past 2h.
summary: An ArgoCD Application has AutoSync Disabled.
expr: |
sum(
argocd_app_info{
job=~".*",
autosync_enabled!="true",
name!~""
}
) by (job, dest_server, project, name, autosync_enabled)
> 0
for: 2h
labels:
severity: warning
{{< /code >}}
##### ArgoCdAppSyncFailed
{{< code lang="yaml" >}}
alert: ArgoCdAppSyncFailed
annotations:
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
}}
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
$labels.name }} has failed to sync with the status {{ $labels.phase }} the past
10m.
summary: An ArgoCD Application has Failed to Sync.
expr: |
sum(
round(
increase(
argocd_app_sync_total{
job=~".*",
phase!="Succeeded"
}[10m]
)
)
) by (job, dest_server, project, name, phase) > 0
for: 1m
labels:
severity: warning
{{< /code >}}
##### ArgoCdNotificationDeliveryFailed
{{< code lang="yaml" >}}
alert: ArgoCdNotificationDeliveryFailed
annotations:
dashboard_url: https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{
$labels.job }}&var-exported_service={{ $labels.exported_service }}
description: The notification job {{ $labels.job }} has failed to deliver to {{
$labels.exported_service }} for the past 10m.
summary: ArgoCD Notification Delivery Failed.
expr: |
sum(
round(
increase(
argocd_notifications_deliveries_total{
job=~".*",
succeeded!="true"
}[10m]
)
)
) by (job, exported_service, succeeded) > 0
for: 1m
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [argo-cd-application-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-application-overview.json)
- [argo-cd-notifications-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json)
- [argo-cd-operational-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json)

View file

@ -0,0 +1,114 @@
---
title: celery
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/danihodovic/celery-exporter](https://github.com/danihodovic/celery-exporter/tree/master/celery-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/celery/alerts.yaml).
{{< /panel >}}
### celery
##### CeleryTaskHighFailRate
{{< code lang="yaml" >}}
alert: CeleryTaskHighFailRate
annotations:
dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{
$labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name
}}
description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name
}}/{{ $labels.name }} the past 10m.
summary: Celery high task fail rate.
expr: |
sum(
increase(
celery_task_failed_total{
job=~".*celery.*",
queue_name!~"None",
name!~"None"
}[10m]
)
) by (job, namespace, queue_name, name)
/
(
sum(
increase(
celery_task_failed_total{
job=~".*celery.*",
queue_name!~"None",
name!~"None"
}[10m]
)
) by (job, namespace, queue_name, name)
+
sum(
increase(
celery_task_succeeded_total{
job=~".*celery.*",
queue_name!~"None",
name!~"None"
}[10m]
)
) by (job, namespace, queue_name, name)
)
* 100 > 5
for: 1m
labels:
severity: warning
{{< /code >}}
##### CeleryHighQueueLength
{{< code lang="yaml" >}}
alert: CeleryHighQueueLength
annotations:
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
$labels.job }}&var-queue_name={{ $labels.queue_name }}
description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name
}} the past 20m.
summary: Celery high queue length.
expr: |
sum(
celery_queue_length{
job=~".*celery.*",
queue_name!~"None"
}
) by (job, namespace, queue_name)
> 100
for: 20m
labels:
severity: warning
{{< /code >}}
##### CeleryWorkerDown
{{< code lang="yaml" >}}
alert: CeleryWorkerDown
annotations:
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
$labels.job }}
description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.
summary: A Celery worker is offline.
expr: |
celery_worker_up{job=~".*celery.*"} == 0
for: 15m
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [celery-tasks-by-task](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-by-task.json)
- [celery-tasks-overview](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-overview.json)

View file

@ -68,10 +68,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
Following dashboards are generated from mixins and hosted on github:
- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json)
- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json)
- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json)
- [cilium-agent](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent.json)
- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json)
- [cilium-api](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-api.json)
- [cilium-bpf](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-bpf.json)
- [cilium-conntrack](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-conntrack.json)
@ -80,6 +78,8 @@ Following dashboards are generated from mixins and hosted on github:
- [cilium-fqdn-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-fqdn-proxy.json)
- [cilium-identities](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-identities.json)
- [cilium-kubernetes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-kubernetes.json)
- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json)
- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json)
- [cilium-network](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-network.json)
- [cilium-nodes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-nodes.json)
- [cilium-operator](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-operator.json)

View file

@ -2491,8 +2491,8 @@ Following dashboards are generated from mixins and hosted on github:
- [alertmanager](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/alertmanager.json)
- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json)
- [cortex-compactor](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor.json)
- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json)
- [cortex-config](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-config.json)
- [cortex-object-store](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-object-store.json)
- [cortex-queries](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-queries.json)

View file

@ -0,0 +1,141 @@
---
title: django
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/adinhodovic/django-mixin](https://github.com/adinhodovic/django-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/django/alerts.yaml).
{{< /panel >}}
### django
##### DjangoMigrationsUnapplied
{{< code lang="yaml" >}}
alert: DjangoMigrationsUnapplied
annotations:
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}
description: The job {{ $labels.job }} has unapplied migrations.
summary: Django has unapplied migrations.
expr: |
sum(
django_migrations_unapplied_total{
job=~"django"
}
) by (namespace, job)
> 0
for: 15m
labels:
severity: warning
{{< /code >}}
##### DjangoDatabaseException
{{< code lang="yaml" >}}
alert: DjangoDatabaseException
annotations:
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}
description: The job {{ $labels.job }} has hit the database exception {{ $labels.type
}}.
summary: Django database exception.
expr: |
sum (
increase(
django_db_errors_total{
job=~"django"
}[10m]
)
) by (type, namespace, job)
> 0
labels:
severity: info
{{< /code >}}
##### DjangoHighHttp4xxErrorRate
{{< code lang="yaml" >}}
alert: DjangoHighHttp4xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
description: More than 5% HTTP requests with status 4xx for {{ $labels.job }}/{{
$labels.view }} the past 5m.
summary: Django high HTTP 4xx error rate.
expr: |
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
status=~"^4.*",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
/
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
* 100 > 5
for: 1m
labels:
severity: warning
{{< /code >}}
##### DjangoHighHttp5xxErrorRate
{{< code lang="yaml" >}}
alert: DjangoHighHttp5xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
description: More than 5% HTTP requests with status 5xx for {{ $labels.job }}/{{
$labels.view }} the past 5m.
summary: Django high HTTP 5xx error rate.
expr: |
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
status=~"^5.*",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
/
sum(
rate(
django_http_responses_total_by_status_view_method_total{
job=~"django",
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
}[5m]
)
) by (namespace, job, view)
* 100 > 5
for: 1m
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [django-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-overview.json)
- [django-requests-by-view](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-by-view.json)
- [django-requests-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-overview.json)

View file

@ -0,0 +1,80 @@
---
title: ingress-nginx-mixin
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/adinhodovic/ingress-nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/alerts.yaml).
{{< /panel >}}
### nginx.rules
##### NginxConfigReloadFailed
{{< code lang="yaml" >}}
alert: NginxConfigReloadFailed
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{
$labels.job }}&var-controller_class={{ $labels.controller_class }}
description: Nginx config reload failed for the controller with the class {{ $labels.controller_class
}}.
summary: Nginx config reload failed.
expr: |
sum(
nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"}
) by (job, controller_class)
== 0
for: 5m
labels:
severity: warning
{{< /code >}}
##### NginxHighHttp4xxErrorRate
{{< code lang="yaml" >}}
alert: NginxHighHttp4xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace
}}/{{ $labels.ingress }} the past 5m.
summary: Nginx high HTTP 4xx error rate.
expr: |
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
for: 1m
labels:
severity: info
{{< /code >}}
##### NginxHighHttp5xxErrorRate
{{< code lang="yaml" >}}
alert: NginxHighHttp5xxErrorRate
annotations:
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace
}}/{{ $labels.ingress }} the past 5m.
summary: Nginx high HTTP 5xx error rate.
expr: |
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
for: 1m
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [ingress-nginx-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json)
- [ingress-nginx-request-handling-performance](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json)

View file

@ -0,0 +1,120 @@
---
title: kubernetes-autoscaling
---
## Overview
{{< panel style="danger" >}}
Jsonnet source code is available at [github.com/adinhodovic/kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin)
{{< /panel >}}
## Alerts
{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/alerts.yaml).
{{< /panel >}}
### karpenter
##### KarpenterCloudProviderErrors
{{< code lang="yaml" >}}
alert: KarpenterCloudProviderErrors
annotations:
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance
description: The Karpenter provider {{ $labels.provider }} with the controller {{
$labels.controller }} has errors with the method {{ $labels.method }}.
summary: Karpenter has Cloud Provider Errors.
expr: |
sum(
increase(
karpenter_cloudprovider_errors_total{
job=~"karpenter"
}[5m]
)
) by (namespace, job, provider, controller, method) > 0
for: 5m
labels:
severity: warning
{{< /code >}}
##### KarpenterNodepoolNearCapacity
{{< code lang="yaml" >}}
alert: KarpenterNodepoolNearCapacity
annotations:
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview
description: The resource {{ $labels.resource_type }} in the Karpenter node pool
{{ $labels.nodepool }} is nearing its limit. Consider scaling or adding resources.
summary: Karpenter Nodepool near capacity.
expr: |
sum (
karpenter_nodepools_usage{job=~"karpenter"}
) by (namespace, job, nodepool, resource_type)
/
sum (
karpenter_nodepools_limit{job=~"karpenter"}
) by (namespace, job, nodepool, resource_type)
* 100 > 75
for: 15m
labels:
severity: warning
{{< /code >}}
### cluster-autoscaler
##### ClusterAutoscalerNodeCountNearCapacity
{{< code lang="yaml" >}}
alert: ClusterAutoscalerNodeCountNearCapacity
annotations:
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
description: The node count for the cluster autoscaler job {{ $labels.job }} is
reaching max limit. Consider scaling node groups.
summary: Cluster Autoscaler Node Count near Capacity.
expr: |
sum (
cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"}
) by (namespace, job)
/
sum (
cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"}
) by (namespace, job)
* 100 > 75
for: 15m
labels:
severity: warning
{{< /code >}}
##### ClusterAutoscalerUnschedulablePods
{{< code lang="yaml" >}}
alert: ClusterAutoscalerUnschedulablePods
annotations:
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
description: The cluster currently has unschedulable pods, indicating resource shortages.
Consider adding more nodes or increasing node group capacity.
summary: Pods Pending Scheduling - Cluster Node Group Scaling Required
expr: |
sum (
cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"}
) by (namespace, job)
> 0
for: 15m
labels:
severity: warning
{{< /code >}}
## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:
- [kubernetes-autoscaling-mixin-ca](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json)
- [kubernetes-autoscaling-mixin-hpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json)
- [kubernetes-autoscaling-mixin-karpenter-act](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json)
- [kubernetes-autoscaling-mixin-karpenter-over](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json)
- [kubernetes-autoscaling-mixin-karpenter-perf](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json)
- [kubernetes-autoscaling-mixin-pdb](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json)
- [kubernetes-autoscaling-mixin-vpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json)

View file

@ -281,8 +281,8 @@ Following dashboards are generated from mixins and hosted on github:
- [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json)
- [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json)
- [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json)
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
- [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json)
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
- [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json)
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)
- [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json)
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)

View file

@ -433,5 +433,5 @@ labels:
Following dashboards are generated from mixins and hosted on github:
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)

View file

@ -516,6 +516,31 @@
"name": "gitea",
"source": "https://github.com/go-gitea/gitea",
"subdir": "contrib/gitea-monitoring-mixin"
},
{
"name": "django",
"source": "https://github.com/adinhodovic/django-mixin",
"subdir": ""
},
{
"name": "celery",
"source": "https://github.com/danihodovic/celery-exporter",
"subdir": "celery-mixin"
},
{
"name": "argo-cd-2",
"source": "https://github.com/adinhodovic/argo-cd-mixin",
"subdir": ""
},
{
"name": "ingress-nginx-mixin",
"source": "https://github.com/adinhodovic/ingress-nginx-mixin",
"subdir": ""
},
{
"name": "kubernetes-autoscaling",
"source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin",
"subdir": ""
}
]
}