diff --git a/assets/argo-cd-2/alerts.yaml b/assets/argo-cd-2/alerts.yaml new file mode 100644 index 0000000..adc909d --- /dev/null +++ b/assets/argo-cd-2/alerts.yaml @@ -0,0 +1,107 @@ +groups: +- name: argo-cd + rules: + - alert: ArgoCdAppOutOfSync + annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ + $labels.name }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} is out of sync with the sync status {{ $labels.sync_status + }} for the past 15m. + summary: An ArgoCD Application is Out Of Sync. + expr: | + sum( + argocd_app_info{ + job=~".*", + sync_status!="Synced" + } + ) by (job, dest_server, project, name, sync_status) + > 0 + for: 15m + labels: + severity: warning + - alert: ArgoCdAppUnhealthy + annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ + $labels.name }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} is unhealthy with the health status {{ $labels.health_status + }} for the past 15m. + summary: An ArgoCD Application is Unhealthy. + expr: | + sum( + argocd_app_info{ + job=~".*", + health_status!~"Healthy|Progressing" + } + ) by (job, dest_server, project, name, health_status) + > 0 + for: 15m + labels: + severity: warning + - alert: ArgoCdAppAutoSyncDisabled + annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ + $labels.name }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} has autosync disabled for the past 2h. 
+ summary: An ArgoCD Application has AutoSync Disabled. + expr: | + sum( + argocd_app_info{ + job=~".*", + autosync_enabled!="true", + name!~"" + } + ) by (job, dest_server, project, name, autosync_enabled) + > 0 + for: 2h + labels: + severity: warning + - alert: ArgoCdAppSyncFailed + annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ + $labels.name }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} has failed to sync with the status {{ $labels.phase }} in the + past 10m. + summary: An ArgoCD Application has Failed to Sync. + expr: | + sum( + round( + increase( + argocd_app_sync_total{ + job=~".*", + phase!="Succeeded" + }[10m] + ) + ) + ) by (job, dest_server, project, name, phase) > 0 + for: 1m + labels: + severity: warning + - alert: ArgoCdNotificationDeliveryFailed + annotations: + dashboard_url: https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{ + $labels.job }}&var-exported_service={{ $labels.exported_service }} + description: The notification job {{ $labels.job }} has failed to deliver to + {{ $labels.exported_service }} for the past 10m. + summary: ArgoCD Notification Delivery Failed. + expr: | + sum( + round( + increase( + argocd_notifications_deliveries_total{ + job=~".*", + succeeded!="true" + }[10m] + ) + ) + ) by (job, exported_service, succeeded) > 0 + for: 1m + labels: + severity: warning diff --git a/assets/argo-cd-2/dashboards/argo-cd-application-overview.json b/assets/argo-cd-2/dashboards/argo-cd-application-overview.json new file mode 100644 index 0000000..6c68b86 --- /dev/null +++ b/assets/argo-cd-2/dashboards/argo-cd-application-overview.json @@ -0,0 +1,945 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors ArgoCD with a focus on Application status. 
It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin). Requires custom configuration to add application badges. Please refer to the mixin.", + "editable": true, + "links": [ + { + "tags": [ + "ci/cd", + "argo-cd" + ], + "targetBlank": true, + "title": "ArgoCD Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary by Cluster, Project", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 9, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, health_status)\n", + "legendFormat": "{{ dest_server }}/{{ project }} - {{ health_status }}" + } + ], + "title": "Application Health Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 9, + "x": 9, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, sync_status)\n", + "legendFormat": "{{ dest_server }}/{{ project }} - {{ sync_status }}" + } + ], + "title": "Application Sync Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 9, + "x": 0, + "y": 6 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }[$__rate_interval]\n )\n )\n) by (job, dest_server, project, phase)\n", + "legendFormat": "{{ dest_server }}/{{ project }} - {{ phase }}" + } + ], + "title": "Application Syncs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 9, + "x": 9, + "y": 6 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n 
argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, autosync_enabled)\n", + "legendFormat": "{{ dest_server }}/{{ project }} - {{ autosync_enabled }}" + } + ], + "title": "Application Auto Sync Enabled", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 6, + "options": { + "content": "No applications defined", + "mode": "markdown" + }, + "pluginVersion": "v11.1.0", + "title": "Application Badges", + "type": "text" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 18, + "x": 0, + "y": 11 + }, + "id": 7, + "title": "Applications (Unhealthy/OutOfSync/AutoSyncDisabled) Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Application", + "url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "health_status" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 8, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Application" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n 
health_status!~\"Healthy|Progressing\"\n }\n) by (job, dest_server, project, name, health_status)\n", + "format": "table", + "instant": true + } + ], + "title": "Applications Unhealthy", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "dest_server": true, + "job": true + }, + "indexByName": { + "health_status": 2, + "name": 0, + "project": 1 + }, + "renameByName": { + "dest_server": "Cluster", + "health_status": "Health Status", + "job": "Job", + "name": "Application", + "project": "Project" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Application", + "url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "sync_status" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 9, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Application" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n sync_status!=\"Synced\"\n }\n) by (job, dest_server, project, name, sync_status) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Applications Out Of Sync", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + 
"Time": true, + "Value": true, + "dest_server": true, + "job": true + }, + "indexByName": { + "name": 0, + "project": 1, + "sync_status": 2 + }, + "renameByName": { + "dest_server": "Cluster", + "job": "Job", + "name": "Application", + "project": "Project", + "sync_status": "Sync Status" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Application", + "url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 10, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Application" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n phase!=\"Succeeded\"\n }[7d]\n )\n )\n) by (job, dest_server, project, name, phase) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Applications That Failed to Sync[7d]", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "dest_server": true, + "job": true + }, + "indexByName": { + "name": 0, + "phase": 2, + "project": 1 + }, + "renameByName": { + "Value": "Count", + "dest_server": "Cluster", + "job": "Job", + "name": "Application", + "phase": 
"Phase", + "project": "Project" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Application", + "url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "autosync_enabled" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 11, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Application" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n autosync_enabled!=\"true\"\n }\n) by (job, dest_server, project, name, autosync_enabled) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Applications With Auto Sync Disabled", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "dest_server": true, + "job": true + }, + "indexByName": { + "autosync_enabled": 2, + "name": 0, + "project": 1 + }, + "renameByName": { + "autosync_enabled": "Auto Sync Enabled", + "dest_server": "Cluster", + "job": "Job", + "name": "Application", + "project": "Project" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 12, + "title": "Application ($application)", + "type": "row" + }, + { + 
"datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 13, + "interval": "5m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, health_status)\n", + "legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ health_status }}" + } + ], + "title": "Application Health Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 14, + "interval": "5m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, sync_status)\n", + "legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ sync_status }}" + } + ], + "title": "Application Sync Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + 
"uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 15, + "interval": "5m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }[$__rate_interval]\n )\n )\n) by (namespace, job, dest_server, project, name, phase)\n", + "legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ phase }}" + } + ], + "title": "Application Sync Result", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "ci/cd", + "argo-cd" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(argocd_app_info{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\"}, dest_server)", + "refresh": 2, + 
"sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Project", + "multi": true, + "name": "project", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\"}, project)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Application", + "multi": true, + "name": "application", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\", project=~\"$project\"}, name)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "ArgoCD / Application / Overview", + "uid": "argo-cd-application-overview-kask" +} diff --git a/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json b/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json new file mode 100644 index 0000000..8a04af3 --- /dev/null +++ b/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json @@ -0,0 +1,198 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors ArgoCD notifications. 
It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "ci/cd", + "argo-cd" + ], + "targetBlank": true, + "title": "ArgoCD Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_notifications_deliveries_total{\n namespace=~'$namespace',\njob=~'$job',\n\n exported_service=~\"$exported_service\",\n }[$__rate_interval]\n )\n )\n) by (job, exported_service, succeeded)\n", + "legendFormat": "{{ exported_service }} - Succeeded: {{ succeeded }}" + } + ], + "title": "Notification Deliveries", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_notifications_trigger_eval_total{\n namespace=~'$namespace',\njob=~'$job',\n\n }[$__rate_interval]\n )\n )\n) by (job, name, triggered)\n", + "legendFormat": "{{ name }} - Triggered: {{ triggered }}" + } + ], + "title": "Trigger Evaluations", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "ci/cd", + "argo-cd" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(argocd_notifications_deliveries_total{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Notifications Service", + "multi": true, + "name": "exported_service", + "query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\", job=~\"$job\"}, exported_service)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timezone": "utc", + "title": "ArgoCD / Notifications / Overview", + "uid": "argo-cd-notifications-overview-kask" +} diff --git a/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json b/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json new file mode 100644 index 0000000..a9860b5 --- /dev/null +++ b/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json @@ -0,0 +1,1139 @@ +{ + "__inputs": [ ], + 
"__requires": [ ], + "description": "A dashboard that monitors ArgoCD with a focus on the operational. It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "ci/cd", + "argo-cd" + ], + "targetBlank": true, + "title": "ArgoCD Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_cluster_info{\n namespace=~'$namespace',\n job=~'$job'\n }\n)\n" + } + ], + "title": "Clusters", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n count(\n argocd_app_info{\n namespace=~'$namespace',\n job=~'$job'\n }\n )\n by (repo)\n)\n" + } + ], + "title": "Repositories", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n)\n" + } + ], + "title": "Applications", + "type": "stat" + }, + { + 
"datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Healthy" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Degraded" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Progressing" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (health_status)\n", + "instant": true, + "legendFormat": "{{ health_status }}" + } + ], + "title": "Health Status", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Synced" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OutOfSync" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unknown" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } 
+ ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (sync_status)\n", + "instant": true, + "legendFormat": "{{ sync_status }}" + } + ], + "title": "Sync Status", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Application", + "type": "dashboard", + "url": "/d/argo-cd-application-overview-kask/argocd-notifications-overview?&var-project=${__data.fields.Project}&var-application=${__value.raw}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 7, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Application" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, name, health_status, sync_status)\n", + "format": "table", + "instant": true + } + ], + "title": "Applications", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "dest_server": true, + "job": true + }, + "indexByName": { + "health_status": 2, + "name": 0, + "project": 1, + 
"sync_status": 3 + }, + "renameByName": { + "dest_server": "Cluster", + "health_status": "Health Status", + "job": "Job", + "name": "Application", + "project": "Project", + "sync_status": "Sync Status" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 8, + "title": "Sync Stats", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }[$__rate_interval]\n )\n )\n) by (job, dest_server, project, name)\n", + "legendFormat": "{{ dest_server }}/{{ project }}/{{ name }}" + } + ], + "title": "Sync Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + 
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n phase=~\"Error|Failed\"\n }[$__rate_interval]\n )\n )\n) by (job, dest_server, project, application, phase)\n", + "legendFormat": "{{ dest_server }}/{{ project }}/{{ application }} - {{ phase }}" + } + ], + "title": "Sync Failures", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 11, + "title": "Controller Stats", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_reconcile_count{\n namespace=~'$namespace',\n job=~'$job',\n dest_server=~'$cluster'\n }[$__rate_interval]\n )\n )\n) by (namespace, job, dest_server)\n", + "legendFormat": "{{ namespace }}/{{ dest_server }}" + } + ], + "title": "Reconciliation Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 13, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_app_reconcile_bucket{\n namespace=~'$namespace',\n job=~'$job',\n dest_server=~'$cluster'\n 
}[$__rate_interval]\n )\n) by (le)\n", + "format": "heatmap", + "legendFormat": "{{ le }}" + } + ], + "title": "Reconciliation Performance", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n argocd_app_k8s_request_total{\n namespace=~'$namespace',\n job=~'$job',\n project=~'$project'\n }[$__rate_interval]\n )\n )\n) by (job, server, project, verb, resource_kind)\n", + "legendFormat": "{{ server }}/{{ project }} - {{ verb }}/{{ resource_kind }}" + } + ], + "title": "K8s API Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_kubectl_exec_pending{\n namespace=~'$namespace',\n job=~'$job'\n }\n) by (job, command)\n", + "legendFormat": "{{ dest_server }} - {{ command }}" + } + ], + "title": "Pending Kubectl Runs", + 
"type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 16, + "title": "Cluster Stats", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 32 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_cluster_api_resource_objects{\n namespace=~'$namespace',\n job=~'$job',\n server=~'$cluster'\n }\n) by (namespace, job, server)\n", + "legendFormat": "{{ server }}" + } + ], + "title": "Resource Objects", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 32 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n argocd_cluster_api_resources{\n namespace=~'$namespace',\n job=~'$job',\n server=~'$cluster'\n }\n) by (namespace, job, server)\n", + "legendFormat": "{{ server }}" + } + ], + "title": "API Resources", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + 
}, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 32 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_cluster_events_total{\n namespace=~'$namespace',\n job=~'$job',\n server=~'$cluster'\n }[$__rate_interval]\n )\n) by (namespace, job, server)\n", + "legendFormat": "{{ server }}" + } + ], + "title": "Cluster Events", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 20, + "title": "Repo Server Stats", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_git_request_total{\n namespace=~'$namespace',\n job=~'$job',\n request_type=\"ls-remote\"\n }[$__rate_interval]\n )\n) by (namespace, job, repo)\n", + "legendFormat": "{{ namespace }} - {{ repo }}" + } + ], + "title": "Git Requests (ls-remote)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + 
"fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_git_request_total{\n namespace=~'$namespace',\n job=~'$job',\n request_type=\"fetch\"\n }[$__rate_interval]\n )\n) by (namespace, job, repo)\n", + "legendFormat": "{{ namespace }} - {{ repo }}" + } + ], + "title": "Git Requests (checkout)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 23, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_git_request_duration_seconds_bucket{\n namespace=~'$namespace',\n job=~'$job',\n request_type=\"fetch\"\n }[$__rate_interval]\n )\n) by (le)\n", + "format": "heatmap", + "legendFormat": "{{ le }}" + } + ], + "title": "Git Fetch Performance", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 24, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n argocd_git_request_duration_seconds_bucket{\n namespace=~'$namespace',\n job=~'$job',\n request_type=\"ls-remote\"\n }[$__rate_interval]\n )\n) by 
(le)\n", + "format": "heatmap", + "legendFormat": "{{ le }}" + } + ], + "title": "Git Ls-remote Performance", + "type": "heatmap" + } + ], + "schemaVersion": 39, + "tags": [ + "ci/cd", + "argo-cd" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(argocd_app_info{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(job)", + "refresh": 2, + "regex": "argo.*", + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\"}, dest_server)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Project", + "multi": true, + "name": "project", + "query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\"}, project)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "ArgoCD / Operational / Overview", + "uid": "argo-cd-operational-overview-kask" +} diff --git a/assets/argo-cd-2/rules.yaml b/assets/argo-cd-2/rules.yaml new file mode 100644 index 0000000..19765bd --- /dev/null +++ b/assets/argo-cd-2/rules.yaml @@ -0,0 +1 @@ +null diff --git a/assets/celery/alerts.yaml b/assets/celery/alerts.yaml new file mode 100644 index 0000000..9f8d8c1 --- 
/dev/null +++ b/assets/celery/alerts.yaml @@ -0,0 +1,76 @@ +groups: +- name: celery + rules: + - alert: CeleryTaskHighFailRate + annotations: + dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{ + $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name + }} + description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name + }}/{{ $labels.name }} the past 10m. + summary: Celery high task fail rate. + expr: | + sum( + increase( + celery_task_failed_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + / + ( + sum( + increase( + celery_task_failed_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + + + sum( + increase( + celery_task_succeeded_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + ) + * 100 > 5 + for: 1m + labels: + severity: warning + - alert: CeleryHighQueueLength + annotations: + dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ + $labels.job }}&var-queue_name={{ $labels.queue_name }} + description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name + }} the past 20m. + summary: Celery high queue length. + expr: | + sum( + celery_queue_length{ + job=~".*celery.*", + queue_name!~"None" + } + ) by (job, namespace, queue_name) + > 100 + for: 20m + labels: + severity: warning + - alert: CeleryWorkerDown + annotations: + dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ + $labels.job }} + description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline. + summary: A Celery worker is offline. 
+ expr: | + celery_worker_up{job=~".*celery.*"} == 0 + for: 15m + labels: + severity: warning diff --git a/assets/celery/dashboards/celery-tasks-by-task.json b/assets/celery/dashboards/celery-tasks-by-task.json new file mode 100644 index 0000000..98da03e --- /dev/null +++ b/assets/celery/dashboards/celery-tasks-by-task.json @@ -0,0 +1,590 @@ +{ + "description": "A dashboard that monitors Celery. It is created using the Celery-mixin for the [Celery-exporter](https://github.com/danihodovic/celery-exporter)", + "editable": true, + "links": [ + { + "tags": [ + "celery", + "celery-mixin" + ], + "targetBlank": true, + "title": "Celery Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Tasks", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "noValue": 0, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Succeeded" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n/(sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n+sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n) > -1\n", + "format": 
"table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n 
}[$__range]\n )\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Task Stats", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5, + "Value #F": 6, + "Value #G": 7, + "Value #H": 8, + "name": 0 + }, + "renameByName": { + "Value #A": "Success Rate", + "Value #B": "Succeeded", + "Value #C": "Failed", + "Value #D": "Sent", + "Value #E": "Received", + "Value #F": "Rejected", + "Value #G": "Retried", + "Value #H": "Revoked", + "name": "Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 3, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n ) by (name, exception) > 0\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Task Exceptions", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 2, + "exception": 1, + "name": 0 + }, + "renameByName": { + "exception": "Exception", + "name": "Task" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + 
"mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Succeeded - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Failed - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Sent - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Received - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Retried - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by 
(name) > 0\n", + "legendFormat": "Revoked - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", + "legendFormat": "Rejected - {{ name }}" + } + ], + "title": "Tasks Completed", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name, exception) > 0\n", + "legendFormat": "{{ name }}/{{ exception }}" + } + ], + "title": "Task Exceptions", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "P50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P95" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P99" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", + "legendFormat": "P50 - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", + "legendFormat": "P95 - {{ name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", + "legendFormat": "P99 - {{ name }}" + } + ], + "title": "Tasks Runtime", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "celery", + "celery-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(celery_worker_up{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(celery_worker_up{namespace=\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Queue Name", + "multi": false, + "name": "queue_name", + "query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", name!~\"None\"}, queue_name)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Task", + "multi": true, + "name": "task", + "query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", queue_name=~\"$queue_name\", name!~\"None\"}, name)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timezone": "utc", + "title": "Celery / Tasks / By Task", + "uid": "celery-tasks-by-task-32s3" +} diff --git a/assets/celery/dashboards/celery-tasks-overview.json b/assets/celery/dashboards/celery-tasks-overview.json new file mode 100644 index 0000000..c752bf5 --- /dev/null +++ b/assets/celery/dashboards/celery-tasks-overview.json @@ -0,0 +1,1008 @@ +{ + "description": "A dashboard that monitors Celery. 
It is created using the Celery-mixin for the [Celery-exporter](https://github.com/danihodovic/celery-exporter).", + "editable": true, + "links": [ + { + "tags": [ + "celery", + "celery-mixin" + ], + "targetBlank": true, + "title": "Celery Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n celery_worker_up{\n job=\"$job\",\n } == 1\n)\n" + } + ], + "title": "Workers", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 5, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n celery_worker_tasks_active{\n job=\"$job\",\n }\n)\n" + } + ], + "title": "Tasks Active", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 
0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 10, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n )\n)\n" + } + ], + "title": "Tasks received by workers [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.94999999999999996 + }, + { + "color": "green", + "value": 0.98999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 15, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n )\n)\n/(sum(\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n )\n)\n+sum(\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n )\n)\n)\n" + } + ], + "title": "Tasks Success Rate [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n celery_task_runtime_sum{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n)\n/\nsum(\n rate(\n celery_task_runtime_count{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n) > 0\n" + } + ], + "title": "Average Runtime for Tasks [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Task" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To View", + "type": "dashboard", + "url": "/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-task=${__data.fields.Task}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 5 + }, + "id": 7, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n ) > 0\n ) by (job, name)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Failed Tasks [1w]", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 1, + "name": 0 + }, + "renameByName": { + "name": "Task" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "id": 8, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + 
"desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n ) by (job, exception) > 0\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Task Exceptions [1w]", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 1, + "exception": 0 + }, + "renameByName": { + "exception": "Exception" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Task" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Task", + "type": "dashboard", + "url": "/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-task=${__data.fields.Task}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 5 + }, + "id": 9, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Runtime" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n rate(\n celery_task_runtime_sum{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n) by(name)\n/\nsum (\n rate(\n celery_task_runtime_count{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[1w]\n )\n) by (name) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Top Average Task Runtime [1w]", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value": 1, + "name": 0 + }, + "renameByName": { + "Value": "Runtime", + 
"name": "Task" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 10, + "title": "Queues", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n celery_queue_length{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }\n) by (job, queue_name)\n", + "legendFormat": "{{ job }}/{{ queue_name }}" + } + ], + "title": "Queue Length", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 12, + "title": "Tasks", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "noValue": 0, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 13, + "options": { + "sortBy": [ + { + "desc": true, + "displayName": "Succeeded" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job)\n/(sum (\n round(\n increase(\n celery_task_succeeded_total{\n 
job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job)\n+sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job)\n) > -1\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum 
(\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (job) > 0\n", + "format": "table", + "instant": true + } + ], + "title": "Task Stats", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5, + "Value #F": 6, + "Value #G": 7, + "Value #H": 8, + "job": 0 + }, + "renameByName": { + "Value #A": "Success Rate", + "Value #B": "Succeeded", + "Value #C": "Failed", + "Value #D": "Sent", + "Value #E": "Received", + "Value #F": "Rejected", + "Value #G": "Retried", + "Value #H": "Revoked", + "job": "Job" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Succeeded" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Failed" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n 
queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Sent" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Retried" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Revoked" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "Rejected" + } + ], + "title": "Tasks Completed", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "P50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P95" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "options": { + 
"legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Tasks Runtime", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "celery", + "celery-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(celery_worker_up{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(celery_worker_up{namespace=\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "includeAll": false, + "label": "Queue Name", + "multi": false, + "name": "queue_name", + "query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", name!~\"None\"}, queue_name)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timezone": "utc", + "title": "Celery / Tasks / Overview", + "uid": "celery-tasks-overview-32s3" +} diff --git a/assets/celery/rules.yaml b/assets/celery/rules.yaml new file mode 100644 index 0000000..19765bd --- /dev/null +++ b/assets/celery/rules.yaml @@ -0,0 +1 @@ +null diff --git a/assets/django/alerts.yaml b/assets/django/alerts.yaml new file mode 100644 index 0000000..9fd1f94 --- /dev/null +++ b/assets/django/alerts.yaml @@ -0,0 +1,97 @@ +groups: +- name: django + rules: + - alert: DjangoMigrationsUnapplied + annotations: + dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }} + description: The job {{ $labels.job }} has unapplied migrations. + summary: Django has unapplied migrations. + expr: | + sum( + django_migrations_unapplied_total{ + job=~"django" + } + ) by (namespace, job) + > 0 + for: 15m + labels: + severity: warning + - alert: DjangoDatabaseException + annotations: + dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }} + description: The job {{ $labels.job }} has hit the database exception {{ $labels.type + }}. + summary: Django database exception. 
+ expr: | + sum ( + increase( + django_db_errors_total{ + job=~"django" + }[10m] + ) + ) by (type, namespace, job) + > 0 + labels: + severity: info + - alert: DjangoHighHttp4xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }} + description: More than 5% of HTTP requests had status 4xx for {{ $labels.job }}/{{ + $labels.view }} in the past 5m. + summary: Django high HTTP 4xx error rate. + expr: | + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + status=~"^4.*", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + / + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + * 100 > 5 + for: 1m + labels: + severity: warning + - alert: DjangoHighHttp5xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }} + description: More than 5% of HTTP requests had status 5xx for {{ $labels.job }}/{{ + $labels.view }} in the past 5m. + summary: Django high HTTP 5xx error rate. 
+ expr: | + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + status=~"^5.*", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + / + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + * 100 > 5 + for: 1m + labels: + severity: warning diff --git a/assets/django/dashboards/django-overview.json b/assets/django/dashboards/django-overview.json new file mode 100644 index 0000000..9d181db --- /dev/null +++ b/assets/django/dashboards/django-overview.json @@ -0,0 +1,741 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Django which focuses on giving an overview of the system (requests, db, cache). It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "django", + "django-mixin" + ], + "targetBlank": true, + "title": "Django Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n 
view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n }[$__rate_interval]\n )\n ), 0.001\n)\n" + } + ], + "title": "Request Volume", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n rate (\n django_db_execute_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n )\n) by (namespace, job)\n" + } + ], + "title": "Database Ops", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n rate (\n django_cache_get_hits_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n/\nsum (\n rate (\n django_cache_get_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n" + } + ], + "title": "Cache Hitrate [30m]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": 
"percent" + } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "2xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "3xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": 
"round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "4xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "5xx" + } + ], + "title": "Responses", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 6, + "title": "Database", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max (\n django_migrations_applied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n" + } + ], + "title": "Migrations Applied", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 12 + }, + "id": 8, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + 
"pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max (\n django_migrations_unapplied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n" + } + ], + "title": "Migrations Unapplied", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 9, + "options": { + "sortBy": [ + { + "displayName": "Type" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n topk(10,\n sum by (type) (\n increase(\n django_db_errors_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[1w]\n ) > 0\n )\n )\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Database Errors (1w)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "job": 1, + "namespace": 0, + "type": 2 + }, + "renameByName": { + "job": "Job", + "namespace": "Namespace", + "type": "Type" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n django_db_new_connections_total{\n namespace=~\"$namespace\",\n 
job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, vendor)\n)\n", + "legendFormat": "{{ vendor }}" + } + ], + "title": "Database Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n", + "legendFormat": "50 - {{ vendor }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n", + "legendFormat": "95 - {{ vendor }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n", + "legendFormat": "99 - {{ vendor }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.999,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n 
) by (vendor, namespace, job, le)\n)\n", + "legendFormat": "99.9 - {{ vendor }}" + } + ], + "title": "Database Latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 12, + "title": "Cache", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "percent" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n django_cache_get_hits_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n", + "legendFormat": "Hit - {{ backend }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n django_cache_get_misses_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n", + "legendFormat": "Miss - {{ backend }}" + } + ], + "title": "Cache Get", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "django", + "django-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(django_http_responses_total_by_status_view_method_total{}, 
namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Django / Overview", + "uid": "django-overview-jkwq" +} diff --git a/assets/django/dashboards/django-requests-by-view.json b/assets/django/dashboards/django-requests-by-view.json new file mode 100644 index 0000000..55595f5 --- /dev/null +++ b/assets/django/dashboards/django-requests-by-view.json @@ -0,0 +1,673 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Django which focuses on breaking down requests by view. It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "django", + "django-mixin" + ], + "targetBlank": true, + "title": "Django Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0.90000000000000002 + }, + { + "color": "yellow", + "value": 0.94999999999999996 + }, + { + "color": "green", + "value": 0.98999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n 
django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n status!~\"[4-5].*\"\n }[1w]\n )\n) /\nsum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[1w]\n )\n)\n" + } + ], + "title": "Success Rate (non 4xx-5xx responses) [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 1 + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (view) (\n increase(\n django_http_exceptions_total_by_view_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n }[1w]\n ) > 0\n)\n" + } + ], + "title": "HTTP Exceptions [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 2000 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n 
view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n" + } + ], + "title": "Average Request Latency (P50) [1w]", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 2500 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n" + } + ], + "title": "Average Request Latency (P95) [1w]", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 6, + "title": "Request & Responses", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n 
view=\"$view\"\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "reqps" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "percent" + } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "2xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n 
namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "3xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "4xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n", + "legendFormat": "5xx" + } + ], + "title": "Responses", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 9, + "title": "Latency & Status Codes", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, status, method), 
0.001\n)\n", + "legendFormat": "{{ view }} / {{ status }} / {{ method }}" + } + ], + "title": "Responses Status Codes", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n", + "legendFormat": "50 - {{ view }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n", + "legendFormat": "95 - {{ view }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n", + "legendFormat": "99 - {{ view }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.999,\n sum(\n irate(\n 
django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n", + "legendFormat": "99.9 - {{ view }}" + } + ], + "title": "Request Latency", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "django", + "django-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "View", + "multi": false, + "name": "view", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view!~\"|health_check:health_check_home|prometheus-django-metrics\"}, view)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Method", + "multi": true, + "name": "method", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view=~\"$view\"}, method)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + 
"title": "Django / Requests / By View", + "uid": "django-requests-by-view-jkwq" +} diff --git a/assets/django/dashboards/django-requests-overview.json b/assets/django/dashboards/django-requests-overview.json new file mode 100644 index 0000000..11c4f48 --- /dev/null +++ b/assets/django/dashboards/django-requests-overview.json @@ -0,0 +1,1109 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Django which focuses on giving an overview of requests. It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "django", + "django-mixin" + ], + "targetBlank": true, + "title": "Django Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.001 + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\"\n }[$__rate_interval]\n )\n ), 0.001\n)\n" + } + ], + "title": "Request Volume", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0.90000000000000002 + }, + { + "color": 
"yellow", + "value": 0.94999999999999996 + }, + { + "color": "green", + "value": 0.98999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status!~\"[4-5].*\"\n }[$__rate_interval]\n )\n) /\nsum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\"\n }[$__rate_interval]\n )\n)\n" + } + ], + "title": "Success Rate (non 4-5xx responses)", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 2500 + }, + { + "color": "red", + "value": 5000 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum (\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n }[$__rate_interval]\n )\n ) by (job, le)\n)\n" + } + ], + "title": "Request Latency (P95)", + "type": "stat" + }, + { + "datasource": { + 
"type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0.10000000000000001 + }, + { + "color": "yellow", + "value": 0.20000000000000001 + }, + { + "color": "green", + "value": 0.29999999999999999 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum (\n rate (\n django_http_requests_body_total_bytes_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (job, le)\n)\n" + } + ], + "title": "Request Body Size (P95)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 6, + "title": "API Views & Other", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": 
true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"2.*\",\n view!~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 2xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"4.*\",\n view!~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 4xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"5.*\",\n view!~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 5xx" + } + ], + "title": "API & Other Views Response Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "dtdurations" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "View" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To View", 
+ "type": "dashboard", + "url": "/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace=${__data.fields.Namespace}&var-job=${__data.fields.Job}&var-view=${__data.fields.View}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 8, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "P50 Latency" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view!~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view!~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view!~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "API & Other Views Request Latency", + "transformations": [ + 
{ + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value #A": 3, + "Value #B": 4, + "Value #C": 5, + "job": 1, + "namespace": 0, + "view": 2 + }, + "renameByName": { + "Value #A": "P50 Latency", + "Value #B": "P95 Latency", + "Value #C": "P99 Latency", + "job": "Job", + "namespace": "Namespace", + "view": "View" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 9, + "title": "Admin Views", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n 
view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"2.*\",\n view=~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 2xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"4.*\",\n view=~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 4xx" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\",\n status=~\"5.*\",\n view=~\"admin.*\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view), 0.001\n)\n", + "legendFormat": "{{ view }} / 5xx" + } + ], + "title": "Admin Views Response Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "dtdurations" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "View" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To View", + "type": "dashboard", + "url": "/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace=${__data.fields.Namespace}&var-job=${__data.fields.Job}&var-view=${__data.fields.View}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 11, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + 
"displayName": "P50 Latency" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view=~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view=~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n rate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=~\"$view\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics|\",\n view=~\"admin.*\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, le)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Admin Request Latency", + "transformations": [ + { + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value #A": 3, + "Value #B": 4, + "Value #C": 5, + "job": 1, + "namespace": 0, + "view": 2 + }, + "renameByName": { + "Value #A": "P50 Latency", + "Value #B": "P95 Latency", + "Value #C": "P99 Latency", + "job": "Job", + "namespace": "Namespace", + 
"view": "View" + } + } + } + ] + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 12, + "title": "Weekly Breakdown", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "View" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To View", + "type": "dashboard", + "url": "/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace=${__data.fields.Namespace}&var-job=${__data.fields.Job}&var-view=${__data.fields.View}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 13, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n topk(10,\n sum by (namespace, job, view) (\n increase(\n django_http_exceptions_total_by_view_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n }[1w]\n ) > 0\n )\n )\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Exceptions by View (1w)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "job": 1, + "namespace": 0, + "view": 2 + }, + "renameByName": { + "job": "Job", + "namespace": "Namespace", + "view": "View" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 14, + "options": { + "footer": { + "enablePagination": true + }, + 
"sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n topk(10,\n sum by (namespace, job, type) (\n increase(\n django_http_exceptions_total_by_type_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[1w]\n ) > 0\n )\n )\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Exceptions by Type (1w)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "job": 1, + "namespace": 0, + "type": 2 + }, + "renameByName": { + "job": "Job", + "namespace": "Namespace", + "type": "Type" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "View" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To View", + "type": "dashboard", + "url": "/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace=${__data.fields.Namespace}&var-job=${__data.fields.Job}&var-view=${__data.fields.View}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 15, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n topk(10,\n sum by (namespace, job, view) (\n increase(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"|health_check:health_check_home|prometheus-django-metrics\",\n method=~\"$method\"\n }[1w]\n ) > 0\n )\n )\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top 
Responses By View (1w)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "job": 1, + "namespace": 0, + "view": 2 + }, + "renameByName": { + "job": "Job", + "namespace": "Namespace", + "view": "View" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 16, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(10,\n round(\n sum by (namespace, job, templatename) (\n increase(\n django_http_responses_total_by_templatename_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n templatename!~\".*'health_check/index.html'.*|None\"\n }[1w]\n ) > 0\n )\n )\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top Templates (1w)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "job": 1, + "namespace": 0, + "templatename": 2 + }, + "renameByName": { + "job": "Job", + "namespace": "Namespace", + "templatename": "Template Name" + } + } + } + ], + "type": "table" + } + ], + "schemaVersion": 39, + "tags": [ + "django", + "django-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "View", + "multi": true, + "name": "view", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view!~\"|health_check:health_check_home|prometheus-django-metrics\"}, view)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Method", + "multi": true, + "name": "method", + "query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view=~\"$view\"}, method)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timezone": "utc", + "title": "Django / Requests / Overview", + "uid": "django-requests-jkwq" +} diff --git a/assets/django/rules.yaml b/assets/django/rules.yaml new file mode 100644 index 0000000..19765bd --- /dev/null +++ b/assets/django/rules.yaml @@ -0,0 +1 @@ +null diff --git a/assets/ingress-nginx-mixin/alerts.yaml b/assets/ingress-nginx-mixin/alerts.yaml new file mode 100644 index 0000000..df165af --- /dev/null +++ b/assets/ingress-nginx-mixin/alerts.yaml @@ -0,0 +1,42 @@ +groups: +- name: nginx.rules + rules: + - alert: NginxConfigReloadFailed + annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{ + $labels.job }}&var-controller_class={{ $labels.controller_class }} + description: Nginx config reload failed for the controller with the class {{ + $labels.controller_class }}. 
+ summary: Nginx config reload failed. + expr: | + sum( + nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"} + ) by (job, controller_class) + == 0 + for: 5m + labels: + severity: warning + - alert: NginxHighHttp4xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ + $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} + description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace + }}/{{ $labels.ingress }} the past 5m. + summary: Nginx high HTTP 4xx error rate. + expr: | + (sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5 + for: 1m + labels: + severity: info + - alert: NginxHighHttp5xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ + $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} + description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace + }}/{{ $labels.ingress }} the past 5m. + summary: Nginx high HTTP 5xx error rate. 
+ expr: | + (sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5 + for: 1m + labels: + severity: warning diff --git a/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json b/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json new file mode 100644 index 0000000..cd977a6 --- /dev/null +++ b/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json @@ -0,0 +1,812 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Ingress-nginx. It is created using the [Ingress-Nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)", + "editable": true, + "links": [ + { + "tags": [ + "ingress-nginx", + "ingress-nginx-mixin" + ], + "targetBlank": true, + "title": "Ingress Nginx Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Controller", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.001 + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n namespace=~\"$namespace\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\"\n }[$__rate_interval]\n )\n ), 0.001\n)\n" + } + ], + "title": "Controller 
Request Volume", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n avg_over_time(\n nginx_ingress_controller_nginx_process_connections{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n" + } + ], + "title": "Controller Connections", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.94999999999999996 + }, + { + "color": "green", + "value": 0.98999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n 
exported_namespace=~\"$exported_namespace\",\n namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n" + } + ], + "title": "Controller Success Rate (non $error_codes-xx responses)", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "avg(\n irate(\n nginx_ingress_controller_success{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n) * 60\n" + } + ], + "title": "Config Reloads", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n nginx_ingress_controller_config_last_reload_successful{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_namespace=~\"$namespace\"\n } == 0\n) OR vector(0)\n" + } + ], + "title": "Last Config Failed", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 7, + "title": "Ingress", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { 
+ "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n ingress=~\"$ingress\",\n exported_namespace=~\"$exported_namespace\"\n }[$__rate_interval]\n )\n ) by (ingress, exported_namespace), 0.001\n)\n", + "legendFormat": "{{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Ingress Request Volume", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by 
(ingress, exported_namespace)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (ingress, exported_namespace)\n", + "legendFormat": "{{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Ingress Success Rate (non $error_codes-xx responses)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "dtdurations" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Ingress" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Ingress", + "type": "dashboard", + "url": "/d/ingress-nginx-request-handling-jqkw/ingress-nginx-overview?var-exported_namespace=${__data.fields.Namespace}&var-ingress=${__data.fields.Ingress}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "IN" + }, + "properties": [ + { + "id": "unit", + "value": "binBps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OUT" + }, + "properties": [ + { + "id": "unit", + "value": "binBps" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "P50 Latency" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.50, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n 
}[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.90, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.99, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n irate(\n nginx_ingress_controller_request_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n irate(\n nginx_ingress_controller_response_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n 
exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n", + "format": "table", + "instant": true + } + ], + "title": "Ingress Percentile Response Times and Transfer Rates", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #E": 6, + "exported_namespace": 0, + "ingress": 1 + }, + "renameByName": { + "Value #A": "P50 Latency", + "Value #B": "P90 Latency", + "Value #C": "P99 Latency", + "Value #D": "IN", + "Value #E": "OUT", + "exported_namespace": "Namespace", + "ingress": "Ingress", + "job": "Job" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 11, + "title": "Certificates", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Host" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "Go To Site", + "type": "link", + "url": "https://${__data.fields.Host}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "TTL" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1814400 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 12, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "TTL" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "$datasource" + }, + "expr": "avg(\n nginx_ingress_controller_ssl_expire_time_seconds{\n job=~\"$job\",\n pod=~\"$controller\"\n }\n) by (host) - time()\n", + "format": "table", + "instant": true + } + ], + "title": "Ingress Certificate Expiry", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": { + "Value": 1, + "host": 0 + }, + "renameByName": { + "Value": "TTL", + "host": "Host" + } + } + } + ], + "type": "table" + } + ], + "schemaVersion": 39, + "tags": [ + "ingress-nginx", + "ingress-nginx-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(nginx_ingress_controller_config_hash{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Controller Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\"}, controller_namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Controller Class", + "multi": true, + "name": "controller_class", + "query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\"}, controller_class)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Controller", + "multi": true, + "name": "controller", + "query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\", 
controller_class=~\"$controller_class\"}, controller_pod)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Ingress Namespace", + "multi": true, + "name": "exported_namespace", + "query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\"}, exported_namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Ingress", + "multi": true, + "name": "ingress", + "query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\"}, ingress)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": "4-5", + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "4 represents all 4xx codes, 5 represents all 5xx codes", + "includeAll": true, + "label": "Error Codes", + "multi": true, + "name": "error_codes", + "options": [ + { + "selected": true, + "text": "4", + "value": "4" + }, + { + "selected": false, + "text": "5", + "value": "5" + } + ], + "query": "4 : 4,5 : 5", + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timezone": "utc", + "title": "Ingress Nginx / Overview", + "uid": "ingress-nginx-overview-12mk" +} diff --git a/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json b/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json new file mode 100644 index 0000000..fee174d --- /dev/null +++ b/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json @@ -0,0 +1,594 @@ +{ + "__inputs": [ ], + 
"__requires": [ ], + "description": "A dashboard that monitors Ingress-nginx. It is created using the [Ingress-Nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)", + "editable": true, + "links": [ + { + "tags": [ + "ingress-nginx", + "ingress-nginx-mixin" + ], + "targetBlank": true, + "title": "Ingress Nginx Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Ingress Response Times", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": 
"histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Total Request Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n 
nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Upstream Response Time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 4, + "title": "Ingress Paths", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n", + "legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Request Volume", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + 
} + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n .5,\n sum by (le, path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n", + "legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Median upstream response time", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n", + "legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Response error rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + 
"unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_response_duration_seconds_sum{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n", + "legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Upstream time consumed", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by(path, ingress, exported_namespace, status)\n", + "legendFormat": "{{ status }} {{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Response error volume", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n/\nsum (\n rate(\n nginx_ingress_controller_response_size_count {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n", + "legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}" + } + ], + "title": "Average response size", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "ingress-nginx", + "ingress-nginx-mixin" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(nginx_ingress_controller_config_hash{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Ingress Namespace", + "multi": true, + "name": "exported_namespace", + "query": "label_values(nginx_ingress_controller_requests{job=\"$job\"}, 
exported_namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Ingress", + "multi": true, + "name": "ingress", + "query": "label_values(nginx_ingress_controller_requests{job=\"$job\", exported_namespace=~\"$exported_namespace\"}, ingress)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": "4-5", + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "description": "4 represents all 4xx codes, 5 represents all 5xx codes", + "includeAll": true, + "label": "Error Codes", + "multi": true, + "name": "error_codes", + "options": [ + { + "selected": true, + "text": "4", + "value": "4" + }, + { + "selected": false, + "text": "5", + "value": "5" + } + ], + "query": "4 : 4,5 : 5", + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timezone": "utc", + "title": "Ingress Nginx / Request Handling Performance", + "uid": "ingress-nginx-request-handling-jqkw" +} diff --git a/assets/ingress-nginx-mixin/rules.yaml b/assets/ingress-nginx-mixin/rules.yaml new file mode 100644 index 0000000..2ae2220 --- /dev/null +++ b/assets/ingress-nginx-mixin/rules.yaml @@ -0,0 +1 @@ +groups: [] diff --git a/assets/kubernetes-autoscaling/alerts.yaml b/assets/kubernetes-autoscaling/alerts.yaml new file mode 100644 index 0000000..2ab623e --- /dev/null +++ b/assets/kubernetes-autoscaling/alerts.yaml @@ -0,0 +1,73 @@ +groups: +- name: karpenter + rules: + - alert: KarpenterCloudProviderErrors + annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance + description: The Karpenter provider {{ $labels.provider }} with the controller + {{ $labels.controller }} has errors with the method {{ $labels.method }}. + summary: Karpenter has Cloud Provider Errors. 
+ expr: | + sum( + increase( + karpenter_cloudprovider_errors_total{ + job=~"karpenter" + }[5m] + ) + ) by (namespace, job, provider, controller, method) > 0 + for: 5m + labels: + severity: warning + - alert: KarpenterNodepoolNearCapacity + annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview + description: The resource {{ $labels.resource_type }} in the Karpenter node + pool {{ $labels.nodepool }} is nearing its limit. Consider scaling or adding + resources. + summary: Karpenter Nodepool near capacity. + expr: | + sum ( + karpenter_nodepools_usage{job=~"karpenter"} + ) by (namespace, job, nodepool, resource_type) + / + sum ( + karpenter_nodepools_limit{job=~"karpenter"} + ) by (namespace, job, nodepool, resource_type) + * 100 > 75 + for: 15m + labels: + severity: warning +- name: cluster-autoscaler + rules: + - alert: ClusterAutoscalerNodeCountNearCapacity + annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler + description: The node count for the cluster autoscaler job {{ $labels.job }} + is reaching max limit. Consider scaling node groups. + summary: Cluster Autoscaler Node Count near Capacity. + expr: | + sum ( + cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + / + sum ( + cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + * 100 > 75 + for: 15m + labels: + severity: warning + - alert: ClusterAutoscalerUnschedulablePods + annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler + description: The cluster currently has unschedulable pods, indicating resource + shortages. Consider adding more nodes or increasing node group capacity. 
+ summary: Pods Pending Scheduling - Cluster Node Group Scaling Required + expr: | + sum ( + cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + > 0 + for: 15m + labels: + severity: warning diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json new file mode 100644 index 0000000..1993f0e --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json @@ -0,0 +1,643 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Kubernetes and focuses on giving a overview for cluster autoscaler. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "cluster-autoscaler" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling / Cluster Autoscaler", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n" + } + ], + "title": "Total Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": 
"-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_max_nodes_count{\n job=~\"$job\"\n }\n )\n)\n" + } + ], + "title": "Max Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_node_groups_count{\n job=~\"$job\"\n }\n )\n)\n" + } + ], + "title": "Node Groups", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\",\n state=\"ready\"\n }\n ) /\n sum(\n 
cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) * 100\n)\n" + } + ], + "title": "Healthy Nodes", + "type": "gauge" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_cluster_safe_to_autoscale{\n job=~\"$job\"\n }\n )\n)\n" + } + ], + "title": "Safe To Scale", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 7, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }\n )\n)\n" + } + ], + "title": "Unscheduled Pods", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "$datasource" + }, + "expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleDown\"\n }\n)\n" + } + ], + "title": "Last Scale Down", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 9, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleUp\"\n }\n)\n" + } + ], + "title": "Last Scale Up", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }[2m]\n )\n ) by (type)\n)\n", + "legendFormat": "{{ type }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n cluster_autoscaler_evicted_pods_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n", + "legendFormat": "Evicted Pods" + } + ], + "title": "Pod Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + 
"uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) by (state)\n)\n", + "legendFormat": "{{ state }}" + } + ], + "title": "Node Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n", + "legendFormat": "Total Nodes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n cluster_autoscaler_unneeded_nodes_count{\n job=~\"$job\"\n }\n )\n)\n", + "legendFormat": "Unneeded Nodes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_up_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n", + "legendFormat": "Scaled Up Nodes" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_down_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n", + "legendFormat": "Scaled Down Nodes" + } + ], + "title": "Autoscaling Activity", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "cluster-autoscaler" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(cluster_autoscaler_last_activity{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Cluster Autoscaler", + "uid": "kubernetes-autoscaling-mixin-ca-jkwq" +} diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json new file mode 100644 index 0000000..3abaccc --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json @@ -0,0 +1,507 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Kubernetes and focuses on giving a overview for horizontal pod autoscalers. 
It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n" + } + ], + "title": "Desired Replicas", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n 
)\n)\n" + } + ], + "title": "Current Replicas", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n" + } + ], + "title": "Min Replicas", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n" + } + ], + "title": "Max Replicas", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 6, + "options": { + "sortBy": [ + { + "displayName": "Horitzontal Pod Autoscaler" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Metric Targets", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 4, + "horizontalpodautoscaler": 1, + "metric_name": 2, + "metric_target_type": 3, + "namespace": 0 + }, + "renameByName": { + "Value #A": "Threshold", + "horizontalpodautoscaler": "Horitzontal Pod Autoscaler", + "metric_name": "Metric Name", + "metric_target_type": "Metric Target Type", + "namespace": "Namespace" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n", + "legendFormat": "Utilization / {{ metric_name }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n", + "legendFormat": "Threshold / {{ metric_name }}" + } + ], + "title": "Utilization & Threshold", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n", + "legendFormat": "Desired Replicas" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n", + "legendFormat": "Current Replicas" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n", + "legendFormat": "Min Replicas" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n 
kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n", + "legendFormat": "Max Replicas" + } + ], + "title": "Replicas", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(kube_horizontalpodautoscaler_metadata_generation{job=~\"$job\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Horitzontal Pod Autoscaler", + "name": "hpa", + "query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\"},horizontalpodautoscaler)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Metric Name", + "multi": true, + "name": "metric_name", + "query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\", horizontalpodautoscaler=\"$hpa\"}, metric_name)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Horitzontal Pod Autoscaler", + "uid": "kubernetes-autoscaling-mixin-hpa-jkwq" +} diff --git 
a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json new file mode 100644 index 0000000..89075a7 --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json @@ -0,0 +1,482 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling / Karpenter", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Node Pool Activity", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_nodes_created_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", + "interval": "1m", + "legendFormat": "{{ nodepool }}" + } + ], + "title": "Nodes Created by Node Pool", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- 
Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_nodes_terminated_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", + "interval": "1m", + "legendFormat": "{{ nodepool }}" + } + ], + "title": "Nodes Terminated by Node Pool", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_decisions_total{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (decision, reason, consolidation_type)\n)\n", + "interval": "1m", + "legendFormat": "{{ decision }} - {{ reason }} - {{ consolidation_type }}" + } + ], + "title": "Node Disruption Decisions by Reason, Decision, and Consolidation Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + 
"custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_eligible_nodes{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (reason)\n)\n", + "interval": "1m", + "legendFormat": "{{ reason }}" + } + ], + "title": "Nodes Eligible for Disruption by Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_nodeclaims_disrupted_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool, capacity_type, reason)\n)\n", + "interval": "1m", + "legendFormat": "{{ nodepool }} - {{ capacity_type }} - {{ reason }}" + } + ], + "title": "Nodes Disrupted by Node Pool", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 7, + "title": "Pod Activity", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + 
"uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n karpenter_pods_state{\n job=~\"$job\"\n }\n ) by (phase)\n)\n", + "interval": "1m", + "legendFormat": "{{ phase }}" + } + ], + "title": "Pods by Phase", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n", + "interval": "1m", + 
"legendFormat": "P99" + } + ], + "title": "Pods Startup Duration", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(karpenter_nodes_allocatable{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Node Pool", + "multi": true, + "name": "nodepool", + "query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Karpenter / Activity", + "uid": "kubernetes-autoscaling-mixin-kact-jkwq" +} diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json new file mode 100644 index 0000000..0866681 --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json @@ -0,0 +1,1532 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Karpenter and focuses on giving a overview for Karpenter. 
It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling / Karpenter", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Node Pool Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n count(\n karpenter_nodepools_allowed_disruptions{\n job=~\"$job\",\n }\n ) by (nodepool)\n)\n" + } + ], + "title": "Node Pools", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n count(\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n 
capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name)\n)\n" + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"cpu\"\n }\n)\n" + } + ], + "title": "Node Pool CPU Usage", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"memory\"\n }\n)\n" + } + ], + "title": "Node Pool Memory Usage", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"cpu\"\n }\n)\n" + } + ], + "title": "Node Pool CPU Limits", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"memory\"\n }\n)\n" + } + ], + "title": "Node Pool Memory Limits", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 8, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (nodepool) (\n count by (node_name, nodepool) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ nodepool }}" + } + ], + "title": "Nodes by Node Pool", + "type": "piechart" + }, + { + "datasource": { + "type": 
"datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 4 + }, + "id": 9, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (instance_type) (\n count by (node_name, instance_type) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ instance_type }}" + } + ], + "title": "Nodes by Instance Type", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 4 + }, + "id": 10, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (capacity_type) (\n count by (node_name, capacity_type) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ capacity_type }}" + } + ], + "title": "Nodes by Capacity Type", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + 
"placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (region) (\n count by (node_name, region) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ region }}" + } + ], + "title": "Nodes by Region", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 12, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (zone) (\n count by (node_name, zone) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ zone }}" + } + ], + "title": "Nodes by Zone", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 13, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (arch) 
(\n count by (node_name, arch) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ arch }}" + } + ], + "title": "Nodes by Arch", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 14, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count by (os) (\n count by (node_name, os) (\n karpenter_nodes_allocatable{\n job=~\"$job\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n )\n)\n", + "instant": true, + "legendFormat": "{{ os }}" + } + ], + "title": "Nodes by OS", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 15, + "title": "Pod Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 15 + }, + "id": 16, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_total_pod_requests{\n 
job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n) +\nsum(\n karpenter_nodes_total_daemon_requests{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n)\n" + } + ], + "title": "Pod CPU Requests", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 15 + }, + "id": 17, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_total_pod_requests{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n) +\nsum(\n karpenter_nodes_total_daemon_requests{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n)\n" + } + ], + "title": "Pod Memory Requests", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" 
+ } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 15 + }, + "id": 18, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_total_pod_limits{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n) +\nsum(\n karpenter_nodes_total_daemon_limits{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n)\n" + } + ], + "title": "Pod CPU Limits", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 15 + }, + "id": 19, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_total_pod_limits{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n) +\nsum(\n karpenter_nodes_total_daemon_limits{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n)\n" + } + 
], + "title": "Pod Memory Limits", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 20, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_pods_state{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }\n) by (nodepool)\n", + "instant": true, + "legendFormat": "{{ nodepool }}" + } + ], + "title": "Pods by Node Pool", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 21, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_pods_state{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }\n) by (instance_type)\n", + "instant": true, + "legendFormat": "{{ instance_type }}" + } + ], + "title": "Pods by Instance Type", + "type": "piechart" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 22, + "options": { + "legend": { + "asTable": true, + "displayMode": "table", + "placement": "right", + "sortDesc": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie" + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_pods_state{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }\n) by 
(capacity_type)\n", + "instant": true, + "legendFormat": "{{ capacity_type }}" + } + ], + "title": "Pods by Capacity Type", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 23, + "title": "Node Pools", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Usage" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Allocated" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Limit" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Storage Usage" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Storage Limit" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 24, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Node Pool" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"cpu\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_allocatable{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"cpu\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"cpu\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"memory\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodes_allocatable{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"memory\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"memory\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"nodes\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"nodes\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"pods\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n 
job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"pods\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_usage{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"ephemeral_storage\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_nodepools_limit{\n job=~\"$job\",\n nodepool=~\"$nodepool\",\n resource_type=\"ephemeral_storage\"\n }\n) by (job, namespace, nodepool)\n", + "format": "table", + "instant": true + } + ], + "title": "Node Pools", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #E": 6, + "Value #F": 7, + "Value #G": 8, + "Value #H": 9, + "Value #I": 10, + "Value #J": 11, + "Value #K": 12, + "Value #L": 13, + "namespace": 0, + "nodepool": 1 + }, + "renameByName": { + "Value #A": "CPU Usage", + "Value #B": "CPU Allocated", + "Value #C": "CPU Limit", + "Value #D": "Memory Usage", + "Value #E": "Memory Allocated", + "Value #F": "Memory Limit", + "Value #G": "Nodes Count", + "Value #H": "Nodes Limit", + "Value #I": "Max Pods Count", + "Value #J": "Max Pods Limit", + "Value #K": "Storage Usage", + "Value #L": "Storage Limit", + "namespace": "Namespace", + "nodepool": "Node Pool" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 25, + "title": "Nodes", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Utilization" + 
}, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "gauge" + } + }, + { + "id": "max", + "value": 100 + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 33 + }, + { + "color": "red", + "value": 66 + } + ] + } + }, + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Utilization" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "gauge" + } + }, + { + "id": "max", + "value": 100 + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 33 + }, + { + "color": "red", + "value": 66 + } + ] + } + }, + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Instance Memory" + }, + "properties": [ + { + "id": "unit", + "value": "decmbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 26, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "CPU Utilization" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(\n (\n sum(\n karpenter_nodes_total_pod_requests{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, instance_network_bandwidth, region, zone, os, capacity_type, arch)\n +\n sum(\n karpenter_nodes_total_daemon_requests{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n 
instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, instance_network_bandwidth, region, zone, os, capacity_type, arch)\n ) /\n sum(\n karpenter_nodes_allocatable{\n job=~\"$job\",\n resource_type=\"cpu\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, instance_network_bandwidth, region, zone, os, capacity_type, arch)\n) * 100\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "(\n (\n sum(\n karpenter_nodes_total_pod_requests{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, instance_network_bandwidth, region, zone, os, capacity_type, arch)\n +\n sum(\n karpenter_nodes_total_daemon_requests{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, instance_network_bandwidth, region, zone, os, capacity_type, arch)\n ) /\n sum(\n karpenter_nodes_allocatable{\n job=~\"$job\",\n resource_type=\"memory\",\n region=~\"$region\",\n zone=~\"$zone\",\n arch=~\"$arch\",\n os=~\"$os\",\n instance_type=~\"$instance_type\",\n capacity_type=~\"$capacity_type\",\n nodepool=~\"$nodepool\"\n }\n ) by (node_name, nodepool, instance_type, instance_memory, instance_cpu, 
instance_network_bandwidth, region, zone, os, capacity_type, arch)\n) * 100\n", + "format": "table", + "instant": true + } + ], + "title": "Nodes", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 6, + "Value #B": 7, + "instance_cpu": 4, + "instance_memory": 5, + "instance_type": 3, + "namespace": 0, + "node_name": 1, + "nodepool": 2 + }, + "renameByName": { + "Value #A": "CPU Utilization", + "Value #B": "Memory Utilization", + "arch": "Architecture", + "capacity_type": "Capacity Type", + "instance_cpu": "Instance CPU", + "instance_memory": "Instance Memory", + "instance_network_bandwidth": "Instance Network Bandwidth", + "instance_type": "Instance Type", + "namespace": "Namespace", + "node_name": "Node Name", + "nodepool": "Node Pool", + "os": "OS", + "region": "Region", + "zone": "Zone" + } + } + } + ], + "type": "table" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(karpenter_nodes_allocatable{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Region", + "multi": true, + "name": "region", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\"}, region)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Zone", + "multi": true, + "name": "zone", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\"}, zone)", + 
"refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Architecture", + "multi": true, + "name": "arch", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\", zone=~\"$zone\"}, arch)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Operating System", + "multi": true, + "name": "os", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\", zone=~\"$zone\", arch=~\"$arch\"}, os)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Instance Type", + "multi": true, + "name": "instance_type", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\", zone=~\"$zone\", arch=~\"$arch\", os=~\"$os\"}, instance_type)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Capacity Type", + "multi": true, + "name": "capacity_type", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\", zone=~\"$zone\", arch=~\"$arch\", os=~\"$os\", instance_type=~\"$instance_type\"}, capacity_type)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Node Pool", + "multi": true, + "name": "nodepool", + "query": "label_values(karpenter_nodes_allocatable{job=~\"$job\", region=~\"$region\", zone=~\"$zone\", arch=~\"$arch\", os=~\"$os\", instance_type=~\"$instance_type\", capacity_type=~\"$capacity_type\"}, nodepool)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, 
+ "timezone": "utc", + "title": "Kubernetes / Autoscaling / Karpenter / Overview", + "uid": "kubernetes-autoscaling-mixin-kover-jkwq" +} diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json new file mode 100644 index 0000000..74343de --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json @@ -0,0 +1,839 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Karpenter and focuses on Karpenter performance. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling / Karpenter", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "No" + }, + "1": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_cluster_state_synced{\n job=~\"$job\",\n }\n) by (job)\n" + } + ], + "title": "Cluster State Synced", + "type": 
"stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 4 + }, + "id": 3, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_cluster_state_node_count{\n job=~\"$job\",\n }\n) by (job)\n" + } + ], + "title": "Cluster State Node Count", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 18, + "x": 6, + "y": 1 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n increase(\n karpenter_cloudprovider_errors_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n ) by (job, provider, controller, method)\n)\n", + "interval": "1m", + "legendFormat": "{{ provider }} - {{ controller }} - {{ method }}" + } + ], + "title": "Cloud Provider Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": 
"table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P99" + } + ], + "title": "Node Termination Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P95" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n", + "interval": "1m", + "legendFormat": "P99" + } + ], + "title": "Pods Startup Duration", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 7, + "title": "Interruption Queue", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 14 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n karpenter_interruption_received_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, message_type)\n", + "legendFormat": "{{ message_type }}" + } + ], + "title": "Interruption Received Messages", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n 
karpenter_interruption_deleted_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job)\n", + "legendFormat": "Deleted Messages" + } + ], + "title": "Interruption Deleted Messages", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Interruption Duration", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 11, + "title": "Work Queue", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false 
+ }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 21 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n karpenter_workqueue_depth{\n job=~\"$job\"\n }\n) by (job)\n", + "legendFormat": "Queue Depth" + } + ], + "title": "Work Queue Depth", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 21 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) 
by (job, le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Work Queue In Queue Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "spanNulls": false + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 21 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Work Queue Work Duration", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 15, + "title": "Controller", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 100, + "spanNulls": false, + "stacking": { + "mode": "value" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 24, + 
"x": 0, + "y": 28 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n controller_runtime_reconcile_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, controller) > 0\n", + "legendFormat": "{{ controller }}" + } + ], + "title": "Controller Reconcile", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "karpenter" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(karpenter_nodes_allocatable{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Node Pool", + "multi": true, + "name": "nodepool", + "query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Karpenter / Performance", + "uid": "kubernetes-autoscaling-mixin-kperf-jkwq" +} diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json new file mode 100644 index 0000000..12d05cf --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json @@ -0,0 +1,568 @@ +{ + "__inputs": [ ], + 
"__requires": [ ], + "description": "A dashboard that monitors Kubernetes and focuses on giving an overview of pod disruption budgets. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "$namespace Namespace Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Disruptions Allowed" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "sortBy": [ + { + "displayName": "Pod Disruption Budget" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n", + "format": "table", + "instant": true + }, + { +
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Summary", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "namespace": 0, + "poddisruptionbudget": 1 + }, + "renameByName": { + "Value #A": "Disruptions Allowed", + "Value #B": "Desired Healthy", + "Value #C": "Currently Healthy", + "Value #D": "Expected Pods", + "namespace": "Namespace", + "poddisruptionbudget": "Pod Disruption Budget" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "title": "$pdb Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n 
poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n" + } + ], + "title": "Disruptions Allowed", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n" + } + ], + "title": "Desired Healthy", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n" + } + ], + "title": "Currently Healthy", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 0.10000000000000001 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 7, 
+ "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n" + } + ], + "title": "Expected Pods", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Currently Healthy" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disruptions Allowed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Desired Healthy" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Expected Pods" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n", + 
"legendFormat": "Disruptions Allowed" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n", + "legendFormat": "Desired Healthy" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n", + "legendFormat": "Currently Healthy" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n", + "legendFormat": "Expected Pods" + } + ], + "title": "Status", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Pod Disruption Budget", + "name": "pdb", + "query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\", namespace=~\"$namespace\"}, 
poddisruptionbudget)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Pod Disruption Budget", + "uid": "kubernetes-autoscaling-mixin-pdb-jkwq" +} diff --git a/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json new file mode 100644 index 0000000..dce500d --- /dev/null +++ b/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json @@ -0,0 +1,895 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "description": "A dashboard that monitors Kubernetes and focuses on giving an overview of vertical pod autoscalers. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).", + "editable": true, + "links": [ + { + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "targetBlank": true, + "title": "Kubernetes / Autoscaling", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "$namespace Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Lower Bound" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "CPU Target" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + }
+ } + ] + }, + { + "matcher": { + "id": "byName", + "options": "CPU Upper Bound" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Vertical Pod Autoscaler" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "CPU Lower Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "CPU Target" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "CPU Upper Bound" + } + ], + "title": "CPU Resource Recommendations", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + 
"indexByName": { + "Value #A": 4, + "Value #B": 5, + "Value #C": 6, + "container": 2, + "namespace": 0, + "resource": 3, + "verticalpodautoscaler": 1 + }, + "renameByName": { + "Value #A": "CPU Lower Bound", + "Value #B": "CPU Target", + "Value #C": "CPU Upper Bound", + "container": "Container", + "namespace": "Namespace", + "resource": "Resource", + "verticalpodautoscaler": "Vertical Pod Autoscaler" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Lower Bound" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Target" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Upper Bound" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 3, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "displayName": "Vertical Pod Autoscaler" + } + ] + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by 
(job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "Memory Lower Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "Memory Target" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "format": "table", + "instant": true, + "legendFormat": "Memory Upper Bound" + } + ], + "title": "Memory Resource Recommendations", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 4, + "Value #B": 5, + "Value #C": 6, + "container": 2, + "namespace": 0, + "resource": 3, + "verticalpodautoscaler": 1 + }, + "renameByName": { + "Value #A": "Memory Lower Bound", + "Value #B": "Memory Target", + "Value #C": "Memory Upper Bound", + "container": "Container", + "namespace": "Namespace", + "resource": "Resource", + "verticalpodautoscaler": "Vertical Pod Autoscaler" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 4, + "repeat": "container", + "title": "$vpa / $container Summary", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "yellow", + "value": 0 + } + ] + 
}, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 18 + }, + "id": 5, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "CPU Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "CPU Limits" + } + ], + "title": "CPU Guaranteed QoS", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "CPU Limits" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 18 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n 
kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "CPU Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "CPU Limits" + } + ], + "title": "CPU Burstable QoS", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "yellow", + "value": 0 + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 18 + }, + "id": 7, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "Memory Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n 
}\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "Memory Limits" + } + ], + "title": "Memory Guaranteed QoS", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Limits" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 18 + }, + "id": 8, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "Memory Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "Memory Limits" + } + ], + "title": "Memory Burstable QoS", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "short" + } 
+ }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Lower Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Target" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Upper Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n container=\"$container\"\n }\n) by 
(container)\n", + "legendFormat": "{{ container }} - Usage" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n", + "legendFormat": "{{ container }} - Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n", + "legendFormat": "{{ container }} - Limits" + } + ], + "title": "VPA CPU Recommendations Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "spanNulls": false + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Lower Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n 
namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Target" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n", + "legendFormat": "{{ container }} - Upper Bound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n container_memory_working_set_bytes{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa(.+)\",\n container=\"$container\"\n }\n) by (container)\n", + "legendFormat": "{{ container }} - Usage" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n", + "legendFormat": "{{ container }} - Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n", + "legendFormat": "{{ container }} - Limits" + } + ], + "title": "VPA Memory Recommendations Over Time", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "kubernetes", + "autoscaling", + "kubernetes-autoscaling-mixin", + "kubernetes-core" + ], + "templating": { + "list": [ + { + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(kube_customresource_verticalpodautoscaler_labels{}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "VPA Pod Autoscaler", + "name": "vpa", + "query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\", namespace=~\"$namespace\"}, verticalpodautoscaler)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Container", + "multi": true, + "name": "container", + "query": "label_values(kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{job=~\"$job\", namespace=~\"$namespace\", verticalpodautoscaler=~\"$vpa\"}, container)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Kubernetes / Autoscaling / Vertical Pod Autoscaler", + "uid": "kubernetes-autoscaling-mixin-vpa-jkwq" +} diff --git a/assets/kubernetes-autoscaling/rules.yaml b/assets/kubernetes-autoscaling/rules.yaml new file mode 100644 index 0000000..19765bd --- /dev/null +++ b/assets/kubernetes-autoscaling/rules.yaml @@ -0,0 +1 @@ +null diff --git a/mixins.json b/mixins.json index 4aec91c..fcded1e 100644 --- a/mixins.json +++ b/mixins.json @@ -516,6 +516,31 @@ "name": "gitea", "source": "https://github.com/go-gitea/gitea", "subdir": "contrib/gitea-monitoring-mixin" + }, + { + "name": "django", + "source": 
"https://github.com/adinhodovic/django-mixin", + "subdir": "" + }, + { + "name": "celery", + "source": "https://github.com/danihodovic/celery-exporter", + "subdir": "celery-mixin" + }, + { + "name": "argo-cd-2", + "source": "https://github.com/adinhodovic/argo-cd-mixin", + "subdir": "" + }, + { + "name": "ingress-nginx-mixin", + "source": "https://github.com/adinhodovic/ingress-nginx-mixin", + "subdir": "" + }, + { + "name": "kubernetes-autoscaling", + "source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin", + "subdir": "" } ] } diff --git a/site/content/argo-cd-2/_index.md b/site/content/argo-cd-2/_index.md new file mode 100644 index 0000000..1573c19 --- /dev/null +++ b/site/content/argo-cd-2/_index.md @@ -0,0 +1,156 @@ +--- +title: argo-cd-2 +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/alerts.yaml). +{{< /panel >}} + +### argo-cd + +##### ArgoCdAppOutOfSync + +{{< code lang="yaml" >}} +alert: ArgoCdAppOutOfSync +annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name + }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} is out of sync with the sync status {{ $labels.sync_status }} + for the past 15m. + summary: An ArgoCD Application is Out Of Sync. 
+expr: | + sum( + argocd_app_info{ + job=~".*", + sync_status!="Synced" + } + ) by (job, dest_server, project, name, sync_status) + > 0 +for: 15m +labels: + severity: warning +{{< /code >}} + +##### ArgoCdAppUnhealthy + +{{< code lang="yaml" >}} +alert: ArgoCdAppUnhealthy +annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name + }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} is unhealthy with the health status {{ $labels.health_status }} + for the past 15m. + summary: An ArgoCD Application is Unhealthy. +expr: | + sum( + argocd_app_info{ + job=~".*", + health_status!~"Healthy|Progressing" + } + ) by (job, dest_server, project, name, health_status) + > 0 +for: 15m +labels: + severity: warning +{{< /code >}} + +##### ArgoCdAppAutoSyncDisabled + +{{< code lang="yaml" >}} +alert: ArgoCdAppAutoSyncDisabled +annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name + }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} has autosync disabled for the past 2h. + summary: An ArgoCD Application has AutoSync Disabled. 
+expr: | + sum( + argocd_app_info{ + job=~".*", + autosync_enabled!="true", + name!~"" + } + ) by (job, dest_server, project, name, autosync_enabled) + > 0 +for: 2h +labels: + severity: warning +{{< /code >}} + +##### ArgoCdAppSyncFailed + +{{< code lang="yaml" >}} +alert: ArgoCdAppSyncFailed +annotations: + dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{ + $labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name + }} + description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{ + $labels.name }} has failed to sync with the status {{ $labels.phase }} the past + 10m. + summary: An ArgoCD Application has Failed to Sync. +expr: | + sum( + round( + increase( + argocd_app_sync_total{ + job=~".*", + phase!="Succeeded" + }[10m] + ) + ) + ) by (job, dest_server, project, name, phase) > 0 +for: 1m +labels: + severity: warning +{{< /code >}} + +##### ArgoCdNotificationDeliveryFailed + +{{< code lang="yaml" >}} +alert: ArgoCdNotificationDeliveryFailed +annotations: + dashboard_url: https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{ + $labels.job }}&var-exported_service={{ $labels.exported_service }} + description: The notification job {{ $labels.job }} has failed to deliver to {{ + $labels.exported_service }} for the past 10m. + summary: ArgoCD Notification Delivery Failed. 
+expr: | + sum( + round( + increase( + argocd_notifications_deliveries_total{ + job=~".*", + succeeded!="true" + }[10m] + ) + ) + ) by (job, exported_service, succeeded) > 0 +for: 1m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [argo-cd-application-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-application-overview.json) +- [argo-cd-notifications-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json) +- [argo-cd-operational-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json) diff --git a/site/content/celery/_index.md b/site/content/celery/_index.md new file mode 100644 index 0000000..d5c3825 --- /dev/null +++ b/site/content/celery/_index.md @@ -0,0 +1,114 @@ +--- +title: celery +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/danihodovic/celery-exporter](https://github.com/danihodovic/celery-exporter/tree/master/celery-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/celery/alerts.yaml). +{{< /panel >}} + +### celery + +##### CeleryTaskHighFailRate + +{{< code lang="yaml" >}} +alert: CeleryTaskHighFailRate +annotations: + dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{ + $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name + }} + description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name + }}/{{ $labels.name }} the past 10m. + summary: Celery high task fail rate. 
+expr: | + sum( + increase( + celery_task_failed_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + / + ( + sum( + increase( + celery_task_failed_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + + + sum( + increase( + celery_task_succeeded_total{ + job=~".*celery.*", + queue_name!~"None", + name!~"None" + }[10m] + ) + ) by (job, namespace, queue_name, name) + ) + * 100 > 5 +for: 1m +labels: + severity: warning +{{< /code >}} + +##### CeleryHighQueueLength + +{{< code lang="yaml" >}} +alert: CeleryHighQueueLength +annotations: + dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ + $labels.job }}&var-queue_name={{ $labels.queue_name }} + description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name + }} the past 20m. + summary: Celery high queue length. +expr: | + sum( + celery_queue_length{ + job=~".*celery.*", + queue_name!~"None" + } + ) by (job, namespace, queue_name) + > 100 +for: 20m +labels: + severity: warning +{{< /code >}} + +##### CeleryWorkerDown + +{{< code lang="yaml" >}} +alert: CeleryWorkerDown +annotations: + dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ + $labels.job }} + description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline. + summary: A Celery worker is offline. 
+expr: | + celery_worker_up{job=~".*celery.*"} == 0 +for: 15m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [celery-tasks-by-task](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-by-task.json) +- [celery-tasks-overview](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-overview.json) diff --git a/site/content/cilium-enterprise/_index.md b/site/content/cilium-enterprise/_index.md index 452f24c..5fb0be4 100644 --- a/site/content/cilium-enterprise/_index.md +++ b/site/content/cilium-enterprise/_index.md @@ -68,10 +68,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni Following dashboards are generated from mixins and hosted on github: -- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json) -- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json) -- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json) - [cilium-agent](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent.json) +- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json) - [cilium-api](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-api.json) - [cilium-bpf](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-bpf.json) - [cilium-conntrack](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-conntrack.json) @@ -80,6 +78,8 @@ Following dashboards are generated 
from mixins and hosted on github: - [cilium-fqdn-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-fqdn-proxy.json) - [cilium-identities](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-identities.json) - [cilium-kubernetes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-kubernetes.json) +- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json) +- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json) - [cilium-network](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-network.json) - [cilium-nodes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-nodes.json) - [cilium-operator](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-operator.json) diff --git a/site/content/cortex/_index.md b/site/content/cortex/_index.md index bf69a7d..cd90f8a 100644 --- a/site/content/cortex/_index.md +++ b/site/content/cortex/_index.md @@ -2491,8 +2491,8 @@ Following dashboards are generated from mixins and hosted on github: - [alertmanager](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/alertmanager.json) -- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json) - [cortex-compactor](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor.json) +- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json) - 
[cortex-config](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-config.json) - [cortex-object-store](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-object-store.json) - [cortex-queries](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-queries.json) diff --git a/site/content/django/_index.md b/site/content/django/_index.md new file mode 100644 index 0000000..e3197b8 --- /dev/null +++ b/site/content/django/_index.md @@ -0,0 +1,141 @@ +--- +title: django +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/django-mixin](https://github.com/adinhodovic/django-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/django/alerts.yaml). +{{< /panel >}} + +### django + +##### DjangoMigrationsUnapplied + +{{< code lang="yaml" >}} +alert: DjangoMigrationsUnapplied +annotations: + dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }} + description: The job {{ $labels.job }} has unapplied migrations. + summary: Django has unapplied migrations. +expr: | + sum( + django_migrations_unapplied_total{ + job=~"django" + } + ) by (namespace, job) + > 0 +for: 15m +labels: + severity: warning +{{< /code >}} + +##### DjangoDatabaseException + +{{< code lang="yaml" >}} +alert: DjangoDatabaseException +annotations: + dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }} + description: The job {{ $labels.job }} has hit the database exception {{ $labels.type + }}. + summary: Django database exception. 
+expr: | + sum ( + increase( + django_db_errors_total{ + job=~"django" + }[10m] + ) + ) by (type, namespace, job) + > 0 +labels: + severity: info +{{< /code >}} + +##### DjangoHighHttp4xxErrorRate + +{{< code lang="yaml" >}} +alert: DjangoHighHttp4xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }} + description: More than 5% HTTP requests with status 4xx for {{ $labels.job }}/{{ + $labels.view }} the past 5m. + summary: Django high HTTP 4xx error rate. +expr: | + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + status=~"^4.*", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + / + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + * 100 > 5 +for: 1m +labels: + severity: warning +{{< /code >}} + +##### DjangoHighHttp5xxErrorRate + +{{< code lang="yaml" >}} +alert: DjangoHighHttp5xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{ + $labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }} + description: More than 5% HTTP requests with status 5xx for {{ $labels.job }}/{{ + $labels.view }} the past 5m. + summary: Django high HTTP 5xx error rate. 
+expr: | + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + status=~"^5.*", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + / + sum( + rate( + django_http_responses_total_by_status_view_method_total{ + job=~"django", + view!~"|health_check:health_check_home|prometheus-django-metrics" + }[5m] + ) + ) by (namespace, job, view) + * 100 > 5 +for: 1m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [django-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-overview.json) +- [django-requests-by-view](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-by-view.json) +- [django-requests-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-overview.json) diff --git a/site/content/ingress-nginx-mixin/_index.md b/site/content/ingress-nginx-mixin/_index.md new file mode 100644 index 0000000..128811a --- /dev/null +++ b/site/content/ingress-nginx-mixin/_index.md @@ -0,0 +1,80 @@ +--- +title: ingress-nginx-mixin +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/ingress-nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/alerts.yaml). 
+{{< /panel >}} + +### nginx.rules + +##### NginxConfigReloadFailed + +{{< code lang="yaml" >}} +alert: NginxConfigReloadFailed +annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{ + $labels.job }}&var-controller_class={{ $labels.controller_class }} + description: Nginx config reload failed for the controller with the class {{ $labels.controller_class + }}. + summary: Nginx config reload failed. +expr: | + sum( + nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"} + ) by (job, controller_class) + == 0 +for: 5m +labels: + severity: warning +{{< /code >}} + +##### NginxHighHttp4xxErrorRate + +{{< code lang="yaml" >}} +alert: NginxHighHttp4xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ + $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} + description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace + }}/{{ $labels.ingress }} the past 5m. + summary: Nginx high HTTP 4xx error rate. +expr: | + (sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5 +for: 1m +labels: + severity: info +{{< /code >}} + +##### NginxHighHttp5xxErrorRate + +{{< code lang="yaml" >}} +alert: NginxHighHttp5xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{ + $labels.exported_namespace }}&var-ingress={{ $labels.ingress }} + description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace + }}/{{ $labels.ingress }} the past 5m. + summary: Nginx high HTTP 5xx error rate. 
+expr: | + (sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5 +for: 1m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [ingress-nginx-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json) +- [ingress-nginx-request-handling-performance](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json) diff --git a/site/content/kubernetes-autoscaling/_index.md b/site/content/kubernetes-autoscaling/_index.md new file mode 100644 index 0000000..83e0dca --- /dev/null +++ b/site/content/kubernetes-autoscaling/_index.md @@ -0,0 +1,120 @@ +--- +title: kubernetes-autoscaling +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/alerts.yaml). +{{< /panel >}} + +### karpenter + +##### KarpenterCloudProviderErrors + +{{< code lang="yaml" >}} +alert: KarpenterCloudProviderErrors +annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance + description: The Karpenter provider {{ $labels.provider }} with the controller {{ + $labels.controller }} has errors with the method {{ $labels.method }}. + summary: Karpenter has Cloud Provider Errors. 
+expr: | + sum( + increase( + karpenter_cloudprovider_errors_total{ + job=~"karpenter" + }[5m] + ) + ) by (namespace, job, provider, controller, method) > 0 +for: 5m +labels: + severity: warning +{{< /code >}} + +##### KarpenterNodepoolNearCapacity + +{{< code lang="yaml" >}} +alert: KarpenterNodepoolNearCapacity +annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview + description: The resource {{ $labels.resource_type }} in the Karpenter node pool + {{ $labels.nodepool }} is nearing its limit. Consider scaling or adding resources. + summary: Karpenter Nodepool near capacity. +expr: | + sum ( + karpenter_nodepools_usage{job=~"karpenter"} + ) by (namespace, job, nodepool, resource_type) + / + sum ( + karpenter_nodepools_limit{job=~"karpenter"} + ) by (namespace, job, nodepool, resource_type) + * 100 > 75 +for: 15m +labels: + severity: warning +{{< /code >}} + +### cluster-autoscaler + +##### ClusterAutoscalerNodeCountNearCapacity + +{{< code lang="yaml" >}} +alert: ClusterAutoscalerNodeCountNearCapacity +annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler + description: The node count for the cluster autoscaler job {{ $labels.job }} is + reaching max limit. Consider scaling node groups. + summary: Cluster Autoscaler Node Count near Capacity. 
+expr: | + sum ( + cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + / + sum ( + cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + * 100 > 75 +for: 15m +labels: + severity: warning +{{< /code >}} + +##### ClusterAutoscalerUnschedulablePods + +{{< code lang="yaml" >}} +alert: ClusterAutoscalerUnschedulablePods +annotations: + dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler + description: The cluster currently has unschedulable pods, indicating resource shortages. + Consider adding more nodes or increasing node group capacity. + summary: Pods Pending Scheduling - Cluster Node Group Scaling Required +expr: | + sum ( + cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"} + ) by (namespace, job) + > 0 +for: 15m +labels: + severity: warning +{{< /code >}} + +## Dashboards +The following dashboards are generated from mixins and hosted on GitHub: + + +- [kubernetes-autoscaling-mixin-ca](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json) +- [kubernetes-autoscaling-mixin-hpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json) +- [kubernetes-autoscaling-mixin-karpenter-act](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json) +- [kubernetes-autoscaling-mixin-karpenter-over](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json) +- [kubernetes-autoscaling-mixin-karpenter-perf](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json) +-
[kubernetes-autoscaling-mixin-pdb](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json) +- [kubernetes-autoscaling-mixin-vpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json) diff --git a/site/content/loki/_index.md b/site/content/loki/_index.md index e4f414f..84234a7 100644 --- a/site/content/loki/_index.md +++ b/site/content/loki/_index.md @@ -281,8 +281,8 @@ Following dashboards are generated from mixins and hosted on github: - [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json) - [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json) - [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json) -- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json) - [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json) +- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json) - [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json) -- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json) - [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json) +- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json) diff --git a/site/content/prometheus/_index.md b/site/content/prometheus/_index.md index 00d41fb..2c9a368 100644 --- 
a/site/content/prometheus/_index.md +++ b/site/content/prometheus/_index.md @@ -433,5 +433,5 @@ labels: Following dashboards are generated from mixins and hosted on github: -- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json) - [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json) +- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json) diff --git a/site/static/mixins.json b/site/static/mixins.json index 4aec91c..fcded1e 100644 --- a/site/static/mixins.json +++ b/site/static/mixins.json @@ -516,6 +516,31 @@ "name": "gitea", "source": "https://github.com/go-gitea/gitea", "subdir": "contrib/gitea-monitoring-mixin" + }, + { + "name": "django", + "source": "https://github.com/adinhodovic/django-mixin", + "subdir": "" + }, + { + "name": "celery", + "source": "https://github.com/danihodovic/celery-exporter", + "subdir": "celery-mixin" + }, + { + "name": "argo-cd-2", + "source": "https://github.com/adinhodovic/argo-cd-mixin", + "subdir": "" + }, + { + "name": "ingress-nginx-mixin", + "source": "https://github.com/adinhodovic/ingress-nginx-mixin", + "subdir": "" + }, + { + "name": "kubernetes-autoscaling", + "source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin", + "subdir": "" } ] }