mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
feat: Add additional mixins
https://github.com/danihodovic/celery-exporter/tree/master/celery-mixin https://github.com/adinhodovic/argo-cd-mixin https://github.com/adinhodovic/kubernetes-autoscaling-mixin https://github.com/adinhodovic/django-mixin https://github.com/adinhodovic/ingress-nginx-mixin
This commit is contained in:
parent
526200ba6e
commit
a672deef1c
38 changed files with 14343 additions and 7 deletions
107
assets/argo-cd-2/alerts.yaml
Normal file
107
assets/argo-cd-2/alerts.yaml
Normal file
|
@ -0,0 +1,107 @@
|
|||
groups:
|
||||
- name: argo-cd
|
||||
rules:
|
||||
- alert: ArgoCdAppOutOfSync
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{
|
||||
$labels.name }}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} is out of sync with the sync status {{ $labels.sync_status
|
||||
}} for the past 15m.
|
||||
summary: An ArgoCD Application is Out Of Sync.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
sync_status!="Synced"
|
||||
}
|
||||
) by (job, dest_server, project, name, sync_status)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ArgoCdAppUnhealthy
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{
|
||||
$labels.name }}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} is unhealthy with the health status {{ $labels.health_status
|
||||
}} for the past 15m.
|
||||
summary: An ArgoCD Application is Unhealthy.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
health_status!~"Healthy|Progressing"
|
||||
}
|
||||
) by (job, dest_server, project, name, health_status)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ArgoCdAppAutoSyncDisabled
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{
|
||||
$labels.name }}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} has autosync disabled for the past 2h.
|
||||
summary: An ArgoCD Application has AutoSync Disabled.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
autosync_enabled!="true",
|
||||
name!~""
|
||||
}
|
||||
) by (job, dest_server, project, name, autosync_enabled)
|
||||
> 0
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ArgoCdAppSyncFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{
|
||||
$labels.name }}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} has failed to sync with the status {{ $labels.phase }} for the
|
||||
past 10m.
|
||||
summary: An ArgoCD Application has Failed to Sync.
|
||||
expr: |
|
||||
sum(
|
||||
round(
|
||||
increase(
|
||||
argocd_app_sync_total{
|
||||
job=~".*",
|
||||
phase!="Succeeded"
|
||||
}[10m]
|
||||
)
|
||||
)
|
||||
) by (job, dest_server, project, name, phase) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ArgoCdNotificationDeliveryFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{
|
||||
$labels.job }}&var-exported_service={{ $labels.exported_service }}
|
||||
description: The notification job {{ $labels.job }} has failed to deliver to
|
||||
{{ $labels.exported_service }} for the past 10m.
|
||||
summary: ArgoCD Notification Delivery Failed.
|
||||
expr: |
|
||||
sum(
|
||||
round(
|
||||
increase(
|
||||
argocd_notifications_deliveries_total{
|
||||
job=~".*",
|
||||
succeeded!="true"
|
||||
}[10m]
|
||||
)
|
||||
)
|
||||
) by (job, exported_service, succeeded) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
945
assets/argo-cd-2/dashboards/argo-cd-application-overview.json
Normal file
945
assets/argo-cd-2/dashboards/argo-cd-application-overview.json
Normal file
|
@ -0,0 +1,945 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors ArgoCD with a focus on Application status. It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin). Requires custom configuration to add application badges. Please refer to the mixin.",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"ci/cd",
|
||||
"argo-cd"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "ArgoCD Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary by Cluster, Project",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 9,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, health_status)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }} - {{ health_status }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Health Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 9,
|
||||
"x": 9,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, sync_status)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }} - {{ sync_status }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Sync Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 9,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }[$__rate_interval]\n )\n )\n) by (job, dest_server, project, phase)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }} - {{ phase }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Syncs",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 9,
|
||||
"x": 9,
|
||||
"y": 6
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n }\n) by (job, dest_server, project, autosync_enabled)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }} - {{ autosync_enabled }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Auto Sync Enabled",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 1
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"content": "No applications defined",
|
||||
"mode": "markdown"
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"title": "Application Badges",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 18,
|
||||
"x": 0,
|
||||
"y": 11
|
||||
},
|
||||
"id": 7,
|
||||
"title": "Applications (Unhealthy/OutOfSync/AutoSyncDisabled) Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "name"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Application",
|
||||
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "health_status"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Application"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n health_status!~\"Healthy|Progressing\"\n }\n) by (job, dest_server, project, name, health_status)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Applications Unhealthy",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"dest_server": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"health_status": 2,
|
||||
"name": 0,
|
||||
"project": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"dest_server": "Cluster",
|
||||
"health_status": "Health Status",
|
||||
"job": "Job",
|
||||
"name": "Application",
|
||||
"project": "Project"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "name"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Application",
|
||||
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "sync_status"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Application"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n sync_status!=\"Synced\"\n }\n) by (job, dest_server, project, name, sync_status) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Applications Out Of Sync",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"dest_server": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"name": 0,
|
||||
"project": 1,
|
||||
"sync_status": 2
|
||||
},
|
||||
"renameByName": {
|
||||
"dest_server": "Cluster",
|
||||
"job": "Job",
|
||||
"name": "Application",
|
||||
"project": "Project",
|
||||
"sync_status": "Sync Status"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "name"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Application",
|
||||
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Value"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Application"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n phase!=\"Succeeded\"\n }[7d]\n )\n )\n) by (job, dest_server, project, name, phase) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Applications That Failed to Sync[7d]",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"dest_server": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"name": 0,
|
||||
"phase": 2,
|
||||
"project": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"Value": "Count",
|
||||
"dest_server": "Cluster",
|
||||
"job": "Job",
|
||||
"name": "Application",
|
||||
"phase": "Phase",
|
||||
"project": "Project"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "name"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Application",
|
||||
"url": "https://argocd.com/applications/${__data.fields.Project}/${__value.raw}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "autosync_enabled"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Application"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n autosync_enabled!=\"true\"\n }\n) by (job, dest_server, project, name, autosync_enabled) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Applications With Auto Sync Disabled",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Value": true,
|
||||
"dest_server": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"autosync_enabled": 2,
|
||||
"name": 0,
|
||||
"project": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"autosync_enabled": "Auto Sync Enabled",
|
||||
"dest_server": "Cluster",
|
||||
"job": "Job",
|
||||
"name": "Application",
|
||||
"project": "Project"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"id": 12,
|
||||
"title": "Application ($application)",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 13,
|
||||
"interval": "5m",
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, health_status)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ health_status }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Health Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 24
|
||||
},
|
||||
"id": 14,
|
||||
"interval": "5m",
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n argocd_app_info{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }\n) by (namespace, job, dest_server, project, name, sync_status)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ sync_status }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Sync Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 24
|
||||
},
|
||||
"id": 15,
|
||||
"interval": "5m",
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n round(\n increase(\n argocd_app_sync_total{\n namespace=~'$namespace',\njob=~'$job',\ndest_server=~'$cluster',\nproject=~'$project',\n\n name=~\"$application\",\n }[$__rate_interval]\n )\n )\n) by (namespace, job, dest_server, project, name, phase)\n",
|
||||
"legendFormat": "{{ dest_server }}/{{ project }}/{{ name }} - {{ phase }}"
|
||||
}
|
||||
],
|
||||
"title": "Application Sync Result",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"ci/cd",
|
||||
"argo-cd"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(argocd_app_info{}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(argocd_app_info{namespace=~\"$namespace\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Cluster",
|
||||
"multi": true,
|
||||
"name": "cluster",
|
||||
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\"}, dest_server)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Project",
|
||||
"multi": true,
|
||||
"name": "project",
|
||||
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\"}, project)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Application",
|
||||
"multi": true,
|
||||
"name": "application",
|
||||
"query": "label_values(argocd_app_info{namespace=~\"$namespace\", job=~\"$job\", dest_server=~\"$cluster\", project=~\"$project\"}, name)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "ArgoCD / Application / Overview",
|
||||
"uid": "argo-cd-application-overview-kask"
|
||||
}
|
198
assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json
Normal file
198
assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json
Normal file
|
@ -0,0 +1,198 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors ArgoCD notifications. It is created using the [argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"ci/cd",
|
||||
"argo-cd"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "ArgoCD Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n round(\n increase(\n argocd_notifications_deliveries_total{\n namespace=~'$namespace',\njob=~'$job',\n\n exported_service=~\"$exported_service\",\n }[$__rate_interval]\n )\n )\n) by (job, exported_service, succeeded)\n",
|
||||
"legendFormat": "{{ exported_service }} - Succeeded: {{ succeeded }}"
|
||||
}
|
||||
],
|
||||
"title": "Notification Deliveries",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"last",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n round(\n increase(\n argocd_notifications_trigger_eval_total{\n namespace=~'$namespace',\njob=~'$job',\n\n }[$__rate_interval]\n )\n )\n) by (job, name, triggered)\n",
|
||||
"legendFormat": "{{ name }} - Triggered: {{ triggered }}"
|
||||
}
|
||||
],
|
||||
"title": "Trigger Evaluations",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"ci/cd",
|
||||
"argo-cd"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(argocd_notifications_deliveries_total{}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Notifications Service",
|
||||
"multi": true,
|
||||
"name": "exported_service",
|
||||
"query": "label_values(argocd_notifications_deliveries_total{namespace=~\"$namespace\", job=~\"$job\"}, exported_service)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-2d",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "ArgoCD / Notifications / Overview",
|
||||
"uid": "argo-cd-notifications-overview-kask"
|
||||
}
|
1139
assets/argo-cd-2/dashboards/argo-cd-operational-overview.json
Normal file
1139
assets/argo-cd-2/dashboards/argo-cd-operational-overview.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/argo-cd-2/rules.yaml
Normal file
1
assets/argo-cd-2/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
76
assets/celery/alerts.yaml
Normal file
76
assets/celery/alerts.yaml
Normal file
|
@ -0,0 +1,76 @@
|
|||
groups:
|
||||
- name: celery
|
||||
rules:
|
||||
- alert: CeleryTaskHighFailRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{
|
||||
$labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name
|
||||
}}
|
||||
description: More than 5% of tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name
|
||||
}}/{{ $labels.name }} in the past 10m.
|
||||
summary: Celery high task fail rate.
|
||||
expr: |
|
||||
sum(
|
||||
increase(
|
||||
celery_task_failed_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
/
|
||||
(
|
||||
sum(
|
||||
increase(
|
||||
celery_task_failed_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
+
|
||||
sum(
|
||||
increase(
|
||||
celery_task_succeeded_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CeleryHighQueueLength
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
|
||||
$labels.job }}&var-queue_name={{ $labels.queue_name }}
|
||||
description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name
|
||||
}} for the past 20m.
|
||||
summary: Celery high queue length.
|
||||
expr: |
|
||||
sum(
|
||||
celery_queue_length{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None"
|
||||
}
|
||||
) by (job, namespace, queue_name)
|
||||
> 100
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CeleryWorkerDown
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
|
||||
$labels.job }}
|
||||
description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.
|
||||
summary: A Celery worker is offline.
|
||||
expr: |
|
||||
celery_worker_up{job=~".*celery.*"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
590
assets/celery/dashboards/celery-tasks-by-task.json
Normal file
590
assets/celery/dashboards/celery-tasks-by-task.json
Normal file
|
@ -0,0 +1,590 @@
|
|||
{
|
||||
"description": "A dashboard that monitors Celery. It is created using the Celery-mixin for the [Celery-exporter](https://github.com/danihodovic/celery-exporter)",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"celery",
|
||||
"celery-mixin"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Celery Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Tasks",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"noValue": 0,
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Success Rate"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "percentunit"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 16,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
"displayName": "Succeeded"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n/(sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n+sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n) > -1\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Task Stats",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 1,
|
||||
"Value #B": 2,
|
||||
"Value #C": 3,
|
||||
"Value #D": 4,
|
||||
"Value #E": 5,
|
||||
"Value #F": 6,
|
||||
"Value #G": 7,
|
||||
"Value #H": 8,
|
||||
"name": 0
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "Success Rate",
|
||||
"Value #B": "Succeeded",
|
||||
"Value #C": "Failed",
|
||||
"Value #D": "Sent",
|
||||
"Value #E": "Received",
|
||||
"Value #F": "Rejected",
|
||||
"Value #G": "Retried",
|
||||
"Value #H": "Revoked",
|
||||
"name": "Name"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
"displayName": "Value"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n ) by (name, exception) > 0\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Task Exceptions",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value": 2,
|
||||
"exception": 1,
|
||||
"name": 0
|
||||
},
|
||||
"renameByName": {
|
||||
"exception": "Exception",
|
||||
"name": "Task"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Succeeded - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Failed - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Sent - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Received - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Retried - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Revoked - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n",
|
||||
"legendFormat": "Rejected - {{ name }}"
|
||||
}
|
||||
],
|
||||
"title": "Tasks Completed",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 17
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name, exception) > 0\n",
|
||||
"legendFormat": "{{ name }}/{{ exception }}"
|
||||
}
|
||||
],
|
||||
"title": "Task Exceptions",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "P50"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "P95"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "P99"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
|
||||
"legendFormat": "P50 - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
|
||||
"legendFormat": "P95 - {{ name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n",
|
||||
"legendFormat": "P99 - {{ name }}"
|
||||
}
|
||||
],
|
||||
"title": "Tasks Runtime",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"celery",
|
||||
"celery-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"query": "label_values(celery_worker_up{}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"query": "label_values(celery_worker_up{namespace=\"$namespace\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Queue Name",
|
||||
"multi": false,
|
||||
"name": "queue_name",
|
||||
"query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", name!~\"None\"}, queue_name)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Task",
|
||||
"multi": true,
|
||||
"name": "task",
|
||||
"query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", queue_name=~\"$queue_name\", name!~\"None\"}, name)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-2d",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Celery / Tasks / By Task",
|
||||
"uid": "celery-tasks-by-task-32s3"
|
||||
}
|
1008
assets/celery/dashboards/celery-tasks-overview.json
Normal file
1008
assets/celery/dashboards/celery-tasks-overview.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/celery/rules.yaml
Normal file
1
assets/celery/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
97
assets/django/alerts.yaml
Normal file
97
assets/django/alerts.yaml
Normal file
|
@ -0,0 +1,97 @@
|
|||
groups:
|
||||
- name: django
|
||||
rules:
|
||||
- alert: DjangoMigrationsUnapplied
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}
|
||||
description: The job {{ $labels.job }} has unapplied migrations.
|
||||
summary: Django has unapplied migrations.
|
||||
expr: |
|
||||
sum(
|
||||
django_migrations_unapplied_total{
|
||||
job=~"django"
|
||||
}
|
||||
) by (namespace, job)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: DjangoDatabaseException
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}
|
||||
description: The job {{ $labels.job }} has hit the database exception {{ $labels.type
|
||||
}}.
|
||||
summary: Django database exception.
|
||||
expr: |
|
||||
sum (
|
||||
increase(
|
||||
django_db_errors_total{
|
||||
job=~"django"
|
||||
}[10m]
|
||||
)
|
||||
) by (type, namespace, job)
|
||||
> 0
|
||||
labels:
|
||||
severity: info
|
||||
- alert: DjangoHighHttp4xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
|
||||
description: More than 5% of HTTP requests had status 4xx for {{ $labels.job }}/{{
|
||||
$labels.view }} in the past 5m.
|
||||
summary: Django high HTTP 4xx error rate.
|
||||
expr: |
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
status=~"^4.*",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: DjangoHighHttp5xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
|
||||
description: More than 5% of HTTP requests had status 5xx for {{ $labels.job }}/{{
|
||||
$labels.view }} in the past 5m.
|
||||
summary: Django high HTTP 5xx error rate.
|
||||
expr: |
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
status=~"^5.*",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
741
assets/django/dashboards/django-overview.json
Normal file
741
assets/django/dashboards/django-overview.json
Normal file
|
@ -0,0 +1,741 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Django which focuses on giving an overview for the system (requests, db, cache). It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"django",
|
||||
"django-mixin"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Django Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n }[$__rate_interval]\n )\n ), 0.001\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Request Volume",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ops"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n rate (\n django_db_execute_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n )\n) by (namespace, job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Database Ops",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n rate (\n django_cache_get_hits_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n/\nsum (\n rate (\n django_cache_get_total {\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[30m]\n )\n) by (namespace, job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Cache Hitrate [30m]",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "percent"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "2xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "3xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "blue",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "4xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "5xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "2xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "3xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "4xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "5xx"
|
||||
}
|
||||
],
|
||||
"title": "Responses",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 11
|
||||
},
|
||||
"id": 6,
|
||||
"title": "Database",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max (\n django_migrations_applied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Migrations Applied",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max (\n django_migrations_unapplied_total {\n namespace=\"$namespace\",\n job=~\"$job\"\n }\n) by (namespace, job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Migrations Unapplied",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Type"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n topk(10,\n sum by (type) (\n increase(\n django_db_errors_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[1w]\n ) > 0\n )\n )\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Top Database Errors (1w)",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true
|
||||
},
|
||||
"indexByName": {
|
||||
"job": 1,
|
||||
"namespace": 0,
|
||||
"type": 2
|
||||
},
|
||||
"renameByName": {
|
||||
"job": "Job",
|
||||
"namespace": "Namespace",
|
||||
"type": "Type"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n django_db_new_connections_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, vendor)\n)\n",
|
||||
"legendFormat": "{{ vendor }}"
|
||||
}
|
||||
],
|
||||
"title": "Database Connections",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
|
||||
"legendFormat": "50 - {{ vendor }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
|
||||
"legendFormat": "95 - {{ vendor }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
|
||||
"legendFormat": "99 - {{ vendor }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.999,\n sum(\n irate(\n django_db_query_duration_seconds_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n ) by (vendor, namespace, job, le)\n)\n",
|
||||
"legendFormat": "99.9 - {{ vendor }}"
|
||||
}
|
||||
],
|
||||
"title": "Database Latency",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 12,
|
||||
"title": "Cache",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "percent"
|
||||
}
|
||||
},
|
||||
"unit": "ops"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
},
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n django_cache_get_hits_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n",
|
||||
"legendFormat": "Hit - {{ backend }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n django_cache_get_misses_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n }[$__rate_interval]\n ) > 0\n) by (namespace, job, backend)\n",
|
||||
"legendFormat": "Miss - {{ backend }}"
|
||||
}
|
||||
],
|
||||
"title": "Cache Get",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"django",
|
||||
"django-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Django / Overview",
|
||||
"uid": "django-overview-jkwq"
|
||||
}
|
673
assets/django/dashboards/django-requests-by-view.json
Normal file
673
assets/django/dashboards/django-requests-by-view.json
Normal file
|
@ -0,0 +1,673 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Django which focuses on breaking down requests by view. It is created using the [Django-mixin](https://github.com/adinhodovic/django-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"django",
|
||||
"django-mixin"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Django Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0.90000000000000002
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 0.94999999999999996
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.98999999999999999
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n status!~\"[4-5].*\"\n }[1w]\n )\n) /\nsum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[1w]\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Success Rate (non 4xx-5xx responses) [1w]",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 100
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum by (view) (\n increase(\n django_http_exceptions_total_by_view_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n }[1w]\n ) > 0\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "HTTP Exceptions [1w]",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1000
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 2000
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Average Request Latency (P50) [1w]",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2500
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5000
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 1
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum (\n rate (\n django_http_requests_latency_seconds_by_view_method_bucket {\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__range]\n )\n ) by (job, le)\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Average Request Latency (P95) [1w]",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"id": 6,
|
||||
"title": "Request & Responses",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_requests_total_by_view_transport_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\"\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "reqps"
|
||||
}
|
||||
],
|
||||
"title": "Requests",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "percent"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "2xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "3xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "blue",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "4xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "5xx"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"2.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "2xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"3.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "3xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"4.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "4xx"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n status=~\"5.*\",\n }[$__rate_interval]\n ) > 0\n ) by (job), 0.001\n)\n",
|
||||
"legendFormat": "5xx"
|
||||
}
|
||||
],
|
||||
"title": "Responses",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"id": 9,
|
||||
"title": "Latency & Status Codes",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n rate(\n django_http_responses_total_by_status_view_method_total{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\",\n }[$__rate_interval]\n ) > 0\n ) by (namespace, job, view, status, method), 0.001\n)\n",
|
||||
"legendFormat": "{{ view }} / {{ status }} / {{ method }}"
|
||||
}
|
||||
],
|
||||
"title": "Responses Status Codes",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
|
||||
"legendFormat": "50 - {{ view }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
|
||||
"legendFormat": "95 - {{ view }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
|
||||
"legendFormat": "99 - {{ view }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.999,\n sum(\n irate(\n django_http_requests_latency_seconds_by_view_method_bucket{\n namespace=~\"$namespace\",\n job=~\"$job\",\n view=\"$view\",\n method=~\"$method\"\n }[$__rate_interval]\n ) > 0\n ) by (view, le)\n)\n",
|
||||
"legendFormat": "99.9 - {{ view }}"
|
||||
}
|
||||
],
|
||||
"title": "Request Latency",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"django",
|
||||
"django-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "View",
|
||||
"multi": false,
|
||||
"name": "view",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view!~\"<unnamed view>|health_check:health_check_home|prometheus-django-metrics\"}, view)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Method",
|
||||
"multi": true,
|
||||
"name": "method",
|
||||
"query": "label_values(django_http_responses_total_by_status_view_method_total{namespace=~\"$namespace\", job=~\"$job\", view=~\"$view\"}, method)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Django / Requests / By View",
|
||||
"uid": "django-requests-by-view-jkwq"
|
||||
}
|
1109
assets/django/dashboards/django-requests-overview.json
Normal file
1109
assets/django/dashboards/django-requests-overview.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/django/rules.yaml
Normal file
1
assets/django/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
42
assets/ingress-nginx-mixin/alerts.yaml
Normal file
42
assets/ingress-nginx-mixin/alerts.yaml
Normal file
|
@ -0,0 +1,42 @@
|
|||
groups:
|
||||
- name: nginx.rules
|
||||
rules:
|
||||
- alert: NginxConfigReloadFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{
|
||||
$labels.job }}&var-controller_class={{ $labels.controller_class }}
|
||||
description: Nginx config reload failed for the controller with the class {{
|
||||
$labels.controller_class }}.
|
||||
summary: Nginx config reload failed.
|
||||
expr: |
|
||||
sum(
|
||||
nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"}
|
||||
) by (job, controller_class)
|
||||
== 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
|
||||
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
|
||||
description: More than 5% of HTTP requests with status 4xx for {{ $labels.exported_namespace
|
||||
}}/{{ $labels.ingress }} in the past 5m.
|
||||
summary: Nginx high HTTP 4xx error rate.
|
||||
expr: |
|
||||
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: NginxHighHttp5xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
|
||||
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
|
||||
description: More than 5% of HTTP requests with status 5xx for {{ $labels.exported_namespace
|
||||
}}/{{ $labels.ingress }} in the past 5m.
|
||||
summary: Nginx high HTTP 5xx error rate.
|
||||
expr: |
|
||||
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
|
@ -0,0 +1,812 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Ingress-nginx. It is created using the [Ingress-Nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"ingress-nginx",
|
||||
"ingress-nginx-mixin"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Ingress Nginx Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Controller",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n namespace=~\"$namespace\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\"\n }[$__rate_interval]\n )\n ), 0.001\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Controller Request Volume",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n avg_over_time(\n nginx_ingress_controller_nginx_process_connections{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Controller Connections",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 0.94999999999999996
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.98999999999999999
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n exported_namespace=~\"$exported_namespace\",\n namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Controller Success Rate (non $error_codes-xx responses)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 18,
|
||||
"y": 1
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "avg(\n irate(\n nginx_ingress_controller_success{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\"\n }[$__rate_interval]\n )\n) * 60\n"
|
||||
}
|
||||
],
|
||||
"title": "Config Reloads",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bool"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 21,
|
||||
"y": 1
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "count(\n nginx_ingress_controller_config_last_reload_successful{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_namespace=~\"$namespace\"\n } == 0\n) OR vector(0)\n"
|
||||
}
|
||||
],
|
||||
"title": "Last Config Failed",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"id": 7,
|
||||
"title": "Ingress",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n irate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n ingress=~\"$ingress\",\n exported_namespace=~\"$exported_namespace\"\n }[$__rate_interval]\n )\n ) by (ingress, exported_namespace), 0.001\n)\n",
|
||||
"legendFormat": "{{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Ingress Request Volume",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "percentunit"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status!~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by (ingress, exported_namespace)\n/\nsum(\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (ingress, exported_namespace)\n",
|
||||
"legendFormat": "{{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Ingress Success Rate (non $error_codes-xx responses)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "dtdurations"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Ingress"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Ingress",
|
||||
"type": "dashboard",
|
||||
"url": "/d/ingress-nginx-request-handling-jqkw/ingress-nginx-overview?var-exported_namespace=${__data.fields.Namespace}&var-ingress=${__data.fields.Ingress}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "IN"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "binBps"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "OUT"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "binBps"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
"displayName": "P50 Latency"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.50, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.90, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.99, sum(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n ) by (le, job, ingress, exported_namespace)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n irate(\n nginx_ingress_controller_request_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n irate(\n nginx_ingress_controller_response_size_sum{\n job=~\"$job\",\n ingress!=\"\",\n controller_pod=~\"$controller\",\n controller_class=~\"$controller_class\",\n controller_namespace=~\"$namespace\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (job, ingress, exported_namespace)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Ingress Percentile Response Times and Transfer Rates",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 2,
|
||||
"Value #B": 3,
|
||||
"Value #C": 4,
|
||||
"Value #D": 5,
|
||||
"Value #E": 6,
|
||||
"exported_namespace": 0,
|
||||
"ingress": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "P50 Latency",
|
||||
"Value #B": "P90 Latency",
|
||||
"Value #C": "P99 Latency",
|
||||
"Value #D": "IN",
|
||||
"Value #E": "OUT",
|
||||
"exported_namespace": "Namespace",
|
||||
"ingress": "Ingress",
|
||||
"job": "Job"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 11,
|
||||
"title": "Certificates",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Host"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "links",
|
||||
"value": [
|
||||
{
|
||||
"targetBlank": true,
|
||||
"title": "Go To Site",
|
||||
"type": "link",
|
||||
"url": "https://${__data.fields.Host}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "TTL"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"type": "color-text"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1814400
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": false,
|
||||
"displayName": "TTL"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "avg(\n nginx_ingress_controller_ssl_expire_time_seconds{\n job=~\"$job\",\n pod=~\"$controller\"\n }\n) by (host) - time()\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Ingress Certificate Expiry",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value": 1,
|
||||
"host": 0
|
||||
},
|
||||
"renameByName": {
|
||||
"Value": "TTL",
|
||||
"host": "Host"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"ingress-nginx",
|
||||
"ingress-nginx-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"query": "label_values(nginx_ingress_controller_config_hash{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Controller Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\"}, controller_namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Controller Class",
|
||||
"multi": true,
|
||||
"name": "controller_class",
|
||||
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\"}, controller_class)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Controller",
|
||||
"multi": true,
|
||||
"name": "controller",
|
||||
"query": "label_values(nginx_ingress_controller_config_hash{job=\"$job\", controller_namespace=~\"$namespace\", controller_class=~\"$controller_class\"}, controller_pod)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Ingress Namespace",
|
||||
"multi": true,
|
||||
"name": "exported_namespace",
|
||||
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\"}, exported_namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Ingress",
|
||||
"multi": true,
|
||||
"name": "ingress",
|
||||
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", namespace=~\"$namespace\", controller_class=~\"$controller_class\", controller_pod=~\"$controller\", exported_namespace=~\"$exported_namespace\"}, ingress)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"allValue": "4-5",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": [
|
||||
"All"
|
||||
],
|
||||
"value": [
|
||||
"$__all"
|
||||
]
|
||||
},
|
||||
"description": "4 represents all 4xx codes, 5 represents all 5xx codes",
|
||||
"includeAll": true,
|
||||
"label": "Error Codes",
|
||||
"multi": true,
|
||||
"name": "error_codes",
|
||||
"options": [
|
||||
{
|
||||
"selected": true,
|
||||
"text": "4",
|
||||
"value": "4"
|
||||
},
|
||||
{
|
||||
"selected": false,
|
||||
"text": "5",
|
||||
"value": "5"
|
||||
}
|
||||
],
|
||||
"query": "4 : 4,5 : 5",
|
||||
"type": "custom"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Ingress Nginx / Overview",
|
||||
"uid": "ingress-nginx-overview-12mk"
|
||||
}
|
|
@ -0,0 +1,594 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Ingress-nginx. It is created using the [Ingress-Nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"ingress-nginx",
|
||||
"ingress-nginx-mixin"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Ingress Nginx Dashboards",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Ingress Response Times",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Total Request Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.5,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".5 - {{ ingress }}/{{ exported_namespace }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.95,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".95 - {{ ingress }}/{{ exported_namespace }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n 0.99,\n sum by (le, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": ".99 - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Upstream Response Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 4,
|
||||
"title": "Ingress Paths",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum by (path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
|
||||
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Request Volume",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(\n .5,\n sum by (le, path, ingress, exported_namespace)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n )\n)\n",
|
||||
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Median upstream response time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "percentunit"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n)\n/\nsum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_requests{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
|
||||
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Response error rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum by (path, ingress, exported_namespace) (\n rate(\n nginx_ingress_controller_response_duration_seconds_sum{\n job=~\"$job\",\n exported_namespace =~ \"$exported_namespace\",\n ingress =~ \"$ingress\"\n }[$__rate_interval]\n )\n)\n",
|
||||
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Upstream time consumed",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n status=~\"[$error_codes].*\"\n }[$__rate_interval]\n )\n) by(path, ingress, exported_namespace, status)\n",
|
||||
"legendFormat": "{{ status }} {{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Response error volume",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "decbytes"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 20
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\"\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n/\nsum (\n rate(\n nginx_ingress_controller_response_size_count {\n job=~\"$job\",\n exported_namespace=~\"$exported_namespace\",\n ingress=~\"$ingress\",\n }[$__rate_interval]\n )\n) by (path, ingress, exported_namespace)\n",
|
||||
"legendFormat": "{{ path }} - {{ ingress }}/{{ exported_namespace }}"
|
||||
}
|
||||
],
|
||||
"title": "Average response size",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"ingress-nginx",
|
||||
"ingress-nginx-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"query": "label_values(nginx_ingress_controller_config_hash{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Ingress Namespace",
|
||||
"multi": true,
|
||||
"name": "exported_namespace",
|
||||
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\"}, exported_namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": false,
|
||||
"label": "Ingress",
|
||||
"multi": true,
|
||||
"name": "ingress",
|
||||
"query": "label_values(nginx_ingress_controller_requests{job=\"$job\", exported_namespace=~\"$exported_namespace\"}, ingress)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"allValue": "4-5",
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": [
|
||||
"All"
|
||||
],
|
||||
"value": [
|
||||
"$__all"
|
||||
]
|
||||
},
|
||||
"description": "4 represents all 4xx codes, 5 represents all 5xx codes",
|
||||
"includeAll": true,
|
||||
"label": "Error Codes",
|
||||
"multi": true,
|
||||
"name": "error_codes",
|
||||
"options": [
|
||||
{
|
||||
"selected": true,
|
||||
"text": "4",
|
||||
"value": "4"
|
||||
},
|
||||
{
|
||||
"selected": false,
|
||||
"text": "5",
|
||||
"value": "5"
|
||||
}
|
||||
],
|
||||
"query": "4 : 4,5 : 5",
|
||||
"type": "custom"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Ingress Nginx / Request Handling Performance",
|
||||
"uid": "ingress-nginx-request-handling-jqkw"
|
||||
}
|
1
assets/ingress-nginx-mixin/rules.yaml
Normal file
1
assets/ingress-nginx-mixin/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
groups: []
|
73
assets/kubernetes-autoscaling/alerts.yaml
Normal file
73
assets/kubernetes-autoscaling/alerts.yaml
Normal file
|
@ -0,0 +1,73 @@
|
|||
groups:
|
||||
- name: karpenter
|
||||
rules:
|
||||
- alert: KarpenterCloudProviderErrors
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance
|
||||
description: The Karpenter provider {{ $labels.provider }} with the controller
|
||||
{{ $labels.controller }} has errors with the method {{ $labels.method }}.
|
||||
summary: Karpenter has Cloud Provider Errors.
|
||||
expr: |
|
||||
sum(
|
||||
increase(
|
||||
karpenter_cloudprovider_errors_total{
|
||||
job=~"karpenter"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, provider, controller, method) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KarpenterNodepoolNearCapacity
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview
|
||||
description: The resource {{ $labels.resource_type }} in the Karpenter node
|
||||
pool {{ $labels.nodepool }} is nearing its limit. Consider scaling or adding
|
||||
resources.
|
||||
summary: Karpenter Nodepool near capacity.
|
||||
expr: |
|
||||
sum (
|
||||
karpenter_nodepools_usage{job=~"karpenter"}
|
||||
) by (namespace, job, nodepool, resource_type)
|
||||
/
|
||||
sum (
|
||||
karpenter_nodepools_limit{job=~"karpenter"}
|
||||
) by (namespace, job, nodepool, resource_type)
|
||||
* 100 > 75
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: cluster-autoscaler
|
||||
rules:
|
||||
- alert: ClusterAutoscalerNodeCountNearCapacity
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
|
||||
description: The node count for the cluster autoscaler job {{ $labels.job }}
|
||||
is reaching max limit. Consider scaling node groups.
|
||||
summary: Cluster Autoscaler Node Count near Capacity.
|
||||
expr: |
|
||||
sum (
|
||||
cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
/
|
||||
sum (
|
||||
cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
* 100 > 75
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ClusterAutoscalerUnschedulablePods
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
|
||||
description: The cluster currently has unschedulable pods, indicating resource
|
||||
shortages. Consider adding more nodes or increasing node group capacity.
|
||||
summary: Pods Pending Scheduling - Cluster Node Group Scaling Required
|
||||
expr: |
|
||||
sum (
|
||||
cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
|
@ -0,0 +1,643 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview for cluster autoscaler. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"cluster-autoscaler"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling / Cluster Autoscaler",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Total Nodes",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 3,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_max_nodes_count{\n job=~\"$job\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Max Nodes",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 6,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_node_groups_count{\n job=~\"$job\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Node Groups",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 9,
|
||||
"y": 1
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\",\n state=\"ready\"\n }\n ) /\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) * 100\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Healthy Nodes",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"text": "No"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"text": "Yes"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_cluster_safe_to_autoscale{\n job=~\"$job\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Safe To Scale",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 15,
|
||||
"y": 1
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Unscheduled Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 18,
|
||||
"y": 1
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleDown\"\n }\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Last Scale Down",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 3,
|
||||
"x": 21,
|
||||
"y": 1
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "time() - sum(\n cluster_autoscaler_last_activity{\n job=~\"$job\",\n activity=\"scaleUp\"\n }\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Last Scale Up",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_unschedulable_pods_count{\n job=~\"$job\"\n }[2m]\n )\n ) by (type)\n)\n",
|
||||
"legendFormat": "{{ type }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_evicted_pods_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
|
||||
"legendFormat": "Evicted Pods"
|
||||
}
|
||||
],
|
||||
"title": "Pod Activity",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 4
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n ) by (state)\n)\n",
|
||||
"legendFormat": "{{ state }}"
|
||||
}
|
||||
],
|
||||
"title": "Node Activity",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_nodes_count{\n job=~\"$job\"\n }\n )\n)\n",
|
||||
"legendFormat": "Total Nodes"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n cluster_autoscaler_unneeded_nodes_count{\n job=~\"$job\"\n }\n )\n)\n",
|
||||
"legendFormat": "Unneeded Nodes"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_up_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
|
||||
"legendFormat": "Scaled Up Nodes"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n cluster_autoscaler_scaled_down_nodes_total{\n job=~\"$job\"\n }[2m]\n )\n )\n)\n",
|
||||
"legendFormat": "Scaled Down Nodes"
|
||||
}
|
||||
],
|
||||
"title": "Autoscaling Activity",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"cluster-autoscaler"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(cluster_autoscaler_last_activity{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Cluster Autoscaler",
|
||||
"uid": "kubernetes-autoscaling-mixin-ca-jkwq"
|
||||
}
|
|
@ -0,0 +1,507 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview for horizontal pod autoscalers. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Desired Replicas",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Current Replicas",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Min Replicas",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 1
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Max Replicas",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Horitzontal Pod Autoscaler"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Metric Targets",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 4,
|
||||
"horizontalpodautoscaler": 1,
|
||||
"metric_name": 2,
|
||||
"metric_target_type": 3,
|
||||
"namespace": 0
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "Threshold",
|
||||
"horizontalpodautoscaler": "Horitzontal Pod Autoscaler",
|
||||
"metric_name": "Metric Name",
|
||||
"metric_target_type": "Metric Target Type",
|
||||
"namespace": "Namespace"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
|
||||
"legendFormat": "Utilization / {{ metric_name }}"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_target_metric{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\",\n metric_name=~\"$metric_name\",\n metric_target_type=\"utilization\"\n }\n ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type)\n)\n",
|
||||
"legendFormat": "Threshold / {{ metric_name }}"
|
||||
}
|
||||
],
|
||||
"title": "Utilization & Threshold",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_desired_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
|
||||
"legendFormat": "Desired Replicas"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_status_current_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
|
||||
"legendFormat": "Current Replicas"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_min_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
|
||||
"legendFormat": "Min Replicas"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_horizontalpodautoscaler_spec_max_replicas{\n job=~\"$job\",\n namespace=~\"$namespace\",\n horizontalpodautoscaler=\"$hpa\"\n }\n )\n)\n",
|
||||
"legendFormat": "Max Replicas"
|
||||
}
|
||||
],
|
||||
"title": "Replicas",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{job=~\"$job\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Horizontal Pod Autoscaler",
|
||||
"name": "hpa",
|
||||
"query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\"},horizontalpodautoscaler)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Metric Name",
|
||||
"multi": true,
|
||||
"name": "metric_name",
|
||||
"query": "label_values(kube_horizontalpodautoscaler_spec_target_metric{job=~\"$job\", namespace=\"$namespace\", horizontalpodautoscaler=\"$hpa\"}, metric_name)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Horizontal Pod Autoscaler",
|
||||
"uid": "kubernetes-autoscaling-mixin-hpa-jkwq"
|
||||
}
|
|
@ -0,0 +1,482 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"karpenter"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling / Karpenter",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Node Pool Activity",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_nodes_created_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ nodepool }}"
|
||||
}
|
||||
],
|
||||
"title": "Nodes Created by Node Pool",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 1
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_nodes_terminated_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ nodepool }}"
|
||||
}
|
||||
],
|
||||
"title": "Nodes Terminated by Node Pool",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_decisions_total{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (decision, reason, consolidation_type)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ decision }} - {{ reason }} - {{ consolidation_type }}"
|
||||
}
|
||||
],
|
||||
"title": "Node Disruption Decisions by Reason, Decision, and Consolidation Type",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 7
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_eligible_nodes{\n job=~\"$job\",\n }[$__rate_interval]\n )\n ) by (reason)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ reason }}"
|
||||
}
|
||||
],
|
||||
"title": "Nodes Eligible for Disruption by Reason",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_nodeclaims_disrupted_total{\n job=~\"$job\",\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool, capacity_type, reason)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ nodepool }} - {{ capacity_type }} - {{ reason }}"
|
||||
}
|
||||
],
|
||||
"title": "Nodes Disrupted by Node Pool",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 19
|
||||
},
|
||||
"id": 7,
|
||||
"title": "Pod Activity",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n karpenter_pods_state{\n job=~\"$job\"\n }\n ) by (phase)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ phase }}"
|
||||
}
|
||||
],
|
||||
"title": "Pods by Phase",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 20
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Pods Startup Duration",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"karpenter"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(karpenter_nodes_allocatable{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Node Pool",
|
||||
"multi": true,
|
||||
"name": "nodepool",
|
||||
"query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Karpenter / Activity",
|
||||
"uid": "kubernetes-autoscaling-mixin-kact-jkwq"
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,839 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Karpenter and focuses on Karpenter performance. It is created using the [Kubernetes Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"karpenter"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling / Karpenter",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"text": "No"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"text": "Yes"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n karpenter_cluster_state_synced{\n job=~\"$job\",\n }\n) by (job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Cluster State Synced",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n karpenter_cluster_state_node_count{\n job=~\"$job\",\n }\n) by (job)\n"
|
||||
}
|
||||
],
|
||||
"title": "Cluster State Node Count",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 18,
|
||||
"x": 6,
|
||||
"y": 1
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n increase(\n karpenter_cloudprovider_errors_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n ) by (job, provider, controller, method)\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "{{ provider }} - {{ controller }} - {{ method }}"
|
||||
}
|
||||
],
|
||||
"title": "Cloud Provider Errors",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_nodes_termination_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Node Termination Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 7
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.5\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.95\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "max(\n karpenter_pods_startup_duration_seconds{\n job=~\"$job\",\n quantile=\"0.99\"\n }\n)\n",
|
||||
"interval": "1m",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Pods Startup Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"id": 7,
|
||||
"title": "Interruption Queue",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n increase(\n karpenter_interruption_received_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, message_type)\n",
|
||||
"legendFormat": "{{ message_type }}"
|
||||
}
|
||||
],
|
||||
"title": "Interruption Received Messages",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 14
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n increase(\n karpenter_interruption_deleted_messages_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job)\n",
|
||||
"legendFormat": "Deleted Messages"
|
||||
}
|
||||
],
|
||||
"title": "Interruption Deleted Messages",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 14
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_interruption_message_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Interruption Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 11,
|
||||
"title": "Work Queue",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"id": 12,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n karpenter_workqueue_depth{\n job=~\"$job\"\n }\n) by (job)\n",
|
||||
"legendFormat": "Queue Depth"
|
||||
}
|
||||
],
|
||||
"title": "Work Queue Depth",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 21
|
||||
},
|
||||
"id": 13,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_queue_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Work Queue In Queue Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "s"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 21
|
||||
},
|
||||
"id": 14,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.50,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P50"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.95,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P95"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "histogram_quantile(0.99,\n sum(\n irate(\n karpenter_workqueue_work_duration_seconds_bucket{\n job=~\"$job\"\n }[$__rate_interval]\n ) > 0\n ) by (job, le)\n)\n",
|
||||
"legendFormat": "P99"
|
||||
}
|
||||
],
|
||||
"title": "Work Queue Work Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
},
|
||||
"id": 15,
|
||||
"title": "Controller",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"fillOpacity": 100,
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"mode": "value"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 28
|
||||
},
|
||||
"id": 16,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n rate(\n controller_runtime_reconcile_total{\n job=~\"$job\"\n }[$__rate_interval]\n )\n) by (job, controller) > 0\n",
|
||||
"legendFormat": "{{ controller }}"
|
||||
}
|
||||
],
|
||||
"title": "Controller Reconcile",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"karpenter"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(karpenter_nodes_allocatable{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Node Pool",
|
||||
"multi": true,
|
||||
"name": "nodepool",
|
||||
"query": "label_values(karpenter_nodepools_allowed_disruptions{job=~\"$job\"}, nodepool)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Karpenter / Performance",
|
||||
"uid": "kubernetes-autoscaling-mixin-kperf-jkwq"
|
||||
}
|
|
@ -0,0 +1,568 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview of pod disruption budgets. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "$namespace Namespace Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Disruptions Allowed"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"type": "color-text"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Pod Disruption Budget"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\"\n }\n ) by (job, namespace, poddisruptionbudget)\n)\n",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"title": "Summary",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 2,
|
||||
"Value #B": 3,
|
||||
"Value #C": 4,
|
||||
"Value #D": 5,
|
||||
"namespace": 0,
|
||||
"poddisruptionbudget": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "Disruptions Allowed",
|
||||
"Value #B": "Desired Healthy",
|
||||
"Value #C": "Currently Healthy",
|
||||
"Value #D": "Expected Pods",
|
||||
"namespace": "Namespace",
|
||||
"poddisruptionbudget": "Pod Disruption Budget"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"title": "$pdb Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Disruptions Allowed",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 9
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Desired Healthy",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 9
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Currently Healthy",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0.10000000000000001
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 9
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n"
|
||||
}
|
||||
],
|
||||
"title": "Expected Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Currently Healthy"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Disruptions Allowed"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Desired Healthy"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Expected Pods"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "blue",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Last *",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_pod_disruptions_allowed{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
|
||||
"legendFormat": "Disruptions Allowed"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_desired_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
|
||||
"legendFormat": "Desired Healthy"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_current_healthy{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
|
||||
"legendFormat": "Currently Healthy"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "round(\n sum(\n kube_poddisruptionbudget_status_expected_pods{\n job=~\"$job\",\n namespace=~\"$namespace\",\n poddisruptionbudget=~\"$pdb\"\n }\n )\n)\n",
|
||||
"legendFormat": "Expected Pods"
|
||||
}
|
||||
],
|
||||
"title": "Status",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(kube_horizontalpodautoscaler_metadata_generation{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Pod Disruption Budget",
|
||||
"name": "pdb",
|
||||
"query": "label_values(kube_poddisruptionbudget_status_current_healthy{job=~\"$job\", namespace=~\"$namespace\"}, poddisruptionbudget)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Pod Disruption Budget",
|
||||
"uid": "kubernetes-autoscaling-mixin-pdb-jkwq"
|
||||
}
|
|
@ -0,0 +1,895 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"description": "A dashboard that monitors Kubernetes and focuses on giving an overview of vertical pod autoscalers. It is created using the [Kubernetes / Autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).",
|
||||
"editable": true,
|
||||
"links": [
|
||||
{
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"targetBlank": true,
|
||||
"title": "Kubernetes / Autoscaling",
|
||||
"type": "dashboards"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "$namespace Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "CPU Lower Bound"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "dark-red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "CPU Target"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "CPU Upper Bound"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Vertical Pod Autoscaler"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "CPU Lower Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "CPU Target"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "CPU Upper Bound"
|
||||
}
|
||||
],
|
||||
"title": "CPU Resource Recommendations",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 4,
|
||||
"Value #B": 5,
|
||||
"Value #C": 6,
|
||||
"container": 2,
|
||||
"namespace": 0,
|
||||
"resource": 3,
|
||||
"verticalpodautoscaler": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "CPU Lower Bound",
|
||||
"Value #B": "CPU Target",
|
||||
"Value #C": "CPU Upper Bound",
|
||||
"container": "Container",
|
||||
"namespace": "Namespace",
|
||||
"resource": "Resource",
|
||||
"verticalpodautoscaler": "Vertical Pod Autoscaler"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Memory Lower Bound"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "dark-red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Memory Target"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "yellow",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Memory Upper Bound"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "custom.cellOptions",
|
||||
"value": {
|
||||
"mode": "basic",
|
||||
"type": "color-background"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"footer": {
|
||||
"enablePagination": true
|
||||
},
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Vertical Pod Autoscaler"
|
||||
}
|
||||
]
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "Memory Lower Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "Memory Target"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "Memory Upper Bound"
|
||||
}
|
||||
],
|
||||
"title": "Memory Resource Recommendations",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge"
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {
|
||||
"Value #A": 4,
|
||||
"Value #B": 5,
|
||||
"Value #C": 6,
|
||||
"container": 2,
|
||||
"namespace": 0,
|
||||
"resource": 3,
|
||||
"verticalpodautoscaler": 1
|
||||
},
|
||||
"renameByName": {
|
||||
"Value #A": "Memory Lower Bound",
|
||||
"Value #B": "Memory Target",
|
||||
"Value #C": "Memory Upper Bound",
|
||||
"container": "Container",
|
||||
"namespace": "Namespace",
|
||||
"resource": "Resource",
|
||||
"verticalpodautoscaler": "Vertical Pod Autoscaler"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 17
|
||||
},
|
||||
"id": 4,
|
||||
"repeat": "container",
|
||||
"title": "$vpa / $container Summary",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "CPU Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "CPU Limits"
|
||||
}
|
||||
],
|
||||
"title": "CPU Guaranteed QoS",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "CPU Requests"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "CPU Limits"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 18
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "CPU Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "CPU Limits"
|
||||
}
|
||||
],
|
||||
"title": "CPU Burstable QoS",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 0
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "Memory Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "Memory Limits"
|
||||
}
|
||||
],
|
||||
"title": "Memory Guaranteed QoS",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Memory Requests"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "red",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Memory Limits"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 18
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "Memory Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "Memory Limits"
|
||||
}
|
||||
],
|
||||
"title": "Memory Burstable QoS",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 26
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Lower Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Target"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"cpu\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Upper Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Usage"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"cpu\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Limits"
|
||||
}
|
||||
],
|
||||
"title": "VPA CPU Recommendations Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "-- Mixed --"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"spanNulls": false
|
||||
},
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 26
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [
|
||||
"lastNotNull",
|
||||
"mean",
|
||||
"max"
|
||||
],
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"showLegend": true,
|
||||
"sortBy": "Mean",
|
||||
"sortDesc": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "v11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Lower Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Target"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound{\n job=~\"$job\",\n namespace=~\"$namespace\",\n resource=\"memory\",\n verticalpodautoscaler=\"$vpa\",\n container=\"$container\"\n }\n) by (job, namespace, verticalpodautoscaler, container, resource)\n",
|
||||
"legendFormat": "{{ container }} - Upper Bound"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n container_memory_working_set_bytes{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Usage"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_pod_container_resource_requests{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Requests"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "$datasource"
|
||||
},
|
||||
"expr": "sum(\n kube_pod_container_resource_limits{\n job=~\"$job\",\n namespace=\"$namespace\",\n pod=~\"$vpa-(.+)\",\n resource=\"memory\",\n container=\"$container\"\n }\n) by (container)\n",
|
||||
"legendFormat": "{{ container }} - Limits"
|
||||
}
|
||||
],
|
||||
"title": "VPA Memory Recommendations Over Time",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"kubernetes",
|
||||
"autoscaling",
|
||||
"kubernetes-autoscaling-mixin",
|
||||
"kubernetes-core"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"label": "Data source",
|
||||
"name": "datasource",
|
||||
"query": "prometheus",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Job",
|
||||
"name": "job",
|
||||
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"label": "VPA Pod Autoscaler",
|
||||
"name": "vpa",
|
||||
"query": "label_values(kube_customresource_verticalpodautoscaler_labels{job=~\"$job\", namespace=~\"$namespace\"}, verticalpodautoscaler)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"includeAll": true,
|
||||
"label": "Container",
|
||||
"multi": true,
|
||||
"name": "container",
|
||||
"query": "label_values(kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{job=~\"$job\", namespace=~\"$namespace\", verticalpodautoscaler=~\"$vpa\"}, container)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Kubernetes / Autoscaling / Vertical Pod Autoscaler",
|
||||
"uid": "kubernetes-autoscaling-mixin-vpa-jkwq"
|
||||
}
|
1
assets/kubernetes-autoscaling/rules.yaml
Normal file
1
assets/kubernetes-autoscaling/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
25
mixins.json
25
mixins.json
|
@ -516,6 +516,31 @@
|
|||
"name": "gitea",
|
||||
"source": "https://github.com/go-gitea/gitea",
|
||||
"subdir": "contrib/gitea-monitoring-mixin"
|
||||
},
|
||||
{
|
||||
"name": "django",
|
||||
"source": "https://github.com/adinhodovic/django-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "celery",
|
||||
"source": "https://github.com/danihodovic/celery-exporter",
|
||||
"subdir": "celery-mixin"
|
||||
},
|
||||
{
|
||||
"name": "argo-cd-2",
|
||||
"source": "https://github.com/adinhodovic/argo-cd-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "ingress-nginx-mixin",
|
||||
"source": "https://github.com/adinhodovic/ingress-nginx-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "kubernetes-autoscaling",
|
||||
"source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin",
|
||||
"subdir": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
156
site/content/argo-cd-2/_index.md
Normal file
156
site/content/argo-cd-2/_index.md
Normal file
|
@ -0,0 +1,156 @@
|
|||
---
|
||||
title: argo-cd-2
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/adinhodovic/argo-cd-mixin](https://github.com/adinhodovic/argo-cd-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### argo-cd
|
||||
|
||||
##### ArgoCdAppOutOfSync
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ArgoCdAppOutOfSync
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
|
||||
}}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} is out of sync with the sync status {{ $labels.sync_status }}
|
||||
for the past 15m.
|
||||
summary: An ArgoCD Application is Out Of Sync.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
sync_status!="Synced"
|
||||
}
|
||||
) by (job, dest_server, project, name, sync_status)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ArgoCdAppUnhealthy
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ArgoCdAppUnhealthy
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
|
||||
}}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} is unhealthy with the health status {{ $labels.health_status }}
|
||||
for the past 15m.
|
||||
summary: An ArgoCD Application is Unhealthy.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
health_status!~"Healthy|Progressing"
|
||||
}
|
||||
) by (job, dest_server, project, name, health_status)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ArgoCdAppAutoSyncDisabled
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ArgoCdAppAutoSyncDisabled
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
|
||||
}}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} has autosync disabled for the past 2h.
|
||||
summary: An ArgoCD Application has AutoSync Disabled.
|
||||
expr: |
|
||||
sum(
|
||||
argocd_app_info{
|
||||
job=~".*",
|
||||
autosync_enabled!="true",
|
||||
name!~""
|
||||
}
|
||||
) by (job, dest_server, project, name, autosync_enabled)
|
||||
> 0
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ArgoCdAppSyncFailed
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ArgoCdAppSyncFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-application-overview-kask/argocd-application-overview?var-dest_server={{
|
||||
$labels.dest_server }}&var-project={{ $labels.project }}&var-application={{ $labels.name
|
||||
}}
|
||||
description: The application {{ $labels.dest_server }}/{{ $labels.project }}/{{
|
||||
$labels.name }} has failed to sync with the status {{ $labels.phase }} the past
|
||||
10m.
|
||||
summary: An ArgoCD Application has Failed to Sync.
|
||||
expr: |
|
||||
sum(
|
||||
round(
|
||||
increase(
|
||||
argocd_app_sync_total{
|
||||
job=~".*",
|
||||
phase!="Succeeded"
|
||||
}[10m]
|
||||
)
|
||||
)
|
||||
) by (job, dest_server, project, name, phase) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ArgoCdNotificationDeliveryFailed
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ArgoCdNotificationDeliveryFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/argo-cd-notifications-overview-kask/argocd-notifications-overview?var-job={{
|
||||
$labels.job }}&var-exported_service={{ $labels.exported_service }}
|
||||
description: The notification job {{ $labels.job }} has failed to deliver to {{
|
||||
$labels.exported_service }} for the past 10m.
|
||||
summary: ArgoCD Notification Delivery Failed.
|
||||
expr: |
|
||||
sum(
|
||||
round(
|
||||
increase(
|
||||
argocd_notifications_deliveries_total{
|
||||
job=~".*",
|
||||
succeeded!="true"
|
||||
}[10m]
|
||||
)
|
||||
)
|
||||
) by (job, exported_service, succeeded) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [argo-cd-application-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-application-overview.json)
|
||||
- [argo-cd-notifications-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-notifications-overview.json)
|
||||
- [argo-cd-operational-overview](https://github.com/monitoring-mixins/website/blob/master/assets/argo-cd-2/dashboards/argo-cd-operational-overview.json)
|
114
site/content/celery/_index.md
Normal file
114
site/content/celery/_index.md
Normal file
|
@ -0,0 +1,114 @@
|
|||
---
|
||||
title: celery
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/danihodovic/celery-exporter](https://github.com/danihodovic/celery-exporter/tree/master/celery-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/celery/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### celery
|
||||
|
||||
##### CeleryTaskHighFailRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CeleryTaskHighFailRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{
|
||||
$labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name
|
||||
}}
|
||||
description: More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name
|
||||
}}/{{ $labels.name }} the past 10m.
|
||||
summary: Celery high task fail rate.
|
||||
expr: |
|
||||
sum(
|
||||
increase(
|
||||
celery_task_failed_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
/
|
||||
(
|
||||
sum(
|
||||
increase(
|
||||
celery_task_failed_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
+
|
||||
sum(
|
||||
increase(
|
||||
celery_task_succeeded_total{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None",
|
||||
name!~"None"
|
||||
}[10m]
|
||||
)
|
||||
) by (job, namespace, queue_name, name)
|
||||
)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CeleryHighQueueLength
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CeleryHighQueueLength
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
|
||||
$labels.job }}&var-queue_name={{ $labels.queue_name }}
|
||||
description: More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name
|
||||
}} the past 20m.
|
||||
summary: Celery high queue length.
|
||||
expr: |
|
||||
sum(
|
||||
celery_queue_length{
|
||||
job=~".*celery.*",
|
||||
queue_name!~"None"
|
||||
}
|
||||
) by (job, namespace, queue_name)
|
||||
> 100
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CeleryWorkerDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CeleryWorkerDown
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{
|
||||
$labels.job }}
|
||||
description: The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.
|
||||
summary: A Celery worker is offline.
|
||||
expr: |
|
||||
celery_worker_up{job=~".*celery.*"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [celery-tasks-by-task](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-by-task.json)
|
||||
- [celery-tasks-overview](https://github.com/monitoring-mixins/website/blob/master/assets/celery/dashboards/celery-tasks-overview.json)
|
|
@ -68,10 +68,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni
|
|||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json)
|
||||
- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json)
|
||||
- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json)
|
||||
- [cilium-agent](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent.json)
|
||||
- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json)
|
||||
- [cilium-api](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-api.json)
|
||||
- [cilium-bpf](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-bpf.json)
|
||||
- [cilium-conntrack](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-conntrack.json)
|
||||
|
@ -80,6 +78,8 @@ Following dashboards are generated from mixins and hosted on github:
|
|||
- [cilium-fqdn-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-fqdn-proxy.json)
|
||||
- [cilium-identities](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-identities.json)
|
||||
- [cilium-kubernetes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-kubernetes.json)
|
||||
- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json)
|
||||
- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json)
|
||||
- [cilium-network](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-network.json)
|
||||
- [cilium-nodes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-nodes.json)
|
||||
- [cilium-operator](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-operator.json)
|
||||
|
|
|
@ -2491,8 +2491,8 @@ Following dashboards are generated from mixins and hosted on github:
|
|||
|
||||
|
||||
- [alertmanager](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/alertmanager.json)
|
||||
- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json)
|
||||
- [cortex-compactor](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor.json)
|
||||
- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json)
|
||||
- [cortex-config](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-config.json)
|
||||
- [cortex-object-store](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-object-store.json)
|
||||
- [cortex-queries](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-queries.json)
|
||||
|
|
141
site/content/django/_index.md
Normal file
141
site/content/django/_index.md
Normal file
|
@ -0,0 +1,141 @@
|
|||
---
|
||||
title: django
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/adinhodovic/django-mixin](https://github.com/adinhodovic/django-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/django/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### django
|
||||
|
||||
##### DjangoMigrationsUnapplied
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: DjangoMigrationsUnapplied
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}
|
||||
description: The job {{ $labels.job }} has unapplied migrations.
|
||||
summary: Django has unapplied migrations.
|
||||
expr: |
|
||||
sum(
|
||||
django_migrations_unapplied_total{
|
||||
job=~"django"
|
||||
}
|
||||
) by (namespace, job)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### DjangoDatabaseException
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: DjangoDatabaseException
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-overview-jkwq/django-overview?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}
|
||||
description: The job {{ $labels.job }} has hit the database exception {{ $labels.type
|
||||
}}.
|
||||
summary: Django database exception.
|
||||
expr: |
|
||||
sum (
|
||||
increase(
|
||||
django_db_errors_total{
|
||||
job=~"django"
|
||||
}[10m]
|
||||
)
|
||||
) by (type, namespace, job)
|
||||
> 0
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### DjangoHighHttp4xxErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: DjangoHighHttp4xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
|
||||
description: More than 5% HTTP requests with status 4xx for {{ $labels.job }}/{{
|
||||
$labels.view }} the past 5m.
|
||||
summary: Django high HTTP 4xx error rate.
|
||||
expr: |
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
status=~"^4.*",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### DjangoHighHttp5xxErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: DjangoHighHttp5xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/django-requests-by-view-jkwq/django-requests-by-view?var-namespace={{
|
||||
$labels.namespace }}&var-job={{ $labels.job }}&var-view={{ $labels.view }}
|
||||
description: More than 5% HTTP requests with status 5xx for {{ $labels.job }}/{{
|
||||
$labels.view }} the past 5m.
|
||||
summary: Django high HTTP 5xx error rate.
|
||||
expr: |
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
status=~"^5.*",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
/
|
||||
sum(
|
||||
rate(
|
||||
django_http_responses_total_by_status_view_method_total{
|
||||
job=~"django",
|
||||
view!~"<unnamed view>|health_check:health_check_home|prometheus-django-metrics"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, view)
|
||||
* 100 > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [django-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-overview.json)
|
||||
- [django-requests-by-view](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-by-view.json)
|
||||
- [django-requests-overview](https://github.com/monitoring-mixins/website/blob/master/assets/django/dashboards/django-requests-overview.json)
|
80
site/content/ingress-nginx-mixin/_index.md
Normal file
80
site/content/ingress-nginx-mixin/_index.md
Normal file
|
@ -0,0 +1,80 @@
|
|||
---
|
||||
title: ingress-nginx-mixin
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/adinhodovic/ingress-nginx-mixin](https://github.com/adinhodovic/ingress-nginx-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### nginx.rules
|
||||
|
||||
##### NginxConfigReloadFailed
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NginxConfigReloadFailed
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-job={{
|
||||
$labels.job }}&var-controller_class={{ $labels.controller_class }}
|
||||
description: Nginx config reload failed for the controller with the class {{ $labels.controller_class
|
||||
}}.
|
||||
summary: Nginx config reload failed.
|
||||
expr: |
|
||||
sum(
|
||||
nginx_ingress_controller_config_last_reload_successful{job=~"ingress-nginx-controller-metrics"}
|
||||
) by (job, controller_class)
|
||||
== 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NginxHighHttp4xxErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NginxHighHttp4xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
|
||||
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
|
||||
description: More than 5% HTTP requests with status 4xx for {{ $labels.exported_namespace
|
||||
}}/{{ $labels.ingress }} the past 5m.
|
||||
summary: Nginx high HTTP 4xx error rate.
|
||||
expr: |
|
||||
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^4.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### NginxHighHttp5xxErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NginxHighHttp5xxErrorRate
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/ingress-nginx-overview-12mk/ingress-nginx-overview?var-exported_namespace={{
|
||||
$labels.exported_namespace }}&var-ingress={{ $labels.ingress }}
|
||||
description: More than 5% HTTP requests with status 5xx for {{ $labels.exported_namespace
|
||||
}}/{{ $labels.ingress }} the past 5m.
|
||||
summary: Nginx high HTTP 5xx error rate.
|
||||
expr: |
|
||||
(sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", status=~"^5.*", ingress!~""}[5m])) by (exported_namespace, ingress) / sum(rate(nginx_ingress_controller_requests{job=~"ingress-nginx-controller-metrics", ingress!~""}[5m])) by (exported_namespace, ingress) * 100) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [ingress-nginx-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-overview.json)
|
||||
- [ingress-nginx-request-handling-performance](https://github.com/monitoring-mixins/website/blob/master/assets/ingress-nginx-mixin/dashboards/ingress-nginx-request-handling-performance.json)
|
120
site/content/kubernetes-autoscaling/_index.md
Normal file
120
site/content/kubernetes-autoscaling/_index.md
Normal file
|
@ -0,0 +1,120 @@
|
|||
---
|
||||
title: kubernetes-autoscaling
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/adinhodovic/kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### karpenter
|
||||
|
||||
##### KarpenterCloudProviderErrors
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: KarpenterCloudProviderErrors
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance
|
||||
description: The Karpenter provider {{ $labels.provider }} with the controller {{
|
||||
$labels.controller }} has errors with the method {{ $labels.method }}.
|
||||
summary: Karpenter has Cloud Provider Errors.
|
||||
expr: |
|
||||
sum(
|
||||
increase(
|
||||
karpenter_cloudprovider_errors_total{
|
||||
job=~"karpenter"
|
||||
}[5m]
|
||||
)
|
||||
) by (namespace, job, provider, controller, method) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### KarpenterNodepoolNearCapacity
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: KarpenterNodepoolNearCapacity
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview
|
||||
description: The resource {{ $labels.resource_type }} in the Karpenter node pool
|
||||
{{ $labels.nodepool }} is nearing its limit. Consider scaling or adding resources.
|
||||
summary: Karpenter Nodepool near capacity.
|
||||
expr: |
|
||||
sum (
|
||||
karpenter_nodepools_usage{job=~"karpenter"}
|
||||
) by (namespace, job, nodepool, resource_type)
|
||||
/
|
||||
sum (
|
||||
karpenter_nodepools_limit{job=~"karpenter"}
|
||||
) by (namespace, job, nodepool, resource_type)
|
||||
* 100 > 75
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### cluster-autoscaler
|
||||
|
||||
##### ClusterAutoscalerNodeCountNearCapacity
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ClusterAutoscalerNodeCountNearCapacity
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
|
||||
description: The node count for the cluster autoscaler job {{ $labels.job }} is
|
||||
reaching max limit. Consider scaling node groups.
|
||||
summary: Cluster Autoscaler Node Count near Capacity.
|
||||
expr: |
|
||||
sum (
|
||||
cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
/
|
||||
sum (
|
||||
cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
* 100 > 75
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ClusterAutoscalerUnschedulablePods
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ClusterAutoscalerUnschedulablePods
|
||||
annotations:
|
||||
dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler
|
||||
description: The cluster currently has unschedulable pods, indicating resource shortages.
|
||||
Consider adding more nodes or increasing node group capacity.
|
||||
summary: Pods Pending Scheduling - Cluster Node Group Scaling Required
|
||||
expr: |
|
||||
sum (
|
||||
cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"}
|
||||
) by (namespace, job)
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
The following dashboards are generated from mixins and hosted on GitHub:
|
||||
|
||||
|
||||
- [kubernetes-autoscaling-mixin-ca](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-ca.json)
|
||||
- [kubernetes-autoscaling-mixin-hpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-hpa.json)
|
||||
- [kubernetes-autoscaling-mixin-karpenter-act](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-act.json)
|
||||
- [kubernetes-autoscaling-mixin-karpenter-over](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-over.json)
|
||||
- [kubernetes-autoscaling-mixin-karpenter-perf](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-karpenter-perf.json)
|
||||
- [kubernetes-autoscaling-mixin-pdb](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-pdb.json)
|
||||
- [kubernetes-autoscaling-mixin-vpa](https://github.com/monitoring-mixins/website/blob/master/assets/kubernetes-autoscaling/dashboards/kubernetes-autoscaling-mixin-vpa.json)
|
|
@ -281,8 +281,8 @@ Following dashboards are generated from mixins and hosted on github:
|
|||
- [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json)
|
||||
- [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json)
|
||||
- [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json)
|
||||
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
|
||||
- [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json)
|
||||
- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json)
|
||||
- [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json)
|
||||
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)
|
||||
- [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json)
|
||||
- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json)
|
||||
|
|
|
@ -433,5 +433,5 @@ labels:
|
|||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
|
||||
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
|
||||
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
|
||||
|
|
|
@ -516,6 +516,31 @@
|
|||
"name": "gitea",
|
||||
"source": "https://github.com/go-gitea/gitea",
|
||||
"subdir": "contrib/gitea-monitoring-mixin"
|
||||
},
|
||||
{
|
||||
"name": "django",
|
||||
"source": "https://github.com/adinhodovic/django-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "celery",
|
||||
"source": "https://github.com/danihodovic/celery-exporter",
|
||||
"subdir": "celery-mixin"
|
||||
},
|
||||
{
|
||||
"name": "argo-cd-2",
|
||||
"source": "https://github.com/adinhodovic/argo-cd-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "ingress-nginx-mixin",
|
||||
"source": "https://github.com/adinhodovic/ingress-nginx-mixin",
|
||||
"subdir": ""
|
||||
},
|
||||
{
|
||||
"name": "kubernetes-autoscaling",
|
||||
"source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin",
|
||||
"subdir": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue