diff --git a/assets/kubernetes/alerts.yaml b/assets/kubernetes/alerts.yaml index 9ed4d83..9b728f8 100644 --- a/assets/kubernetes/alerts.yaml +++ b/assets/kubernetes/alerts.yaml @@ -266,7 +266,7 @@ groups: - alert: KubeMemoryOvercommit annotations: description: Cluster has overcommitted memory resource requests for Pods by - {{ $value }} bytes and cannot tolerate node failure. + {{ $value | humanize }} bytes and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit summary: Cluster has overcommitted memory resource requests. expr: | diff --git a/assets/kubernetes/dashboards/k8s-resources-node.json b/assets/kubernetes/dashboards/k8s-resources-node.json index 0429589..32bfaf8 100644 --- a/assets/kubernetes/dashboards/k8s-resources-node.json +++ b/assets/kubernetes/dashboards/k8s-resources-node.json @@ -41,12 +41,32 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "max capacity", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hiddenSeries": true, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + } + ], "spaceLength": 10, "span": 12, "stack": true, "steppedLine": false, "targets": [ + { + "expr": "sum(kube_node_status_capacity{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max capacity", + "legendLink": null, + "step": 10 + }, { "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", @@ -370,12 +390,32 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ ], + "seriesOverrides": [ + { + "alias": "max capacity", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hiddenSeries": true, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + 
"stack": false + } + ], "spaceLength": 10, "span": 12, "stack": true, "steppedLine": false, "targets": [ + { + "expr": "sum(kube_node_status_capacity{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max capacity", + "legendLink": null, + "step": 10 + }, { "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)", "format": "time_series", diff --git a/assets/loki/dashboards/loki-chunks.json b/assets/loki/dashboards/loki-chunks.json index b8423e4..2258c24 100644 --- a/assets/loki/dashboards/loki-chunks.json +++ b/assets/loki/dashboards/loki-chunks.json @@ -939,6 +939,6 @@ }, "timezone": "utc", "title": "Loki / Chunks", - "uid": "", + "uid": "chunks", "version": 0 } diff --git a/assets/loki/dashboards/loki-logs.json b/assets/loki/dashboards/loki-logs.json index becd583..312ce28 100644 --- a/assets/loki/dashboards/loki-logs.json +++ b/assets/loki/dashboards/loki-logs.json @@ -1068,6 +1068,6 @@ }, "timezone": "utc", "title": "Loki / Logs", - "uid": "", + "uid": "logs", "version": 0 } diff --git a/assets/loki/dashboards/loki-operational.json b/assets/loki/dashboards/loki-operational.json index 9f3f431..6cd5012 100644 --- a/assets/loki/dashboards/loki-operational.json +++ b/assets/loki/dashboards/loki-operational.json @@ -6660,6 +6660,6 @@ }, "timezone": "utc", "title": "Loki / Operational", - "uid": "", + "uid": "operational", "version": 0 } diff --git a/assets/loki/dashboards/loki-reads-resources.json b/assets/loki/dashboards/loki-reads-resources.json index aba3ec4..cc87fda 100644 --- a/assets/loki/dashboards/loki-reads-resources.json +++ b/assets/loki/dashboards/loki-reads-resources.json @@ -2263,6 +2263,6 @@ }, "timezone": "utc", "title": "Loki / Reads Resources", - "uid": "", + "uid": "reads-resources", "version": 0 } diff --git a/assets/loki/dashboards/loki-reads.json 
b/assets/loki/dashboards/loki-reads.json index bac9e6b..f7fddf8 100644 --- a/assets/loki/dashboards/loki-reads.json +++ b/assets/loki/dashboards/loki-reads.json @@ -1251,6 +1251,6 @@ }, "timezone": "utc", "title": "Loki / Reads", - "uid": "", + "uid": "reads", "version": 0 } diff --git a/assets/loki/dashboards/loki-writes-resources.json b/assets/loki/dashboards/loki-writes-resources.json index aa543f6..765ff4e 100644 --- a/assets/loki/dashboards/loki-writes-resources.json +++ b/assets/loki/dashboards/loki-writes-resources.json @@ -1235,6 +1235,6 @@ }, "timezone": "utc", "title": "Loki / Writes Resources", - "uid": "", + "uid": "writes-resources", "version": 0 } diff --git a/assets/loki/dashboards/loki-writes.json b/assets/loki/dashboards/loki-writes.json index 26fd55e..db063e5 100644 --- a/assets/loki/dashboards/loki-writes.json +++ b/assets/loki/dashboards/loki-writes.json @@ -1063,6 +1063,6 @@ }, "timezone": "utc", "title": "Loki / Writes", - "uid": "", + "uid": "writes", "version": 0 } diff --git a/assets/promtail/dashboards/promtail.json b/assets/promtail/dashboards/promtail.json new file mode 100644 index 0000000..b43c4c0 --- /dev/null +++ b/assets/promtail/dashboards/promtail.json @@ -0,0 +1,644 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "loki" + ], + "targetBlank": false, + "title": "Loki Dashboards", + "type": "dashboards" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + 
"nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(promtail_targets_active_total{cluster=~\"$cluster\",job=\"$namespace/$name\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Active Targets", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Active Targets", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(promtail_files_active_total{cluster=~\"$cluster\",job=\"$namespace/$name\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Active Targets", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Active Files", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Targets & Files", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(promtail_read_bytes_total{cluster=~\"$cluster\",job=\"$namespace/$name\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "logs read", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bps", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + 
"current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(promtail_read_lines_total{cluster=~\"$cluster\",job=\"$namespace/$name\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "lines read", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Lines", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + 
"stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(promtail_request_duration_seconds_count{cluster=~\"$cluster\",job=\"$namespace/$name\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{status}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "QPS", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (job:promtail_request_duration_seconds_bucket:sum_rate{job=\"$namespace/$name\", cluster=~\"$cluster\"})) * 1e3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "99th Percentile", + "refId": "A", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum by (le) (job:promtail_request_duration_seconds_bucket:sum_rate{job=\"$namespace/$name\", cluster=~\"$cluster\"})) * 
1e3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "50th Percentile", + "refId": "B", + "step": 10 + }, + { + "expr": "1e3 * sum(job:promtail_request_duration_seconds_sum:sum_rate{job=\"$namespace/$name\", cluster=~\"$cluster\"}) / sum(job:promtail_request_duration_seconds_count:sum_rate{job=\"$namespace/$name\", cluster=~\"$cluster\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Average", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Latency", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Requests", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "loki" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(loki_build_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": 
"$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Loki / Promtail", + "uid": "promtail", + "version": 0 +} diff --git a/site/content/kubernetes/_index.md b/site/content/kubernetes/_index.md index f5971c7..3c27529 100644 --- a/site/content/kubernetes/_index.md +++ b/site/content/kubernetes/_index.md @@ -384,7 +384,7 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md alert: KubeMemoryOvercommit annotations: description: Cluster has overcommitted memory resource requests for Pods by {{ $value - }} bytes and cannot tolerate node failure. + | humanize }} bytes and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit summary: Cluster has overcommitted memory resource requests. 
expr: | diff --git a/site/content/promtail/_index.md b/site/content/promtail/_index.md index ebaf2a5..9fdae0b 100644 --- a/site/content/promtail/_index.md +++ b/site/content/promtail/_index.md @@ -222,3 +222,8 @@ expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_cod record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate {{< /code >}} +## Dashboards +The following dashboards are generated from mixins and hosted on GitHub: + + +- [promtail](https://github.com/monitoring-mixins/website/blob/master/assets/promtail/dashboards/promtail.json)