1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00

assets,site/content: daily assets regeneration

This commit is contained in:
github-actions[bot] 2024-12-12 03:42:29 +00:00
parent 82075a7fb7
commit 2355980377
19 changed files with 262 additions and 262 deletions

View file

@ -6,8 +6,8 @@ groups:
description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
(id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without
(id) (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
@ -18,7 +18,7 @@ groups:
are in a cyclic dependency with each other. The restart is required to resolve
the deadlock.'
summary: JVM deadlock detected.
expr: (jvm_threads_deadlocked{job!=""}) > 0
expr: (jvm_threads_deadlocked{}) > 0
for: 2m
keep_firing_for: 5m
labels:

View file

@ -51,7 +51,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory used(heap)",
@ -100,7 +100,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory used(nonheap)",
@ -151,7 +151,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_threads_current{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_threads_current{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Threads",
@ -202,7 +202,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_classes_loaded{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_classes_loaded{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Classes loaded",
@ -284,7 +284,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "time()-process_start_time_seconds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "time()-process_start_time_seconds{job=~\"$job\",instance=~\"$instance\"}",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Uptime",
@ -344,7 +344,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,instance) (\n process_start_time_seconds{job!=\"\",job=~\"$job\",instance=~\"$instance\"} * 1000\n)",
"expr": "avg by (job,instance) (\n process_start_time_seconds{job=~\"$job\",instance=~\"$instance\"} * 1000\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Process start time",
@ -487,7 +487,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,instance) (\n rate(process_cpu_seconds_total{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 100\n)",
"expr": "avg by (job,instance) (\n rate(process_cpu_seconds_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 100\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: CPU usage (process)",
@ -582,7 +582,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,instance) (\n process_resident_memory_bytes{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job,instance) (\n process_resident_memory_bytes{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Process memory used (rss)",
@ -658,7 +658,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,instance) (\n process_open_fds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job,instance) (\n process_open_fds{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Process files open",
@ -669,7 +669,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,instance) (\n process_max_fds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job,instance) (\n process_max_fds{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Process files max",
@ -801,7 +801,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory used(heap)",
@ -812,7 +812,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory max(heap)",
@ -823,7 +823,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory committed(heap)",
@ -944,7 +944,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory used(nonheap)",
@ -955,7 +955,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory max(nonheap)",
@ -966,7 +966,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: JVM memory committed(nonheap)",
@ -1098,7 +1098,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Eden Space (used)",
@ -1109,7 +1109,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Eden Space (max)",
@ -1120,7 +1120,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Eden Space (committed)",
@ -1239,7 +1239,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Survival space (used)",
@ -1250,7 +1250,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1\n)",
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"} != -1\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Survival space (max)",
@ -1261,7 +1261,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Survival space (committed)",
@ -1380,7 +1380,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Tenured space (used)",
@ -1391,7 +1391,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "\n avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)\n != -1",
"expr": "\n avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)\n != -1",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Tenured space (max)",
@ -1402,7 +1402,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Tenured space (committed)",
@ -1466,7 +1466,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,gc) (\n rate(jvm_gc_collection_seconds_count{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job,gc) (\n rate(jvm_gc_collection_seconds_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{gc}}",
@ -1530,7 +1530,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job, gc) (\n rate(jvm_gc_collection_seconds_sum{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n /\n rate(jvm_gc_collection_seconds_count{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)\n",
"expr": "avg by (job, gc) (\n rate(jvm_gc_collection_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n /\n rate(jvm_gc_collection_seconds_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)\n",
"format": "time_series",
"instant": false,
"legendFormat": "{{ job }}: {{ gc }} (avg)",
@ -1625,7 +1625,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n rate(jvm_memory_pool_allocated_bytes_total{pool=~\"(G1 |PS )?Eden Space\",job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job) (\n rate(jvm_memory_pool_allocated_bytes_total{pool=~\"(G1 |PS )?Eden Space\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Allocated (bytes)",
@ -1766,7 +1766,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_threads_current{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_threads_current{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Threads",
@ -1777,7 +1777,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_threads_daemon{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_threads_daemon{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Threads (daemon)",
@ -1788,7 +1788,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_threads_peak{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_threads_peak{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Threads (peak)",
@ -1799,7 +1799,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_threads_deadlocked{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_threads_deadlocked{job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Threads (deadlocked)",
@ -1863,7 +1863,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (state, job) (jvm_threads_state{job!=\"\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "sum by (state, job) (jvm_threads_state{job=~\"$job\",instance=~\"$instance\"})",
"format": "time_series",
"instant": false,
"legendFormat": "{{ state }}",
@ -1982,7 +1982,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"direct\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"direct\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Direct buffer used bytes",
@ -1993,7 +1993,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"direct\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"direct\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Direct buffer capacity",
@ -2100,7 +2100,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"mapped\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"mapped\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Mapped buffer used",
@ -2111,7 +2111,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"mapped\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"mapped\", job=~\"$job\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{job}}: Mapped buffer capacity",
@ -2147,7 +2147,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(jvm_memory_used_bytes{job!=\"\"}, job)",
"query": "label_values(jvm_memory_used_bytes{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -2162,7 +2162,7 @@
"label": "Instance",
"multi": true,
"name": "instance",
"query": "label_values(jvm_memory_used_bytes{job!=\"\",job=~\"$job\"}, instance)",
"query": "label_values(jvm_memory_used_bytes{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -6,7 +6,7 @@ groups:
description: 'Kafka lag keeps increasing over the last 15 minutes for consumer
group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
summary: Kafka lag keeps increasing.
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"}[5m]))
> 0
for: 15m
keep_firing_for: 10m
@ -17,7 +17,7 @@ groups:
description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
$value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
summary: Kafka lag is too high.
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"})
> 100
for: 15m
keep_firing_for: 5m
@ -118,8 +118,8 @@ groups:
description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
}} has disconected from Zookeeper.
summary: Kafka Zookeeper sync disconected.
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
quantile="0.95"}[5m])) < 0
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile="0.95",job="integrations/kafka"}[5m]))
< 0
for: 5m
labels:
severity: critical

View file

@ -1495,7 +1495,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "request queue time",
@ -1506,7 +1506,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "local time",
@ -1517,7 +1517,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "remote time",
@ -1528,7 +1528,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response queue time",
@ -1539,7 +1539,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response time",
@ -1656,7 +1656,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "request queue time",
@ -1667,7 +1667,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "local time",
@ -1678,7 +1678,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "remote time",
@ -1689,7 +1689,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response queue time",
@ -1700,7 +1700,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response time",
@ -1817,7 +1817,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "request queue time",
@ -1828,7 +1828,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "local time",
@ -1839,7 +1839,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "remote time",
@ -1850,7 +1850,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response queue time",
@ -1861,7 +1861,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "response time",
@ -2079,7 +2079,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Zookeeper request latency",
@ -2179,7 +2179,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Zookeeper connections",
@ -2190,7 +2190,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperexpirespersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperexpirespersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Zookeeper expired connections",
@ -2201,7 +2201,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperdisconnectspersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperdisconnectspersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Zookeeper disconnects",
@ -2212,7 +2212,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperauthfailurespersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperauthfailurespersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{instance}}: Zookeeper auth failures",

View file

@ -95,7 +95,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -106,7 +106,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logendoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logendoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -117,7 +117,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,topic,partition) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"expr": "sum by (job,kafka_cluster,topic,partition) (\n rate(kafka_topic_partition_current_offset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -128,7 +128,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_size{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_size{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -223,7 +223,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_topic_partition_current_offset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -289,7 +289,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesinpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesinpersec{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -355,7 +355,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesoutpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesoutpersec{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ topic }}",
@ -435,7 +435,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -446,7 +446,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -457,7 +457,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -552,7 +552,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -616,7 +616,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -680,7 +680,7 @@
"type": "prometheus",
"uid": "${datasource}"
},
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
"format": "time_series",
"instant": false,
"legendFormat": "{{ consumergroup }} ({{ topic }})",
@ -714,7 +714,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\"}, job)",
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\"}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -729,7 +729,7 @@
"label": "Kafka_cluster",
"multi": true,
"name": "kafka_cluster",
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\"}, kafka_cluster)",
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\"}, kafka_cluster)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -744,7 +744,7 @@
"label": "Topic",
"multi": true,
"name": "topic",
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\"}, topic)",
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\"}, topic)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -759,7 +759,7 @@
"label": "Consumergroup",
"multi": true,
"name": "consumergroup",
"query": "label_values(kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}, consumergroup)",
"query": "label_values(kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}, consumergroup)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -573,8 +573,8 @@
},
"fieldConfig": {
"defaults": {
"max": "150",
"min": "0",
"max": 150,
"min": 0,
"thresholds": {
"steps": [
{
@ -617,8 +617,8 @@
},
"fieldConfig": {
"defaults": {
"max": "150",
"min": "0",
"max": 150,
"min": 0,
"thresholds": {
"steps": [
{

View file

@ -7,7 +7,7 @@ groups:
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High CPU usage on Windows host.
expr: |
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -18,9 +18,9 @@ groups:
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High memory usage on Windows host.
expr: |
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
100 - ((windows_os_physical_memory_free_bytes{}
/
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
windows_cs_physical_memory_bytes{}) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -31,7 +31,7 @@ groups:
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
summary: Disk is almost full on Windows host.
expr: |
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
for: 15m
keep_firing_for: 5m
labels:
@ -42,7 +42,7 @@ groups:
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
summary: Windows service is not healthy.
expr: |
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
windows_service_status{status!~"starting|stopping|ok", } > 0
for: 5m
labels:
severity: critical
@ -52,7 +52,7 @@ groups:
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
summary: Windows physical disk is not healthy.
expr: |
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
windows_disk_drive_status{status="OK", } != 1
for: 5m
labels:
severity: critical
@ -62,7 +62,7 @@ groups:
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
summary: NTP client delay.
expr: |
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
windows_time_ntp_round_trip_delay_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -73,7 +73,7 @@ groups:
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
summary: NTP time offset is too large.
expr: |
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
windows_time_computed_time_offset_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -85,7 +85,7 @@ groups:
summary: There is a high number of pending replication operations in Active
Directory. A high number of pending operations sustained over a period of
time can indicate a problem with replication.
expr: "windows_ad_replication_pending_operations{job=~\".*windows.*\"} >= 50 \n"
expr: "windows_ad_replication_pending_operations{} >= 50 \n"
for: 10m
keep_firing_for: 5m
labels:
@ -97,7 +97,7 @@ groups:
summary: There are a number of replication synchronization request failures.
These can cause authentication failures, outdated information being propagated
across domain controllers, and potentially data loss or inconsistencies.
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{job=~\".*windows.*\"}[5m])
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{}[5m])
> 0 \n"
for: 5m
keep_firing_for: 5m
@ -111,7 +111,7 @@ groups:
summary: There is a high number of password changes. This may indicate unauthorized
changes or attacks.
expr: |
increase(windows_ad_sam_password_changes_total{job=~".*windows.*"}[5m]) > 25
increase(windows_ad_sam_password_changes_total{}[5m]) > 25
for: 5m
labels:
keep_firing_for: 24h

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -95,7 +95,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "sum(windows_ad_replication_pending_operations{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "sum(windows_ad_replication_pending_operations{job=~\"$job\",instance=~\"$instance\"})",
"legendFormat": "Operations"
}
],
@ -139,7 +139,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "sum(windows_ad_directory_service_threads{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "sum(windows_ad_directory_service_threads{job=~\"$job\",instance=~\"$instance\"})",
"legendFormat": "Directory service threads"
}
],
@ -159,7 +159,7 @@
},
"id": 3,
"options": {
"alertInstanceLabelFilter": "job=~\".*windows.*\",job=~\"${job:regex}\",instance=~\"${instance:regex}\""
"alertInstanceLabelFilter": "job=~\"${job:regex}\",instance=~\"${instance:regex}\""
},
"pluginVersion": "v10.0.0",
"title": "Windows Active Directory alerts",
@ -202,7 +202,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "sum(windows_ad_replication_pending_synchronizations{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "sum(windows_ad_replication_pending_synchronizations{job=~\"$job\",instance=~\"$instance\"})",
"legendFormat": "Operations"
}
],
@ -251,7 +251,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_binds_total{bind_method=~\"ldap\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_binds_total{bind_method=~\"ldap\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}}"
}
],
@ -300,7 +300,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_directory_operations_total{origin=~\"ldap\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_directory_operations_total{origin=~\"ldap\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} - {{ operation }}"
}
],
@ -565,7 +565,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_binds_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_binds_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} - {{ operation }}"
}
],
@ -721,7 +721,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_replication_data_intrasite_bytes_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
"expr": "rate(windows_ad_replication_data_intrasite_bytes_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
"legendFormat": "{{instance}} - {{ direction }}"
}
],
@ -776,7 +776,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_replication_data_intersite_bytes_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
"expr": "rate(windows_ad_replication_data_intersite_bytes_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
"legendFormat": "{{instance}} - {{ direction }}"
}
],
@ -829,7 +829,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_replication_inbound_objects_updated_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_replication_inbound_objects_updated_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} objects"
},
{
@ -837,7 +837,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_replication_inbound_properties_updated_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_replication_inbound_properties_updated_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} properties"
}
],
@ -972,7 +972,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_database_operations_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_database_operations_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} - {{ operation }}"
}
],
@ -1083,7 +1083,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "rate(windows_ad_database_operations_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "rate(windows_ad_database_operations_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{instance}} - {{ operation }}"
}
],
@ -1114,7 +1114,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(windows_ad_directory_service_threads{job=~\".*windows.*\"}, job)",
"query": "label_values(windows_ad_directory_service_threads{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -1128,7 +1128,7 @@
"label": "Instance",
"multi": true,
"name": "instance",
"query": "label_values(windows_ad_directory_service_threads{job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values(windows_ad_directory_service_threads{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -204,7 +204,7 @@
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "sum by (level) (count_over_time({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
"expr": "sum by (level) (count_over_time({job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
"legendFormat": "{{ level }}"
}
],
@ -246,7 +246,7 @@
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
"expr": "{job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
}
],
"title": "Logs",
@ -277,7 +277,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values({job=~\".*windows.*\"}, job)",
"query": "label_values({}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -292,7 +292,7 @@
"label": "Instance",
"multi": true,
"name": "instance",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values({,job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -307,7 +307,7 @@
"label": "Channel",
"multi": true,
"name": "channel",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, channel)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\"}, channel)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -322,7 +322,7 @@
"label": "Source",
"multi": true,
"name": "source",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -337,7 +337,7 @@
"label": "Keywords",
"multi": true,
"name": "keywords",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -352,7 +352,7 @@
"label": "Level",
"multi": true,
"name": "level",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@ groups:
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High CPU usage on Windows host.
expr: |
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -18,9 +18,9 @@ groups:
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High memory usage on Windows host.
expr: |
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
100 - ((windows_os_physical_memory_free_bytes{}
/
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
windows_cs_physical_memory_bytes{}) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -31,7 +31,7 @@ groups:
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
summary: Disk is almost full on Windows host.
expr: |
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
for: 15m
keep_firing_for: 5m
labels:
@ -42,7 +42,7 @@ groups:
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
summary: Windows service is not healthy.
expr: |
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
windows_service_status{status!~"starting|stopping|ok", } > 0
for: 5m
labels:
severity: critical
@ -52,7 +52,7 @@ groups:
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
summary: Windows physical disk is not healthy.
expr: |
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
windows_disk_drive_status{status="OK", } != 1
for: 5m
labels:
severity: critical
@ -62,7 +62,7 @@ groups:
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
summary: NTP client delay.
expr: |
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
windows_time_ntp_round_trip_delay_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -73,7 +73,7 @@ groups:
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
summary: NTP time offset is too large.
expr: |
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
windows_time_computed_time_offset_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -125,7 +125,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "{{ volume }} available"
}
],
@ -251,7 +251,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "TOTAL"
@ -261,7 +261,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"legendFormat": "{{ volume }} available",
@ -426,7 +426,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} read"
},
{
@ -434,7 +434,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} written"
},
{
@ -442,7 +442,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
"legendFormat": "{{ volume }} io util"
}
],
@ -507,7 +507,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"legendFormat": "{{ volume }} reads"
},
{
@ -515,7 +515,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"legendFormat": "{{ volume }} writes"
}
],
@ -580,7 +580,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"legendFormat": "{{ volume }} avg read time"
},
{
@ -588,7 +588,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
"legendFormat": "{{ volume }} avg write time"
}
],
@ -652,7 +652,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_avg_read_requests_queued{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_avg_read_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} read queue"
},
{
@ -660,7 +660,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_avg_write_requests_queued{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_avg_write_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} write queue"
}
],
@ -692,7 +692,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
"query": "label_values(windows_os_info{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -707,7 +707,7 @@
"label": "Instance",
"multi": false,
"name": "instance",
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -418,7 +418,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "OS Info"
@ -428,7 +428,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "time() - windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "time() - windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "Uptime"
@ -438,7 +438,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_logical_processors{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_cs_logical_processors{job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "Cores"
@ -448,7 +448,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"format": "table",
"instant": true,
"legendFormat": "CPU usage",
@ -459,7 +459,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"legendFormat": "Memory total",
@ -470,7 +470,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100",
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100",
"format": "table",
"instant": true,
"refId": "Memory usage"
@ -480,7 +480,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "Disk C: total"
@ -490,7 +490,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - windows_logical_disk_free_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100",
"expr": "100 - windows_logical_disk_free_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}*100",
"format": "table",
"instant": true,
"refId": "Disk C: used"
@ -500,7 +500,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "count by (instance) (max_over_time(ALERTS{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"critical\"}[1m])) * group by (instance) (windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "count by (instance) (max_over_time(ALERTS{job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"critical\"}[1m])) * group by (instance) (windows_os_info{job=~\"$job\",instance=~\"$instance\"})",
"format": "table",
"instant": true,
"refId": "CRITICAL"
@ -510,7 +510,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "count by (instance) (max_over_time(ALERTS{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"warning\"}[1m])) * group by (instance) (windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
"expr": "count by (instance) (max_over_time(ALERTS{job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"warning\"}[1m])) * group by (instance) (windows_os_info{job=~\"$job\",instance=~\"$instance\"})",
"format": "table",
"instant": true,
"refId": "WARNING"
@ -648,7 +648,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25,100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
"expr": "topk(25,100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
"legendFormat": "{{instance}}"
},
{
@ -656,7 +656,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "avg(100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
"expr": "avg(100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
"legendFormat": "Mean"
}
],
@ -753,7 +753,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25,100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100)",
"expr": "topk(25,100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100)",
"legendFormat": "{{instance}}"
},
{
@ -761,7 +761,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "avg(100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100)",
"expr": "avg(100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100)",
"legendFormat": "Mean"
}
],
@ -858,7 +858,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25,(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
"expr": "topk(25,(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
"legendFormat": "{{instance}}: {{volume}}"
},
{
@ -866,7 +866,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "avg((1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
"expr": "avg((1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
"legendFormat": "Mean"
}
],
@ -963,7 +963,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25,100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100)",
"expr": "topk(25,100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}*100)",
"legendFormat": "{{instance}}: {{volume}}"
},
{
@ -971,7 +971,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "avg(100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100)",
"expr": "avg(100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}*100)",
"legendFormat": "Mean"
}
],
@ -1024,7 +1024,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25, irate(windows_net_packets_outbound_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"expr": "topk(25, irate(windows_net_packets_outbound_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"legendFormat": "{{instance}}: {{ nic }} transmitted"
},
{
@ -1032,7 +1032,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25, irate(windows_net_packets_received_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"expr": "topk(25, irate(windows_net_packets_received_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"legendFormat": "{{instance}}: {{ nic }} received"
},
{
@ -1040,7 +1040,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25, irate(windows_net_packets_received_unknown_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"expr": "topk(25, irate(windows_net_packets_received_unknown_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"legendFormat": "{{instance}}: {{ nic }} received (unknown)"
},
{
@ -1048,7 +1048,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25, irate(windows_net_packets_outbound_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"expr": "topk(25, irate(windows_net_packets_outbound_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"legendFormat": "{{instance}}: {{ nic }} transmitted packets dropped"
},
{
@ -1056,7 +1056,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "topk(25, irate(windows_net_packets_received_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"expr": "topk(25, irate(windows_net_packets_received_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
"legendFormat": "{{instance}}: {{ nic }} received packets dropped"
}
],
@ -1088,7 +1088,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
"query": "label_values(windows_os_info{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -1103,7 +1103,7 @@
"label": "Instance",
"multi": true,
"name": "instance",
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -216,7 +216,7 @@
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "sum by (level) (count_over_time({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
"expr": "sum by (level) (count_over_time({job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
"legendFormat": "{{ level }}"
}
],
@ -258,7 +258,7 @@
"type": "loki",
"uid": "${loki_datasource}"
},
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
"expr": "{job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
}
],
"title": "Logs",
@ -289,7 +289,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values({job=~\".*windows.*\"}, job)",
"query": "label_values({}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -304,7 +304,7 @@
"label": "Instance",
"multi": true,
"name": "instance",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values({,job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -319,7 +319,7 @@
"label": "Channel",
"multi": true,
"name": "channel",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, channel)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\"}, channel)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -334,7 +334,7 @@
"label": "Source",
"multi": true,
"name": "source",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -349,7 +349,7 @@
"label": "Keywords",
"multi": true,
"name": "keywords",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -364,7 +364,7 @@
"label": "Level",
"multi": true,
"name": "level",
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -128,7 +128,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "time() - windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
"expr": "time() - windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}"
}
],
"title": "Uptime",
@ -172,7 +172,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
"format": "table"
}
],
@ -217,7 +217,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
"format": "table"
}
],
@ -262,7 +262,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
"format": "table"
}
],
@ -307,7 +307,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_logical_processors{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
"expr": "windows_cs_logical_processors{job=~\"$job\",instance=~\"$instance\"}"
}
],
"title": "CPU count",
@ -351,7 +351,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "Memory total"
}
],
@ -396,7 +396,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_paging_limit_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
"expr": "windows_os_paging_limit_bytes{job=~\"$job\",instance=~\"$instance\"}"
}
],
"title": "Pagefile size",
@ -440,7 +440,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}"
}
],
"title": "Disk C: size",
@ -498,7 +498,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"legendFormat": "CPU usage"
}
],
@ -553,7 +553,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"legendFormat": "CPU usage"
}
],
@ -612,7 +612,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100"
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100"
}
],
"title": "Memory usage",
@ -692,7 +692,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "Memory used"
},
{
@ -700,7 +700,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "Memory total"
}
],
@ -783,7 +783,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} read"
},
{
@ -791,7 +791,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ volume }} written"
},
{
@ -799,7 +799,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
"legendFormat": "{{ volume }} io util"
}
],
@ -925,7 +925,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"refId": "TOTAL"
@ -935,7 +935,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
"format": "table",
"instant": true,
"legendFormat": "{{ volume }} available",
@ -1107,7 +1107,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_bytes_received_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
"expr": "irate(windows_net_bytes_received_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
"legendFormat": "{{ nic }} received"
},
{
@ -1115,7 +1115,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_bytes_sent_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
"expr": "irate(windows_net_bytes_sent_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
"legendFormat": "{{ nic }} transmitted"
}
],
@ -1182,7 +1182,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_packets_outbound_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_net_packets_outbound_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ nic }} transmitted"
},
{
@ -1190,7 +1190,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_packets_received_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_net_packets_received_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ nic }} received"
},
{
@ -1198,7 +1198,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_packets_received_unknown_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_net_packets_received_unknown_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ nic }} received (unknown)"
},
{
@ -1206,7 +1206,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_packets_outbound_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_net_packets_outbound_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ nic }} transmitted packets dropped"
},
{
@ -1214,7 +1214,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_net_packets_received_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_net_packets_received_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "{{ nic }} received packets dropped"
}
],
@ -1246,7 +1246,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
"query": "label_values(windows_os_info{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -1261,7 +1261,7 @@
"label": "Instance",
"multi": false,
"name": "instance",
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -7,7 +7,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
"hide": true,
"iconColor": "light-purple",
"name": "Critical system event",
@ -21,7 +21,7 @@
"uid": "${prometheus_datasource}"
},
"enable": true,
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
"hide": true,
"iconColor": "light-yellow",
"name": "Reboot",
@ -35,7 +35,7 @@
"uid": "${loki_datasource}"
},
"enable": true,
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
"hide": true,
"iconColor": "light-orange",
"name": "Service failed",
@ -122,7 +122,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"legendFormat": "CPU usage"
}
],
@ -177,7 +177,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
"legendFormat": "CPU usage"
}
],
@ -293,7 +293,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "sum by(instance, mode) (irate(windows_cpu_time_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) \n/ on(instance) \ngroup_left sum by (instance) ((irate(windows_cpu_time_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))) * 100\n",
"expr": "sum by(instance, mode) (irate(windows_cpu_time_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) \n/ on(instance) \ngroup_left sum by (instance) ((irate(windows_cpu_time_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))) * 100\n",
"legendFormat": "{{ mode }}"
}
],
@ -342,7 +342,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_system_processor_queue_length{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}\n",
"expr": "windows_system_processor_queue_length{job=~\"$job\",instance=~\"$instance\"}\n",
"legendFormat": "CPU average queue"
}
],
@ -391,7 +391,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "irate(windows_system_context_switches_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"expr": "irate(windows_system_context_switches_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
"legendFormat": "Context switches"
},
{
@ -399,7 +399,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "sum without (core) (irate(windows_cpu_interrupts_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
"expr": "sum without (core) (irate(windows_cpu_interrupts_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
"legendFormat": "Interrupts"
}
],
@ -456,7 +456,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_os_timezone{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_os_timezone{job=~\"$job\",instance=~\"$instance\"}",
"format": "table"
}
],
@ -509,7 +509,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "clamp_max(windows_time_ntp_client_time_sources{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, 1)",
"expr": "clamp_max(windows_time_ntp_client_time_sources{job=~\"$job\",instance=~\"$instance\"}, 1)",
"legendFormat": "NTP status"
}
],
@ -559,7 +559,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_time_ntp_round_trip_delay_seconds{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_time_ntp_round_trip_delay_seconds{job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "NTP trip delay"
},
{
@ -567,7 +567,7 @@
"type": "prometheus",
"uid": "${prometheus_datasource}"
},
"expr": "windows_time_computed_time_offset_seconds{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
"expr": "windows_time_computed_time_offset_seconds{job=~\"$job\",instance=~\"$instance\"}",
"legendFormat": "Time offset"
}
],
@ -599,7 +599,7 @@
"label": "Job",
"multi": true,
"name": "job",
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
"query": "label_values(windows_os_info{}, job)",
"refresh": 2,
"sort": 1,
"type": "query"
@ -614,7 +614,7 @@
"label": "Instance",
"multi": false,
"name": "instance",
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
"refresh": 2,
"sort": 1,
"type": "query"

View file

@ -26,8 +26,8 @@ annotations:
description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
(id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without (id)
(jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
@ -43,7 +43,7 @@ annotations:
are in a cyclic dependency with each other. The restart is required to resolve
the deadlock.'
summary: JVM deadlock detected.
expr: (jvm_threads_deadlocked{job!=""}) > 0
expr: (jvm_threads_deadlocked{}) > 0
for: 2m
keep_firing_for: 5m
labels:

View file

@ -26,7 +26,7 @@ annotations:
description: 'Kafka lag keeps increasing over the last 15 minutes for consumer group:
{{$labels.consumergroup}}, topic: {{$labels.topic}}.'
summary: Kafka lag keeps increasing.
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"}[5m]))
> 0
for: 15m
keep_firing_for: 10m
@ -42,7 +42,7 @@ annotations:
description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
$value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
summary: Kafka lag is too high.
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"})
> 100
for: 15m
keep_firing_for: 5m
@ -183,8 +183,8 @@ annotations:
description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
}} has disconected from Zookeeper.
summary: Kafka Zookeeper sync disconected.
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
quantile="0.95"}[5m])) < 0
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile="0.95",job="integrations/kafka"}[5m]))
< 0
for: 5m
labels:
severity: critical

View file

@ -27,7 +27,7 @@ annotations:
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High CPU usage on Windows host.
expr: |
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -43,9 +43,9 @@ annotations:
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High memory usage on Windows host.
expr: |
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
100 - ((windows_os_physical_memory_free_bytes{}
/
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
windows_cs_physical_memory_bytes{}) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -61,7 +61,7 @@ annotations:
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
summary: Disk is almost full on Windows host.
expr: |
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
for: 15m
keep_firing_for: 5m
labels:
@ -77,7 +77,7 @@ annotations:
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
summary: Windows service is not healthy.
expr: |
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
windows_service_status{status!~"starting|stopping|ok", } > 0
for: 5m
labels:
severity: critical
@ -92,7 +92,7 @@ annotations:
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
summary: Windows physical disk is not healthy.
expr: |
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
windows_disk_drive_status{status="OK", } != 1
for: 5m
labels:
severity: critical
@ -107,7 +107,7 @@ annotations:
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
summary: NTP client delay.
expr: |
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
windows_time_ntp_round_trip_delay_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -123,7 +123,7 @@ annotations:
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
summary: NTP time offset is too large.
expr: |
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
windows_time_computed_time_offset_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -140,7 +140,7 @@ annotations:
summary: There is a high number of pending replication operations in Active Directory.
A high number of pending operations sustained over a period of time can indicate
a problem with replication.
expr: "windows_ad_replication_pending_operations{job=~\".*windows.*\"} >= 50
expr: "windows_ad_replication_pending_operations{} >= 50
"
for: 10m
keep_firing_for: 5m
@ -158,7 +158,7 @@ annotations:
summary: There are a number of replication synchronization request failures. These
can cause authentication failures, outdated information being propagated across
domain controllers, and potentially data loss or inconsistencies.
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{job=~\".*windows.*\"}[5m])
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{}[5m])
> 0
"
for: 5m
@ -178,7 +178,7 @@ annotations:
summary: There is a high number of password changes. This may indicate unauthorized
changes or attacks.
expr: |
increase(windows_ad_sam_password_changes_total{job=~".*windows.*"}[5m]) > 25
increase(windows_ad_sam_password_changes_total{}[5m]) > 25
for: 5m
labels:
keep_firing_for: 24h

View file

@ -27,7 +27,7 @@ annotations:
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High CPU usage on Windows host.
expr: |
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -43,9 +43,9 @@ annotations:
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
summary: High memory usage on Windows host.
expr: |
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
100 - ((windows_os_physical_memory_free_bytes{}
/
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
windows_cs_physical_memory_bytes{}) * 100) > 90
for: 15m
keep_firing_for: 5m
labels:
@ -61,7 +61,7 @@ annotations:
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
summary: Disk is almost full on Windows host.
expr: |
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
for: 15m
keep_firing_for: 5m
labels:
@ -77,7 +77,7 @@ annotations:
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
summary: Windows service is not healthy.
expr: |
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
windows_service_status{status!~"starting|stopping|ok", } > 0
for: 5m
labels:
severity: critical
@ -92,7 +92,7 @@ annotations:
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
summary: Windows physical disk is not healthy.
expr: |
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
windows_disk_drive_status{status="OK", } != 1
for: 5m
labels:
severity: critical
@ -107,7 +107,7 @@ annotations:
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
summary: NTP client delay.
expr: |
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
windows_time_ntp_round_trip_delay_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels:
@ -123,7 +123,7 @@ annotations:
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
summary: NTP time offset is too large.
expr: |
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
windows_time_computed_time_offset_seconds{} > 1
for: 5m
keep_firing_for: 5m
labels: