mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
assets,site/content: daily assets regeneration
This commit is contained in:
parent
82075a7fb7
commit
2355980377
19 changed files with 262 additions and 262 deletions
|
@ -6,8 +6,8 @@ groups:
|
|||
description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
|
||||
last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
|
||||
summary: JVM heap memory filling up.
|
||||
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
|
||||
(id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
|
||||
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without
|
||||
(id) (jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -18,7 +18,7 @@ groups:
|
|||
are in a cyclic dependency with each other. The restart is required to resolve
|
||||
the deadlock.'
|
||||
summary: JVM deadlock detected.
|
||||
expr: (jvm_threads_deadlocked{job!=""}) > 0
|
||||
expr: (jvm_threads_deadlocked{}) > 0
|
||||
for: 2m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
|
|
@ -51,7 +51,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
|
||||
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory used(heap)",
|
||||
|
@ -100,7 +100,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
|
||||
"expr": "((avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n))/(avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n))) * 100",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory used(nonheap)",
|
||||
|
@ -151,7 +151,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_threads_current{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_threads_current{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Threads",
|
||||
|
@ -202,7 +202,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_classes_loaded{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_classes_loaded{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Classes loaded",
|
||||
|
@ -284,7 +284,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "time()-process_start_time_seconds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "time()-process_start_time_seconds{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Uptime",
|
||||
|
@ -344,7 +344,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,instance) (\n process_start_time_seconds{job!=\"\",job=~\"$job\",instance=~\"$instance\"} * 1000\n)",
|
||||
"expr": "avg by (job,instance) (\n process_start_time_seconds{job=~\"$job\",instance=~\"$instance\"} * 1000\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Process start time",
|
||||
|
@ -487,7 +487,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,instance) (\n rate(process_cpu_seconds_total{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 100\n)",
|
||||
"expr": "avg by (job,instance) (\n rate(process_cpu_seconds_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 100\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: CPU usage (process)",
|
||||
|
@ -582,7 +582,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,instance) (\n process_resident_memory_bytes{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job,instance) (\n process_resident_memory_bytes{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Process memory used (rss)",
|
||||
|
@ -658,7 +658,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,instance) (\n process_open_fds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job,instance) (\n process_open_fds{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Process files open",
|
||||
|
@ -669,7 +669,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,instance) (\n process_max_fds{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job,instance) (\n process_max_fds{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Process files max",
|
||||
|
@ -801,7 +801,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory used(heap)",
|
||||
|
@ -812,7 +812,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory max(heap)",
|
||||
|
@ -823,7 +823,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"heap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"heap\", job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory committed(heap)",
|
||||
|
@ -944,7 +944,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_used_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory used(nonheap)",
|
||||
|
@ -955,7 +955,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_max_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"} != -1)\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory max(nonheap)",
|
||||
|
@ -966,7 +966,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"nonheap\", job!=\"\",job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"expr": "avg by (job) (\n sum without (id) (jvm_memory_committed_bytes{area=\"nonheap\", job=~\"$job\",instance=~\"$instance\"})\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: JVM memory committed(nonheap)",
|
||||
|
@ -1098,7 +1098,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Eden Space (used)",
|
||||
|
@ -1109,7 +1109,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Eden Space (max)",
|
||||
|
@ -1120,7 +1120,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Eden Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Eden Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Eden Space (committed)",
|
||||
|
@ -1239,7 +1239,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Survival space (used)",
|
||||
|
@ -1250,7 +1250,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"} != -1\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_max_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"} != -1\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Survival space (max)",
|
||||
|
@ -1261,7 +1261,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Survivor Space\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=~\"(G1 |PS )?Survivor Space\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Survival space (committed)",
|
||||
|
@ -1380,7 +1380,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Tenured space (used)",
|
||||
|
@ -1391,7 +1391,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "\n avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)\n != -1",
|
||||
"expr": "\n avg by (job) (\n jvm_memory_pool_used_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)\n != -1",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Tenured space (max)",
|
||||
|
@ -1402,7 +1402,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=\"PS Old Gen\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_memory_pool_committed_bytes{pool=\"PS Old Gen\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Tenured space (committed)",
|
||||
|
@ -1466,7 +1466,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,gc) (\n rate(jvm_gc_collection_seconds_count{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,gc) (\n rate(jvm_gc_collection_seconds_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{gc}}",
|
||||
|
@ -1530,7 +1530,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job, gc) (\n rate(jvm_gc_collection_seconds_sum{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n /\n rate(jvm_gc_collection_seconds_count{job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)\n",
|
||||
"expr": "avg by (job, gc) (\n rate(jvm_gc_collection_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n /\n rate(jvm_gc_collection_seconds_count{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)\n",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ job }}: {{ gc }} (avg)",
|
||||
|
@ -1625,7 +1625,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n rate(jvm_memory_pool_allocated_bytes_total{pool=~\"(G1 |PS )?Eden Space\",job!=\"\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job) (\n rate(jvm_memory_pool_allocated_bytes_total{pool=~\"(G1 |PS )?Eden Space\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Allocated (bytes)",
|
||||
|
@ -1766,7 +1766,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_threads_current{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_threads_current{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Threads",
|
||||
|
@ -1777,7 +1777,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_threads_daemon{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_threads_daemon{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Threads (daemon)",
|
||||
|
@ -1788,7 +1788,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_threads_peak{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_threads_peak{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Threads (peak)",
|
||||
|
@ -1799,7 +1799,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_threads_deadlocked{job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_threads_deadlocked{job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Threads (deadlocked)",
|
||||
|
@ -1863,7 +1863,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (state, job) (jvm_threads_state{job!=\"\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "sum by (state, job) (jvm_threads_state{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ state }}",
|
||||
|
@ -1982,7 +1982,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"direct\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"direct\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Direct buffer used bytes",
|
||||
|
@ -1993,7 +1993,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"direct\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"direct\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Direct buffer capacity",
|
||||
|
@ -2100,7 +2100,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"mapped\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_used_bytes{pool=\"mapped\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Mapped buffer used",
|
||||
|
@ -2111,7 +2111,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"mapped\", job!=\"\",job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job) (\n jvm_buffer_pool_capacity_bytes{pool=\"mapped\", job=~\"$job\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{job}}: Mapped buffer capacity",
|
||||
|
@ -2147,7 +2147,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(jvm_memory_used_bytes{job!=\"\"}, job)",
|
||||
"query": "label_values(jvm_memory_used_bytes{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -2162,7 +2162,7 @@
|
|||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"query": "label_values(jvm_memory_used_bytes{job!=\"\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(jvm_memory_used_bytes{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -6,7 +6,7 @@ groups:
|
|||
description: 'Kafka lag keeps increasing over the last 15 minutes for consumer
|
||||
group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
|
||||
summary: Kafka lag keeps increasing.
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"}[5m]))
|
||||
> 0
|
||||
for: 15m
|
||||
keep_firing_for: 10m
|
||||
|
@ -17,7 +17,7 @@ groups:
|
|||
description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
|
||||
$value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
|
||||
summary: Kafka lag is too high.
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"})
|
||||
> 100
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
|
@ -118,8 +118,8 @@ groups:
|
|||
description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
|
||||
}} has disconnected from Zookeeper.
|
||||
summary: Kafka Zookeeper sync disconnected.
|
||||
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
|
||||
quantile="0.95"}[5m])) < 0
|
||||
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile="0.95",job="integrations/kafka"}[5m]))
|
||||
< 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
@ -1495,7 +1495,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "request queue time",
|
||||
|
@ -1506,7 +1506,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "local time",
|
||||
|
@ -1517,7 +1517,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "remote time",
|
||||
|
@ -1528,7 +1528,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response queue time",
|
||||
|
@ -1539,7 +1539,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Produce\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response time",
|
||||
|
@ -1656,7 +1656,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "request queue time",
|
||||
|
@ -1667,7 +1667,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "local time",
|
||||
|
@ -1678,7 +1678,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "remote time",
|
||||
|
@ -1689,7 +1689,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response queue time",
|
||||
|
@ -1700,7 +1700,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"FetchFollower\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response time",
|
||||
|
@ -1817,7 +1817,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_requestqueuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "request queue time",
|
||||
|
@ -1828,7 +1828,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_localtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "local time",
|
||||
|
@ -1839,7 +1839,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_remotetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "remote time",
|
||||
|
@ -1850,7 +1850,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsequeuetimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response queue time",
|
||||
|
@ -1861,7 +1861,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_network_requestmetrics_responsesendtimems{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\", request=\"Fetch\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "response time",
|
||||
|
@ -2079,7 +2079,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Zookeeper request latency",
|
||||
|
@ -2179,7 +2179,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Zookeeper connections",
|
||||
|
@ -2190,7 +2190,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperexpirespersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperexpirespersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Zookeeper expired connections",
|
||||
|
@ -2201,7 +2201,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperdisconnectspersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperdisconnectspersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Zookeeper disconnects",
|
||||
|
@ -2212,7 +2212,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperauthfailurespersec{job=\"integrations/kafka\", quantile=\"0.95\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,instance) (\n rate(kafka_server_sessionexpirelistener_zookeeperauthfailurespersec{quantile=\"0.95\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{instance}}: Zookeeper auth failures",
|
||||
|
|
|
@ -95,7 +95,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -106,7 +106,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logendoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logendoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -117,7 +117,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,topic,partition) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"expr": "sum by (job,kafka_cluster,topic,partition) (\n rate(kafka_topic_partition_current_offset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -128,7 +128,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_size{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_size{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -223,7 +223,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_topic_partition_current_offset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -289,7 +289,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesinpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesinpersec{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -355,7 +355,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesoutpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesoutpersec{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ topic }}",
|
||||
|
@ -435,7 +435,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -446,7 +446,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -457,7 +457,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -552,7 +552,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -616,7 +616,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -680,7 +680,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"legendFormat": "{{ consumergroup }} ({{ topic }})",
|
||||
|
@ -714,7 +714,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\"}, job)",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\"}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -729,7 +729,7 @@
|
|||
"label": "Kafka_cluster",
|
||||
"multi": true,
|
||||
"name": "kafka_cluster",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\"}, kafka_cluster)",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\"}, kafka_cluster)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -744,7 +744,7 @@
|
|||
"label": "Topic",
|
||||
"multi": true,
|
||||
"name": "topic",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\"}, topic)",
|
||||
"query": "label_values(kafka_log_log_logstartoffset{topic!=\"__consumer_offsets\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\"}, topic)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -759,7 +759,7 @@
|
|||
"label": "Consumergroup",
|
||||
"multi": true,
|
||||
"name": "consumergroup",
|
||||
"query": "label_values(kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}, consumergroup)",
|
||||
"query": "label_values(kafka_consumergroup_uncommitted_offsets{topic!=\"__consumer_offsets\",consumergroup!=\"\",job=\"integrations/kafka\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}, consumergroup)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -573,8 +573,8 @@
|
|||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"max": "150",
|
||||
"min": "0",
|
||||
"max": 150,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
|
@ -617,8 +617,8 @@
|
|||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"max": "150",
|
||||
"min": "0",
|
||||
"max": 150,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{
|
||||
|
|
|
@ -7,7 +7,7 @@ groups:
|
|||
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High CPU usage on Windows host.
|
||||
expr: |
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -18,9 +18,9 @@ groups:
|
|||
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High memory usage on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
|
||||
100 - ((windows_os_physical_memory_free_bytes{}
|
||||
/
|
||||
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
|
||||
windows_cs_physical_memory_bytes{}) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -31,7 +31,7 @@ groups:
|
|||
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
|
||||
summary: Disk is almost full on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
|
||||
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -42,7 +42,7 @@ groups:
|
|||
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
|
||||
summary: Windows service is not healthy.
|
||||
expr: |
|
||||
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
|
||||
windows_service_status{status!~"starting|stopping|ok", } > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -52,7 +52,7 @@ groups:
|
|||
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
|
||||
summary: Windows physical disk is not healthy.
|
||||
expr: |
|
||||
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
|
||||
windows_disk_drive_status{status="OK", } != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -62,7 +62,7 @@ groups:
|
|||
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
|
||||
summary: NTP client delay.
|
||||
expr: |
|
||||
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_ntp_round_trip_delay_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -73,7 +73,7 @@ groups:
|
|||
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
|
||||
summary: NTP time offset is too large.
|
||||
expr: |
|
||||
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_computed_time_offset_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -85,7 +85,7 @@ groups:
|
|||
summary: There is a high number of pending replication operations in Active
|
||||
Directory. A high number of pending operations sustained over a period of
|
||||
time can indicate a problem with replication.
|
||||
expr: "windows_ad_replication_pending_operations{job=~\".*windows.*\"} >= 50 \n"
|
||||
expr: "windows_ad_replication_pending_operations{} >= 50 \n"
|
||||
for: 10m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -97,7 +97,7 @@ groups:
|
|||
summary: There are a number of replication synchronization request failures.
|
||||
These can cause authentication failures, outdated information being propagated
|
||||
across domain controllers, and potentially data loss or inconsistencies.
|
||||
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{job=~\".*windows.*\"}[5m])
|
||||
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{}[5m])
|
||||
> 0 \n"
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
|
@ -111,7 +111,7 @@ groups:
|
|||
summary: There is a high number of password changes. This may indicate unauthorized
|
||||
changes or attacks.
|
||||
expr: |
|
||||
increase(windows_ad_sam_password_changes_total{job=~".*windows.*"}[5m]) > 25
|
||||
increase(windows_ad_sam_password_changes_total{}[5m]) > 25
|
||||
for: 5m
|
||||
labels:
|
||||
keep_firing_for: 24h
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -95,7 +95,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "sum(windows_ad_replication_pending_operations{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "sum(windows_ad_replication_pending_operations{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"legendFormat": "Operations"
|
||||
}
|
||||
],
|
||||
|
@ -139,7 +139,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "sum(windows_ad_directory_service_threads{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "sum(windows_ad_directory_service_threads{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"legendFormat": "Directory service threads"
|
||||
}
|
||||
],
|
||||
|
@ -159,7 +159,7 @@
|
|||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"alertInstanceLabelFilter": "job=~\".*windows.*\",job=~\"${job:regex}\",instance=~\"${instance:regex}\""
|
||||
"alertInstanceLabelFilter": "job=~\"${job:regex}\",instance=~\"${instance:regex}\""
|
||||
},
|
||||
"pluginVersion": "v10.0.0",
|
||||
"title": "Windows Active Directory alerts",
|
||||
|
@ -202,7 +202,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "sum(windows_ad_replication_pending_synchronizations{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "sum(windows_ad_replication_pending_synchronizations{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"legendFormat": "Operations"
|
||||
}
|
||||
],
|
||||
|
@ -251,7 +251,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_binds_total{bind_method=~\"ldap\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_binds_total{bind_method=~\"ldap\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
|
@ -300,7 +300,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_directory_operations_total{origin=~\"ldap\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_directory_operations_total{origin=~\"ldap\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} - {{ operation }}"
|
||||
}
|
||||
],
|
||||
|
@ -565,7 +565,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_binds_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_binds_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} - {{ operation }}"
|
||||
}
|
||||
],
|
||||
|
@ -721,7 +721,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_replication_data_intrasite_bytes_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
|
||||
"expr": "rate(windows_ad_replication_data_intrasite_bytes_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
|
||||
"legendFormat": "{{instance}} - {{ direction }}"
|
||||
}
|
||||
],
|
||||
|
@ -776,7 +776,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_replication_data_intersite_bytes_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
|
||||
"expr": "rate(windows_ad_replication_data_intersite_bytes_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]) * 8",
|
||||
"legendFormat": "{{instance}} - {{ direction }}"
|
||||
}
|
||||
],
|
||||
|
@ -829,7 +829,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_replication_inbound_objects_updated_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_replication_inbound_objects_updated_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} objects"
|
||||
},
|
||||
{
|
||||
|
@ -837,7 +837,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_replication_inbound_properties_updated_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_replication_inbound_properties_updated_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} properties"
|
||||
}
|
||||
],
|
||||
|
@ -972,7 +972,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_database_operations_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_database_operations_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} - {{ operation }}"
|
||||
}
|
||||
],
|
||||
|
@ -1083,7 +1083,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "rate(windows_ad_database_operations_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "rate(windows_ad_database_operations_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{instance}} - {{ operation }}"
|
||||
}
|
||||
],
|
||||
|
@ -1114,7 +1114,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(windows_ad_directory_service_threads{job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values(windows_ad_directory_service_threads{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -1128,7 +1128,7 @@
|
|||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"query": "label_values(windows_ad_directory_service_threads{job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(windows_ad_directory_service_threads{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -204,7 +204,7 @@
|
|||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"expr": "sum by (level) (count_over_time({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
|
||||
"expr": "sum by (level) (count_over_time({job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
|
||||
"legendFormat": "{{ level }}"
|
||||
}
|
||||
],
|
||||
|
@ -246,7 +246,7 @@
|
|||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
|
||||
}
|
||||
],
|
||||
"title": "Logs",
|
||||
|
@ -277,7 +277,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values({job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values({}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -292,7 +292,7 @@
|
|||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values({,job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -307,7 +307,7 @@
|
|||
"label": "Channel",
|
||||
"multi": true,
|
||||
"name": "channel",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, channel)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\"}, channel)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -322,7 +322,7 @@
|
|||
"label": "Source",
|
||||
"multi": true,
|
||||
"name": "source",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -337,7 +337,7 @@
|
|||
"label": "Keywords",
|
||||
"multi": true,
|
||||
"name": "keywords",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -352,7 +352,7 @@
|
|||
"label": "Level",
|
||||
"multi": true,
|
||||
"name": "level",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@ groups:
|
|||
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High CPU usage on Windows host.
|
||||
expr: |
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -18,9 +18,9 @@ groups:
|
|||
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High memory usage on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
|
||||
100 - ((windows_os_physical_memory_free_bytes{}
|
||||
/
|
||||
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
|
||||
windows_cs_physical_memory_bytes{}) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -31,7 +31,7 @@ groups:
|
|||
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
|
||||
summary: Disk is almost full on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
|
||||
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -42,7 +42,7 @@ groups:
|
|||
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
|
||||
summary: Windows service is not healthy.
|
||||
expr: |
|
||||
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
|
||||
windows_service_status{status!~"starting|stopping|ok", } > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -52,7 +52,7 @@ groups:
|
|||
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
|
||||
summary: Windows physical disk is not healthy.
|
||||
expr: |
|
||||
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
|
||||
windows_disk_drive_status{status="OK", } != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -62,7 +62,7 @@ groups:
|
|||
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
|
||||
summary: NTP client delay.
|
||||
expr: |
|
||||
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_ntp_round_trip_delay_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -73,7 +73,7 @@ groups:
|
|||
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
|
||||
summary: NTP time offset is too large.
|
||||
expr: |
|
||||
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_computed_time_offset_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -125,7 +125,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "{{ volume }} available"
|
||||
}
|
||||
],
|
||||
|
@ -251,7 +251,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "TOTAL"
|
||||
|
@ -261,7 +261,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{ volume }} available",
|
||||
|
@ -426,7 +426,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} read"
|
||||
},
|
||||
{
|
||||
|
@ -434,7 +434,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} written"
|
||||
},
|
||||
{
|
||||
|
@ -442,7 +442,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
|
||||
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
|
||||
"legendFormat": "{{ volume }} io util"
|
||||
}
|
||||
],
|
||||
|
@ -507,7 +507,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"expr": "irate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"legendFormat": "{{ volume }} reads"
|
||||
},
|
||||
{
|
||||
|
@ -515,7 +515,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"expr": "irate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"legendFormat": "{{ volume }} writes"
|
||||
}
|
||||
],
|
||||
|
@ -580,7 +580,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"expr": "irate(windows_logical_disk_read_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_reads_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"legendFormat": "{{ volume }} avg read time"
|
||||
},
|
||||
{
|
||||
|
@ -588,7 +588,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"expr": "irate(windows_logical_disk_write_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n/\nirate(windows_logical_disk_writes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])\n",
|
||||
"legendFormat": "{{ volume }} avg write time"
|
||||
}
|
||||
],
|
||||
|
@ -652,7 +652,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_avg_read_requests_queued{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_avg_read_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} read queue"
|
||||
},
|
||||
{
|
||||
|
@ -660,7 +660,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_avg_write_requests_queued{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_avg_write_requests_queued{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} write queue"
|
||||
}
|
||||
],
|
||||
|
@ -692,7 +692,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values(windows_os_info{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -707,7 +707,7 @@
|
|||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -418,7 +418,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "OS Info"
|
||||
|
@ -428,7 +428,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "time() - windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "time() - windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Uptime"
|
||||
|
@ -438,7 +438,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_logical_processors{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_cs_logical_processors{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Cores"
|
||||
|
@ -448,7 +448,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "CPU usage",
|
||||
|
@ -459,7 +459,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "Memory total",
|
||||
|
@ -470,7 +470,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100",
|
||||
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Memory usage"
|
||||
|
@ -480,7 +480,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Disk C: total"
|
||||
|
@ -490,7 +490,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - windows_logical_disk_free_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100",
|
||||
"expr": "100 - windows_logical_disk_free_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}*100",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Disk C: used"
|
||||
|
@ -500,7 +500,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "count by (instance) (max_over_time(ALERTS{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"critical\"}[1m])) * group by (instance) (windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "count by (instance) (max_over_time(ALERTS{job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"critical\"}[1m])) * group by (instance) (windows_os_info{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "CRITICAL"
|
||||
|
@ -510,7 +510,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "count by (instance) (max_over_time(ALERTS{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"warning\"}[1m])) * group by (instance) (windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"})",
|
||||
"expr": "count by (instance) (max_over_time(ALERTS{job=~\"$job\",instance=~\"$instance\", alertstate=\"firing\", severity=\"warning\"}[1m])) * group by (instance) (windows_os_info{job=~\"$job\",instance=~\"$instance\"})",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "WARNING"
|
||||
|
@ -648,7 +648,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25,100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
|
||||
"expr": "topk(25,100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
|
||||
"legendFormat": "{{instance}}"
|
||||
},
|
||||
{
|
||||
|
@ -656,7 +656,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "avg(100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
|
||||
"expr": "avg(100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100)))",
|
||||
"legendFormat": "Mean"
|
||||
}
|
||||
],
|
||||
|
@ -753,7 +753,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25,100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100)",
|
||||
"expr": "topk(25,100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100)",
|
||||
"legendFormat": "{{instance}}"
|
||||
},
|
||||
{
|
||||
|
@ -761,7 +761,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "avg(100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100)",
|
||||
"expr": "avg(100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100)",
|
||||
"legendFormat": "Mean"
|
||||
}
|
||||
],
|
||||
|
@ -858,7 +858,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25,(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
|
||||
"expr": "topk(25,(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
|
||||
"legendFormat": "{{instance}}: {{volume}}"
|
||||
},
|
||||
{
|
||||
|
@ -866,7 +866,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "avg((1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
|
||||
"expr": "avg((1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100)",
|
||||
"legendFormat": "Mean"
|
||||
}
|
||||
],
|
||||
|
@ -963,7 +963,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25,100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100)",
|
||||
"expr": "topk(25,100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}*100)",
|
||||
"legendFormat": "{{instance}}: {{volume}}"
|
||||
},
|
||||
{
|
||||
|
@ -971,7 +971,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "avg(100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*100)",
|
||||
"expr": "avg(100 - windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}/windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}*100)",
|
||||
"legendFormat": "Mean"
|
||||
}
|
||||
],
|
||||
|
@ -1024,7 +1024,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25, irate(windows_net_packets_outbound_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"expr": "topk(25, irate(windows_net_packets_outbound_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"legendFormat": "{{instance}}: {{ nic }} transmitted"
|
||||
},
|
||||
{
|
||||
|
@ -1032,7 +1032,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25, irate(windows_net_packets_received_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"expr": "topk(25, irate(windows_net_packets_received_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"legendFormat": "{{instance}}: {{ nic }} received"
|
||||
},
|
||||
{
|
||||
|
@ -1040,7 +1040,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25, irate(windows_net_packets_received_unknown_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"expr": "topk(25, irate(windows_net_packets_received_unknown_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"legendFormat": "{{instance}}: {{ nic }} received (unknown)"
|
||||
},
|
||||
{
|
||||
|
@ -1048,7 +1048,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25, irate(windows_net_packets_outbound_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"expr": "topk(25, irate(windows_net_packets_outbound_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"legendFormat": "{{instance}}: {{ nic }} transmitted packets dropped"
|
||||
},
|
||||
{
|
||||
|
@ -1056,7 +1056,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "topk(25, irate(windows_net_packets_received_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"expr": "topk(25, irate(windows_net_packets_received_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))>0.5",
|
||||
"legendFormat": "{{instance}}: {{ nic }} received packets dropped"
|
||||
}
|
||||
],
|
||||
|
@ -1088,7 +1088,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values(windows_os_info{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -1103,7 +1103,7 @@
|
|||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -216,7 +216,7 @@
|
|||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"expr": "sum by (level) (count_over_time({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
|
||||
"expr": "sum by (level) (count_over_time({job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"}\n|~ \"$regex_search\"\n| json | __error__=``\n[$__auto]))\n",
|
||||
"legendFormat": "{{ level }}"
|
||||
}
|
||||
],
|
||||
|
@ -258,7 +258,7 @@
|
|||
"type": "loki",
|
||||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\",level=~\"$level\"} \n|~ \"$regex_search\"\n| json | __error__=``\n| label_format timestamp=\"{{__timestamp__}}\"\n| drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted\n| line_format `{{ if eq \"[[instance]]\" \".*\" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`\n\n"
|
||||
}
|
||||
],
|
||||
"title": "Logs",
|
||||
|
@ -289,7 +289,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values({job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values({}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -304,7 +304,7 @@
|
|||
"label": "Instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values({,job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -319,7 +319,7 @@
|
|||
"label": "Channel",
|
||||
"multi": true,
|
||||
"name": "channel",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, channel)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\"}, channel)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -334,7 +334,7 @@
|
|||
"label": "Source",
|
||||
"multi": true,
|
||||
"name": "source",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\"}, source)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -349,7 +349,7 @@
|
|||
"label": "Keywords",
|
||||
"multi": true,
|
||||
"name": "keywords",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\"}, keywords)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -364,7 +364,7 @@
|
|||
"label": "Level",
|
||||
"multi": true,
|
||||
"name": "level",
|
||||
"query": "label_values({job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
|
||||
"query": "label_values({,job=~\"$job\",instance=~\"$instance\",channel=~\"$channel\",source=~\"$source\",keywords=~\"$keywords\"}, level)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -128,7 +128,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "time() - windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
|
||||
"expr": "time() - windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}"
|
||||
}
|
||||
],
|
||||
"title": "Uptime",
|
||||
|
@ -172,7 +172,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table"
|
||||
}
|
||||
],
|
||||
|
@ -217,7 +217,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table"
|
||||
}
|
||||
],
|
||||
|
@ -262,7 +262,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_info{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_os_info{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table"
|
||||
}
|
||||
],
|
||||
|
@ -307,7 +307,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_logical_processors{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
|
||||
"expr": "windows_cs_logical_processors{job=~\"$job\",instance=~\"$instance\"}"
|
||||
}
|
||||
],
|
||||
"title": "CPU count",
|
||||
|
@ -351,7 +351,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "Memory total"
|
||||
}
|
||||
],
|
||||
|
@ -396,7 +396,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_paging_limit_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
|
||||
"expr": "windows_os_paging_limit_bytes{job=~\"$job\",instance=~\"$instance\"}"
|
||||
}
|
||||
],
|
||||
"title": "Pagefile size",
|
||||
|
@ -440,7 +440,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}"
|
||||
"expr": "windows_logical_disk_size_bytes{volume=\"C:\", job=~\"$job\",instance=~\"$instance\"}"
|
||||
}
|
||||
],
|
||||
"title": "Disk C: size",
|
||||
|
@ -498,7 +498,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"legendFormat": "CPU usage"
|
||||
}
|
||||
],
|
||||
|
@ -553,7 +553,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"legendFormat": "CPU usage"
|
||||
}
|
||||
],
|
||||
|
@ -612,7 +612,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} * 100"
|
||||
"expr": "100 - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"} / windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} * 100"
|
||||
}
|
||||
],
|
||||
"title": "Memory usage",
|
||||
|
@ -692,7 +692,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"} - windows_os_physical_memory_free_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"} - windows_os_physical_memory_free_bytes{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "Memory used"
|
||||
},
|
||||
{
|
||||
|
@ -700,7 +700,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_cs_physical_memory_bytes{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "Memory total"
|
||||
}
|
||||
],
|
||||
|
@ -783,7 +783,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_read_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} read"
|
||||
},
|
||||
{
|
||||
|
@ -791,7 +791,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_logical_disk_write_bytes_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ volume }} written"
|
||||
},
|
||||
{
|
||||
|
@ -799,7 +799,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
|
||||
"expr": "(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]),1)) * 100",
|
||||
"legendFormat": "{{ volume }} io util"
|
||||
}
|
||||
],
|
||||
|
@ -925,7 +925,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_size_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "TOTAL"
|
||||
|
@ -935,7 +935,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_logical_disk_free_bytes{volume!~\"HarddiskVolume.*\", job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{ volume }} available",
|
||||
|
@ -1107,7 +1107,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_bytes_received_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
|
||||
"expr": "irate(windows_net_bytes_received_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
|
||||
"legendFormat": "{{ nic }} received"
|
||||
},
|
||||
{
|
||||
|
@ -1115,7 +1115,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_bytes_sent_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
|
||||
"expr": "irate(windows_net_bytes_sent_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*8",
|
||||
"legendFormat": "{{ nic }} transmitted"
|
||||
}
|
||||
],
|
||||
|
@ -1182,7 +1182,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_packets_outbound_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_net_packets_outbound_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ nic }} transmitted"
|
||||
},
|
||||
{
|
||||
|
@ -1190,7 +1190,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_packets_received_errors_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_net_packets_received_errors_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ nic }} received"
|
||||
},
|
||||
{
|
||||
|
@ -1198,7 +1198,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_packets_received_unknown_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_net_packets_received_unknown_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ nic }} received (unknown)"
|
||||
},
|
||||
{
|
||||
|
@ -1206,7 +1206,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_packets_outbound_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_net_packets_outbound_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ nic }} transmitted packets dropped"
|
||||
},
|
||||
{
|
||||
|
@ -1214,7 +1214,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_net_packets_received_discarded_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_net_packets_received_discarded_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "{{ nic }} received packets dropped"
|
||||
}
|
||||
],
|
||||
|
@ -1246,7 +1246,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values(windows_os_info{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -1261,7 +1261,7 @@
|
|||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", channel=\"System\", level=\"Critical\"} | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-purple",
|
||||
"name": "Critical system event",
|
||||
|
@ -21,7 +21,7 @@
|
|||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "windows_system_system_up_time{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"expr": "windows_system_system_up_time{job=~\"$job\",instance=~\"$instance\"}*1000 > $__from < $__to",
|
||||
"hide": true,
|
||||
"iconColor": "light-yellow",
|
||||
"name": "Reboot",
|
||||
|
@ -35,7 +35,7 @@
|
|||
"uid": "${loki_datasource}"
|
||||
},
|
||||
"enable": true,
|
||||
"expr": "{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"expr": "{job=~\"$job\",instance=~\"$instance\", source=\"Service Control Manager\", level=\"Error\"} |= \"terminated\" | json",
|
||||
"hide": true,
|
||||
"iconColor": "light-orange",
|
||||
"name": "Service failed",
|
||||
|
@ -122,7 +122,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"legendFormat": "CPU usage"
|
||||
}
|
||||
],
|
||||
|
@ -177,7 +177,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"expr": "100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode=\"idle\", job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])*100))",
|
||||
"legendFormat": "CPU usage"
|
||||
}
|
||||
],
|
||||
|
@ -293,7 +293,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "sum by(instance, mode) (irate(windows_cpu_time_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) \n/ on(instance) \ngroup_left sum by (instance) ((irate(windows_cpu_time_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))) * 100\n",
|
||||
"expr": "sum by(instance, mode) (irate(windows_cpu_time_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])) \n/ on(instance) \ngroup_left sum by (instance) ((irate(windows_cpu_time_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))) * 100\n",
|
||||
"legendFormat": "{{ mode }}"
|
||||
}
|
||||
],
|
||||
|
@ -342,7 +342,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_system_processor_queue_length{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}\n",
|
||||
"expr": "windows_system_processor_queue_length{job=~\"$job\",instance=~\"$instance\"}\n",
|
||||
"legendFormat": "CPU average queue"
|
||||
}
|
||||
],
|
||||
|
@ -391,7 +391,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "irate(windows_system_context_switches_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"expr": "irate(windows_system_context_switches_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval])",
|
||||
"legendFormat": "Context switches"
|
||||
},
|
||||
{
|
||||
|
@ -399,7 +399,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "sum without (core) (irate(windows_cpu_interrupts_total{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
|
||||
"expr": "sum without (core) (irate(windows_cpu_interrupts_total{job=~\"$job\",instance=~\"$instance\"}[$__rate_interval]))",
|
||||
"legendFormat": "Interrupts"
|
||||
}
|
||||
],
|
||||
|
@ -456,7 +456,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_os_timezone{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_os_timezone{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"format": "table"
|
||||
}
|
||||
],
|
||||
|
@ -509,7 +509,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "clamp_max(windows_time_ntp_client_time_sources{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}, 1)",
|
||||
"expr": "clamp_max(windows_time_ntp_client_time_sources{job=~\"$job\",instance=~\"$instance\"}, 1)",
|
||||
"legendFormat": "NTP status"
|
||||
}
|
||||
],
|
||||
|
@ -559,7 +559,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_time_ntp_round_trip_delay_seconds{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_time_ntp_round_trip_delay_seconds{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "NTP trip delay"
|
||||
},
|
||||
{
|
||||
|
@ -567,7 +567,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${prometheus_datasource}"
|
||||
},
|
||||
"expr": "windows_time_computed_time_offset_seconds{job=~\".*windows.*\",job=~\"$job\",instance=~\"$instance\"}",
|
||||
"expr": "windows_time_computed_time_offset_seconds{job=~\"$job\",instance=~\"$instance\"}",
|
||||
"legendFormat": "Time offset"
|
||||
}
|
||||
],
|
||||
|
@ -599,7 +599,7 @@
|
|||
"label": "Job",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\"}, job)",
|
||||
"query": "label_values(windows_os_info{}, job)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
@ -614,7 +614,7 @@
|
|||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"query": "label_values(windows_os_info{job=~\".*windows.*\",job=~\"$job\"}, instance)",
|
||||
"query": "label_values(windows_os_info{job=~\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
|
|
|
@ -26,8 +26,8 @@ annotations:
|
|||
description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
|
||||
5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
|
||||
summary: JVM heap memory filling up.
|
||||
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
|
||||
(id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
|
||||
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", }))/(sum without (id)
|
||||
(jvm_memory_max_bytes{area="heap", } != -1))) * 100 > 80
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -43,7 +43,7 @@ annotations:
|
|||
are in a cyclic dependency with each other. The restart is required to resolve
|
||||
the deadlock.'
|
||||
summary: JVM deadlock detected.
|
||||
expr: (jvm_threads_deadlocked{job!=""}) > 0
|
||||
expr: (jvm_threads_deadlocked{}) > 0
|
||||
for: 2m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
|
|
@ -26,7 +26,7 @@ annotations:
|
|||
description: 'Kafka lag keeps increasing over the last 15 minutes for consumer group:
|
||||
{{$labels.consumergroup}}, topic: {{$labels.topic}}.'
|
||||
summary: Kafka lag keeps increasing.
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"}[5m]))
|
||||
> 0
|
||||
for: 15m
|
||||
keep_firing_for: 10m
|
||||
|
@ -42,7 +42,7 @@ annotations:
|
|||
description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
|
||||
$value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
|
||||
summary: Kafka lag is too high.
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
|
||||
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{topic!="__consumer_offsets",consumergroup!="",job="integrations/kafka"})
|
||||
> 100
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
|
@ -183,8 +183,8 @@ annotations:
|
|||
description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
|
||||
}} has disconnected from Zookeeper.
|
||||
summary: Kafka Zookeeper sync disconnected.
|
||||
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
|
||||
quantile="0.95"}[5m])) < 0
|
||||
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{quantile="0.95",job="integrations/kafka"}[5m]))
|
||||
< 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
@ -27,7 +27,7 @@ annotations:
|
|||
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High CPU usage on Windows host.
|
||||
expr: |
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -43,9 +43,9 @@ annotations:
|
|||
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High memory usage on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
|
||||
100 - ((windows_os_physical_memory_free_bytes{}
|
||||
/
|
||||
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
|
||||
windows_cs_physical_memory_bytes{}) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -61,7 +61,7 @@ annotations:
|
|||
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
|
||||
summary: Disk is almost full on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
|
||||
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -77,7 +77,7 @@ annotations:
|
|||
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
|
||||
summary: Windows service is not healthy.
|
||||
expr: |
|
||||
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
|
||||
windows_service_status{status!~"starting|stopping|ok", } > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -92,7 +92,7 @@ annotations:
|
|||
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
|
||||
summary: Windows physical disk is not healthy.
|
||||
expr: |
|
||||
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
|
||||
windows_disk_drive_status{status="OK", } != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -107,7 +107,7 @@ annotations:
|
|||
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
|
||||
summary: NTP client delay.
|
||||
expr: |
|
||||
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_ntp_round_trip_delay_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -123,7 +123,7 @@ annotations:
|
|||
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
|
||||
summary: NTP time offset is too large.
|
||||
expr: |
|
||||
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_computed_time_offset_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -140,7 +140,7 @@ annotations:
|
|||
summary: There is a high number of pending replication operations in Active Directory.
|
||||
A high number of pending operations sustained over a period of time can indicate
|
||||
a problem with replication.
|
||||
expr: "windows_ad_replication_pending_operations{job=~\".*windows.*\"} >= 50
|
||||
expr: "windows_ad_replication_pending_operations{} >= 50
|
||||
"
|
||||
for: 10m
|
||||
keep_firing_for: 5m
|
||||
|
@ -158,7 +158,7 @@ annotations:
|
|||
summary: There are a number of replication synchronization request failures. These
|
||||
can cause authentication failures, outdated information being propagated across
|
||||
domain controllers, and potentially data loss or inconsistencies.
|
||||
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{job=~\".*windows.*\"}[5m])
|
||||
expr: "increase(windows_ad_replication_sync_requests_schema_mismatch_failure_total{}[5m])
|
||||
> 0
|
||||
"
|
||||
for: 5m
|
||||
|
@ -178,7 +178,7 @@ annotations:
|
|||
summary: There is a high number of password changes. This may indicate unauthorized
|
||||
changes or attacks.
|
||||
expr: |
|
||||
increase(windows_ad_sam_password_changes_total{job=~".*windows.*"}[5m]) > 25
|
||||
increase(windows_ad_sam_password_changes_total{}[5m]) > 25
|
||||
for: 5m
|
||||
labels:
|
||||
keep_firing_for: 24h
|
||||
|
|
|
@ -27,7 +27,7 @@ annotations:
|
|||
CPU usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High CPU usage on Windows host.
|
||||
expr: |
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{job=~".*windows.*", mode="idle"}[2m])) * 100) > 90
|
||||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", }[2m])) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -43,9 +43,9 @@ annotations:
|
|||
Memory usage on host {{ $labels.instance }} is above 90%. The current value is {{ $value | printf "%.2f" }}%.
|
||||
summary: High memory usage on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_os_physical_memory_free_bytes{job=~".*windows.*"}
|
||||
100 - ((windows_os_physical_memory_free_bytes{}
|
||||
/
|
||||
windows_cs_physical_memory_bytes{job=~".*windows.*"}) * 100) > 90
|
||||
windows_cs_physical_memory_bytes{}) * 100) > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -61,7 +61,7 @@ annotations:
|
|||
Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than 90% of space is used. The current volume utilization is {{ $value | printf "%.2f" }}%.
|
||||
summary: Disk is almost full on Windows host.
|
||||
expr: |
|
||||
100 - ((windows_logical_disk_free_bytes{job=~".*windows.*"} ) / (windows_logical_disk_size_bytes{job=~".*windows.*"})) * 100 > 90
|
||||
100 - ((windows_logical_disk_free_bytes{} ) / (windows_logical_disk_size_bytes{})) * 100 > 90
|
||||
for: 15m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -77,7 +77,7 @@ annotations:
|
|||
Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'.
|
||||
summary: Windows service is not healthy.
|
||||
expr: |
|
||||
windows_service_status{job=~".*windows.*", status!~"starting|stopping|ok"} > 0
|
||||
windows_service_status{status!~"starting|stopping|ok", } > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -92,7 +92,7 @@ annotations:
|
|||
Windows disk {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}' status.
|
||||
summary: Windows physical disk is not healthy.
|
||||
expr: |
|
||||
windows_disk_drive_status{job=~".*windows.*", status="OK"} != 1
|
||||
windows_disk_drive_status{status="OK", } != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -107,7 +107,7 @@ annotations:
|
|||
Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.
|
||||
summary: NTP client delay.
|
||||
expr: |
|
||||
windows_time_ntp_round_trip_delay_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_ntp_round_trip_delay_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
@ -123,7 +123,7 @@ annotations:
|
|||
NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.
|
||||
summary: NTP time offset is too large.
|
||||
expr: |
|
||||
windows_time_computed_time_offset_seconds{job=~".*windows.*"} > 1
|
||||
windows_time_computed_time_offset_seconds{} > 1
|
||||
for: 5m
|
||||
keep_firing_for: 5m
|
||||
labels:
|
||||
|
|
Loading…
Reference in a new issue