1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
monitoring-mixins-website/assets/opensearch/alerts.yaml
2024-10-25 03:33:09 +00:00

121 lines
5.7 KiB
YAML

# Alerting rules for OpenSearch: one rule group named "opensearch-alerts".
# NOTE(review): nesting indentation appears to have been lost in this copy —
# `rules:` presumably belongs to the group entry above it; confirm against the
# upstream file before loading.
groups:
- name: opensearch-alerts
rules:
# Warning tier: opensearch_cluster_status has equalled 1 (reported as "yellow"
# by the annotations below) for 5 minutes on a series that carries a non-empty
# opensearch_cluster label.
- alert: OpenSearchYellowCluster
  expr: |
    opensearch_cluster_status{opensearch_cluster!=""} == 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: At least one of the clusters is reporting a yellow status.
    description: "{{$labels.cluster}} health status is yellow over the last 5 minutes"
# Critical tier: opensearch_cluster_status has equalled 2 (reported as "red" by
# the annotations below) for 5 minutes on a series that carries a non-empty
# opensearch_cluster label.
- alert: OpenSearchRedCluster
  expr: |
    opensearch_cluster_status{opensearch_cluster!=""} == 2
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: At least one of the clusters is reporting a red status.
    description: "{{$labels.cluster}} health status is red over the last 5 minutes"
# Warning tier: the shard count with type="relocating" (type label aggregated
# away) has stayed above zero for 1 minute.
- alert: OpenSearchUnstableShardReallocation
  expr: |
    sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="relocating"}) > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: A node has gone offline or has been disconnected triggering shard reallocation.
    description: |
      {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard reallocation over the last 1m which is above the threshold of 0.
# Warning tier: the shard count with type="unassigned" (type label aggregated
# away) has stayed above zero for 5 minutes.
- alert: OpenSearchUnstableShardUnassigned
  expr: |
    sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="unassigned"}) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: There are shards that have been detected as unassigned.
    description: |
      {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard unassigned over the last 5m which is above the threshold of 0.
# Node disk usage, warning (> 60%) and critical (> 80%) tiers.
#
# Used and total bytes are each summed across nodeid/path/mount/type *before*
# dividing, so a node exposing several filesystem paths yields one overall
# percentage. Summing the per-path ratios instead would add percentages
# together (two paths at 40% each would read as 80%) and fire the alerts early.
# Both tiers use the same expression and selectors; only the threshold and the
# severity label differ.
- alert: OpenSearchHighNodeDiskUsage
  annotations:
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 60.
    summary: The node disk usage has exceeded the warning threshold.
  expr: |
    100 * (
      sum without(nodeid, path, mount, type) (
        opensearch_fs_path_total_bytes{opensearch_cluster!=""}
        - opensearch_fs_path_free_bytes{opensearch_cluster!=""}
      )
      /
      sum without(nodeid, path, mount, type) (
        opensearch_fs_path_total_bytes{opensearch_cluster!=""}
      )
    ) > 60
  for: 5m
  labels:
    severity: warning
- alert: OpenSearchHighNodeDiskUsage
  annotations:
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 80.
    summary: The node disk usage has exceeded the critical threshold.
  expr: |
    100 * (
      sum without(nodeid, path, mount, type) (
        opensearch_fs_path_total_bytes{opensearch_cluster!=""}
        - opensearch_fs_path_free_bytes{opensearch_cluster!=""}
      )
      /
      sum without(nodeid, path, mount, type) (
        opensearch_fs_path_total_bytes{opensearch_cluster!=""}
      )
    ) > 80
  for: 5m
  labels:
    severity: critical
# Warning tier: opensearch_os_cpu_percent, with the nodeid label aggregated
# away, has stayed above 70 for 5 minutes.
- alert: OpenSearchHighNodeCpuUsage
  expr: |
    sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 70
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: The node CPU usage has exceeded the warning threshold.
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 70.
# Critical tier: opensearch_os_cpu_percent, with the nodeid label aggregated
# away, has stayed above 85 for 5 minutes.
- alert: OpenSearchHighNodeCpuUsage
  expr: |
    sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 85
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: The node CPU usage has exceeded the critical threshold.
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 85.
# Warning tier: opensearch_os_mem_used_percent, with the nodeid label
# aggregated away, has stayed above 70 for 5 minutes.
- alert: OpenSearchHighNodeMemoryUsage
  expr: |
    sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 70
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: The node memory usage has exceeded the warning threshold.
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 70.
# Critical tier: opensearch_os_mem_used_percent, with the nodeid label
# aggregated away, has stayed above 85 for 5 minutes.
- alert: OpenSearchHighNodeMemoryUsage
  expr: |
    sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 85
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: The node memory usage has exceeded the critical threshold.
    description: |
      {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 85.
# Warning tier: average search request latency per index above 0.5s for 5m.
#
# The numerator adds the 5m increase of fetch, query and scroll time; the
# denominator adds the 5m increase of the matching operation counts.
# clamp_min(..., 1) keeps the divisor at 1 or more so a window with no
# operations does not divide by zero.
# Every selector carries the same opensearch_cluster!="" filter so the operands
# are visibly scoped alike (vector matching already limited the result to those
# series, so this does not change which series alert).
# The description prints two decimals: the threshold is 0.5, so whole-number
# formatting would show every value between 0.5 and 1.5 as "1s".
- alert: OpenSearchModerateRequestLatency
  annotations:
    description: |
      {{$labels.index}} has had {{ printf "%.2f" $value }}s of request latency over the last 5m which is above the threshold of 0.5.
    summary: The request latency has exceeded the warning threshold.
  expr: |
    sum without(context) (
      (
        increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!="", context="total"}[5m])
        + increase(opensearch_index_search_query_time_seconds{opensearch_cluster!="", context="total"}[5m])
        + increase(opensearch_index_search_scroll_time_seconds{opensearch_cluster!="", context="total"}[5m])
      )
      /
      clamp_min(
        increase(opensearch_index_search_fetch_count{opensearch_cluster!="", context="total"}[5m])
        + increase(opensearch_index_search_query_count{opensearch_cluster!="", context="total"}[5m])
        + increase(opensearch_index_search_scroll_count{opensearch_cluster!="", context="total"}[5m]),
        1
      )
    ) > 0.5
  for: 5m
  labels:
    severity: warning
# Warning tier: average indexing latency per index above 0.5s for 5m.
#
# Divides the 5m increase of indexing time by the 5m increase of the indexing
# operation count; clamp_min(..., 1) keeps the divisor at 1 or more so a window
# with no indexing operations does not divide by zero.
# Both selectors carry the same opensearch_cluster!="" filter so the operands
# are visibly scoped alike (vector matching already limited the result to those
# series, so this does not change which series alert).
# The description prints two decimals: the threshold is 0.5, so whole-number
# formatting would show every value between 0.5 and 1.5 as "1s".
- alert: OpenSearchModerateIndexLatency
  annotations:
    description: |
      {{$labels.index}} has had {{ printf "%.2f" $value }}s of index latency over the last 5m which is above the threshold of 0.5.
    summary: The index latency has exceeded the warning threshold.
  expr: |
    sum without(context) (
      increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!="", context="total"}[5m])
      /
      clamp_min(
        increase(opensearch_index_indexing_index_count{opensearch_cluster!="", context="total"}[5m]),
        1
      )
    ) > 0.5
  for: 5m
  labels:
    severity: warning