2024-05-04 12:01:41 +00:00
|
|
|
groups:
|
|
|
|
- name: opensearch-alerts
|
|
|
|
rules:
|
|
|
|
- alert: OpenSearchYellowCluster
|
|
|
|
annotations:
|
|
|
|
description: '{{$labels.cluster}} health status is yellow over the last 5 minutes'
|
|
|
|
summary: At least one of the clusters is reporting a yellow status.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
opensearch_cluster_status{opensearch_cluster!=""} == 1
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchRedCluster
|
|
|
|
annotations:
|
|
|
|
description: '{{$labels.cluster}} health status is red over the last 5 minutes'
|
|
|
|
summary: At least one of the clusters is reporting a red status.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
opensearch_cluster_status{opensearch_cluster!=""} == 2
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: OpenSearchUnstableShardReallocation
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.cluster}} has had {{ printf "%.0f" $value }} relocating shard(s) over the last 1m which is above the threshold of 0.
|
|
|
|
summary: A node has gone offline or has been disconnected triggering shard reallocation.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="relocating"}) > 0
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 1m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchUnstableShardUnassigned
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.cluster}} has had {{ printf "%.0f" $value }} unassigned shard(s) over the last 5m which is above the threshold of 0.
|
|
|
|
summary: There are shards that have been detected as unassigned.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="unassigned"}) > 0
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
2024-10-25 03:33:09 +00:00
|
|
|
- alert: OpenSearchHighNodeDiskUsage
|
2024-05-04 12:01:41 +00:00
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 60.
|
|
|
|
summary: The node disk usage has exceeded the warning threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes{opensearch_cluster!=""}) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 60
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchHighNodeDiskUsage
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 80.
|
|
|
|
summary: The node disk usage has exceeded the critical threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 80
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2024-10-25 03:33:09 +00:00
|
|
|
- alert: OpenSearchHighNodeCpuUsage
|
2024-05-04 12:01:41 +00:00
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 70.
|
|
|
|
summary: The node CPU usage has exceeded the warning threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 70
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchHighNodeCpuUsage
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 85.
|
|
|
|
summary: The node CPU usage has exceeded the critical threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 85
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
2024-10-25 03:33:09 +00:00
|
|
|
- alert: OpenSearchHighNodeMemoryUsage
|
2024-05-04 12:01:41 +00:00
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 70.
|
|
|
|
summary: The node memory usage has exceeded the warning threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 70
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchHighNodeMemoryUsage
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 85.
|
|
|
|
summary: The node memory usage has exceeded the critical threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 85
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: OpenSearchModerateRequestLatency
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.index}} has had {{ printf "%.2f" $value }}s of request latency over the last 5m which is above the threshold of 0.5.
|
|
|
|
summary: The request latency has exceeded the warning threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!="", context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > 0.5
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
- alert: OpenSearchModerateIndexLatency
|
|
|
|
annotations:
|
|
|
|
description: |
|
|
|
|
{{$labels.index}} has had {{ printf "%.2f" $value }}s of index latency over the last 5m which is above the threshold of 0.5.
|
|
|
|
summary: The index latency has exceeded the warning threshold.
|
|
|
|
expr: |
|
2024-10-25 03:33:09 +00:00
|
|
|
sum without(context) (increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!="", context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > 0.5
|
2024-05-04 12:01:41 +00:00
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|