groups: - name: opensearch-alerts rules: - alert: OpenSearchYellowCluster annotations: description: '{{$labels.cluster}} health status is yellow over the last 5 minutes' summary: At least one of the clusters is reporting a yellow status. expr: | opensearch_cluster_status{opensearch_cluster!=""} == 1 for: 5m labels: severity: warning - alert: OpenSearchRedCluster annotations: description: '{{$labels.cluster}} health status is red over the last 5 minutes' summary: At least one of the clusters is reporting a red status. expr: | opensearch_cluster_status{opensearch_cluster!=""} == 2 for: 5m labels: severity: critical - alert: OpenSearchUnstableShardReallocation annotations: description: | {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard reallocation over the last 1m which is above the threshold of 0. summary: A node has gone offline or has been disconnected triggering shard reallocation. expr: | sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="relocating"}) > 0 for: 1m labels: severity: warning - alert: OpenSearchUnstableShardUnassigned annotations: description: | {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard unassigned over the last 5m which is above the threshold of 0. summary: There are shards that have been detected as unassigned. expr: | sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="unassigned"}) > 0 for: 5m labels: severity: warning - alert: OpenSearchHighNodeDiskUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }} disk usage over the last 5m which is above the threshold of 60. summary: The node disk usage has exceeded the warning threshold. expr: | 100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes{opensearch_cluster!=""}) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 60 for: 5m labels: severity: warning - alert: OpenSearchHighNodeDiskUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 80. summary: The node disk usage has exceeded the critical threshold. expr: | 100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 80 for: 5m labels: severity: critical - alert: OpenSearchHighNodeCpuUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 70. summary: The node CPU usage has exceeded the warning threshold. expr: | sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 70 for: 5m labels: severity: warning - alert: OpenSearchHighNodeCpuUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 85. summary: The node CPU usage has exceeded the critical threshold. expr: | sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 85 for: 5m labels: severity: critical - alert: OpenSearchHighNodeMemoryUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 70. summary: The node memory usage has exceeded the warning threshold. expr: | sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 70 for: 5m labels: severity: warning - alert: OpenSearchHighNodeMemoryUsage annotations: description: | {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 85. summary: The node memory usage has exceeded the critical threshold. expr: | sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 85 for: 5m labels: severity: critical - alert: OpenSearchModerateRequestLatency annotations: description: | {{$labels.index}} has had {{ printf "%.0f" $value }}s of request latency over the last 5m which is above the threshold of 0.5. summary: The request latency has exceeded the warning threshold. expr: | sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!="", context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > 0.5 for: 5m labels: severity: warning - alert: OpenSearchModerateIndexLatency annotations: description: | {{$labels.index}} has had {{ printf "%.0f" $value }}s of index latency over the last 5m which is above the threshold of 0.5. summary: The index latency has exceeded the warning threshold. expr: | sum without(context) (increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!="", context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > 0.5 for: 5m labels: severity: warning