1
0
Fork 0
mirror of https://github.com/monitoring-mixins/website.git synced 2024-12-14 11:37:31 +00:00
monitoring-mixins-website/assets/ceph/alerts.yaml
2022-02-03 03:23:31 +00:00

329 lines
13 KiB
YAML

groups:
# Availability of the Ceph manager, scraped as job "rook-ceph-mgr".
- name: ceph-mgr-status
  rules:
  # Fires when the mgr scrape target reports up == 0, or when its `up` series
  # is missing altogether, for 5 minutes. The label_replace call uses an empty
  # source label and empty regex to set namespace="openshift-storage" on the
  # result.
  - alert: CephMgrIsAbsent
    expr: |
      label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "")
    for: 5m
    labels:
      severity: critical
    annotations:
      description: >-
        Ceph Manager has disappeared from Prometheus target discovery.
      message: >-
        Storage metrics collector service not available anymore.
      severity_level: critical
      storage_type: ceph
  # Fires when the summed desired replica count of all rook-ceph-mgr-*
  # deployments in a namespace stays below 1 for 5 minutes.
  - alert: CephMgrIsMissingReplicas
    expr: |
      sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Ceph Manager is missing replicas.
      message: >-
        Storage metrics collector service doesn't have required no of replicas.
      severity_level: warning
      storage_type: ceph
# Replica health of the Ceph metadata server (MDS).
- name: ceph-mds-status
  rules:
  # Fires when the per-namespace sum of ceph_mds_metadata series with value 1
  # stays below 2 for 5 minutes.
  - alert: CephMdsMissingReplicas
    expr: |
      sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
    for: 5m
    labels:
      severity: warning
    annotations:
      description: >-
        Minimum required replicas for storage metadata service not available.
        Might affect the working of storage cluster.
      message: Insufficient replicas for storage metadata service.
      severity_level: warning
      storage_type: ceph
# Ceph monitor (mon) quorum health.
- name: quorum-alert.rules
  rules:
  # Fires when, per namespace, the number of monitors reporting
  # ceph_mon_quorum_status == 1 is at or below floor(n / 2) + 1, where n is
  # the number of ceph_mon_metadata series, for 15 minutes.
  - alert: CephMonQuorumAtRisk
    expr: |
      count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
    for: 15m
    labels:
      severity: critical
    annotations:
      description: Storage cluster quorum is low. Contact Support.
      message: Storage quorum at risk
      severity_level: error
      storage_type: ceph
  # Fires when fewer than two rook-ceph-mon-* pods per namespace are in the
  # Running phase for 5 minutes.
  - alert: CephMonQuorumLost
    expr: |
      count(kube_pod_status_phase{pod=~"rook-ceph-mon-.*", phase=~"Running|running"} == 1) by (namespace) < 2
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Storage cluster quorum is lost. Contact Support.
      message: Storage quorum is lost
      severity_level: critical
      storage_type: ceph
  # Fires when a monitor's election rate, scaled from per-second to
  # per-minute (* 60), exceeds 0.95 for 5 minutes. The join on ceph_daemon
  # with ceph_mon_metadata keeps the metadata labels on the result.
  - alert: CephMonHighNumberOfLeaderChanges
    expr: |
      (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
    for: 5m
    labels:
      severity: warning
    annotations:
      description: >-
        Ceph Monitor {{ $labels.ceph_daemon }} on host
        {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }}
        leader changes per minute recently.
      message: Storage Cluster has seen many leader changes recently.
      severity_level: warning
      storage_type: ceph
# Storage node availability.
- name: ceph-node-alert.rules
  rules:
  # Fires when cluster:ceph_node_down:join_kube equals 0 for 30 seconds.
  # NOTE(review): that series is not defined in this file; presumably it is a
  # recording rule shipped separately. Confirm it is loaded alongside these
  # alerts.
  - alert: CephNodeDown
    expr: |
      cluster:ceph_node_down:join_kube == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      description: >-
        Storage node {{ $labels.node }} went down. Please check the node
        immediately.
      message: >-
        Storage node {{ $labels.node }} went down
      severity_level: error
      storage_type: ceph
# Per-OSD capacity, availability and placement-group recovery alerts.
- name: osd-alert.rules
  rules:
  # Fires when an OSD's used/total byte ratio is at or above 0.80 for 40s.
  # group_right(device_class,hostname) copies those two labels from
  # ceph_osd_metadata onto the ratio so the description can reference them.
  - alert: CephOSDCriticallyFull
    expr: |
      (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80
    for: 40s
    labels:
      severity: critical
    annotations:
      description: >-
        Utilization of storage device {{ $labels.ceph_daemon }} of
        device_class type {{$labels.device_class}} has crossed 80% on host
        {{ $labels.hostname }}. Immediately free up some space or add
        capacity of type {{$labels.device_class}}.
      message: Back-end storage device is critically full.
      severity_level: error
      storage_type: ceph
  # Fires immediately (for: 0s) once ceph_osd_up has changed value at least
  # 10 times within 5 minutes. The description reports this as 5 restarts,
  # presumably counting one down and one up transition per restart.
  - alert: CephOSDFlapping
    expr: |
      changes(ceph_osd_up[5m]) >= 10
    for: 0s
    labels:
      severity: critical
    annotations:
      description: >-
        Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times in
        last 5 minutes. Please check the pod events or ceph status to find
        out the cause.
      message: Ceph storage osd flapping.
      severity_level: error
      storage_type: ceph
  # Same ratio as CephOSDCriticallyFull, with the lower 0.75 threshold.
  - alert: CephOSDNearFull
    expr: |
      (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
    for: 40s
    labels:
      severity: warning
    annotations:
      description: >-
        Utilization of storage device {{ $labels.ceph_daemon }} of
        device_class type {{$labels.device_class}} has crossed 75% on host
        {{ $labels.hostname }}. Immediately free up some space or add
        capacity of type {{$labels.device_class}}.
      message: Back-end storage device is nearing full.
      severity_level: warning
      storage_type: ceph
  # Fires when an OSD is "in" (ceph_osd_in == 1) but not "up"
  # (ceph_osd_up == 0) for 15 minutes. The join with ceph_disk_occupation
  # attaches the host and device labels used in the description; "host" is
  # derived from exported_instance via label_replace.
  - alert: CephOSDDiskNotResponding
    expr: |
      label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
    for: 15m
    labels:
      severity: critical
    annotations:
      description: >-
        Disk device {{ $labels.device }} not responding, on host
        {{ $labels.host }}.
      message: Disk not responding
      severity_level: error
      storage_type: ceph
  # Same join as CephOSDDiskNotResponding, but for OSDs that are both "out"
  # (ceph_osd_in == 0) and not "up", held for 1 minute.
  - alert: CephOSDDiskUnavailable
    expr: |
      label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
    for: 1m
    labels:
      severity: critical
    annotations:
      description: >-
        Disk device {{ $labels.device }} not accessible on host
        {{ $labels.host }}.
      message: Disk not accessible
      severity_level: error
      storage_type: ceph
  # Fires when ceph_healthcheck_slow_ops is above zero for 30 seconds.
  - alert: CephOSDSlowOps
    expr: |
      ceph_healthcheck_slow_ops > 0
    for: 30s
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $value }} Ceph OSD requests are taking too long to process.
        Please check ceph status to find out the cause.
      message: OSD requests are taking too long to process.
      severity_level: warning
      storage_type: ceph
  # Fires when ceph_pg_undersized stays above zero for 2 hours.
  - alert: CephDataRecoveryTakingTooLong
    expr: |
      ceph_pg_undersized > 0
    for: 2h
    labels:
      severity: warning
    annotations:
      description: Data recovery has been active for too long. Contact Support.
      message: Data recovery is slow
      severity_level: warning
      storage_type: ceph
  # Fires when ceph_pg_inconsistent stays above zero for 1 hour.
  - alert: CephPGRepairTakingTooLong
    expr: |
      ceph_pg_inconsistent > 0
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Self heal operations taking too long. Contact Support.
      message: Self heal problems detected
      severity_level: warning
      storage_type: ceph
# Fill-level alerts for PVCs whose storage class is provisioned by a Ceph CSI
# driver (provisioner matching *rbd.csi.ceph.com or *cephfs.csi.ceph.com).
- name: persistent-volume-alert.rules
  rules:
  # Ratio of kubelet-reported used bytes to capacity bytes per
  # (namespace, persistentvolumeclaim). Both sides are joined with
  # kube_persistentvolumeclaim_info and kube_storageclass_info so that only
  # Ceph-backed claims remain. Fires above 0.75 after 5 seconds.
  - alert: PersistentVolumeUsageNearFull
    expr: |
      (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
    for: 5s
    labels:
      severity: warning
    annotations:
      description: >-
        PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
        Free up some space or expand the PVC.
      message: >-
        PVC {{ $labels.persistentvolumeclaim }} is nearing full.
        Data deletion or PVC expansion is required.
      severity_level: warning
      storage_type: ceph
  # Same ratio as PersistentVolumeUsageNearFull, with the 0.85 threshold.
  - alert: PersistentVolumeUsageCritical
    expr: |
      (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
    for: 5s
    labels:
      severity: critical
    annotations:
      description: >-
        PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
        Free up some space or expand the PVC immediately.
      message: >-
        PVC {{ $labels.persistentvolumeclaim }} is critically full.
        Data deletion or PVC expansion is required.
      severity_level: error
      storage_type: ceph
# Overall Ceph cluster health state and daemon version consistency.
- name: cluster-state-alert.rules
  rules:
  # Fires when ceph_health_status is above 1 for 10 minutes.
  - alert: CephClusterErrorState
    expr: |
      ceph_health_status{job="rook-ceph-mgr"} > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Storage cluster is in error state for more than 10m.
      message: Storage cluster is in error state
      severity_level: error
      storage_type: ceph
  # Fires when ceph_health_status equals 1 for 15 minutes. The description
  # states the same 15m window as the `for` clause.
  - alert: CephClusterWarningState
    expr: |
      ceph_health_status{job="rook-ceph-mgr"} == 1
    for: 15m
    labels:
      severity: warning
    annotations:
      description: Storage cluster is in warning state for more than 15m.
      message: Storage cluster is in degraded state
      severity_level: warning
      storage_type: ceph
  # Counts distinct OSD ceph_version values per namespace. The inner count
  # collapses the series to one per (ceph_version, namespace); the outer count
  # must group by namespace only, otherwise every group has exactly one
  # member and the "> 1" comparison can never be true.
  - alert: CephOSDVersionMismatch
    expr: |
      count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (namespace) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        There are {{ $value }} different versions of Ceph OSD components
        running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
  # Counts distinct non-empty mon ceph_version values across all series and
  # fires when more than one is present for 10 minutes.
  - alert: CephMonVersionMismatch
    expr: |
      count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        There are {{ $value }} different versions of Ceph Mon components
        running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
# Raw capacity utilization of the whole cluster, in three escalating tiers
# (above 0.75, above 0.80, at or above 0.85 of ceph_cluster_total_bytes).
- name: cluster-utilization-alert.rules
  rules:
  # Fires when used/total raw bytes exceeds 0.75 for 5 seconds.
  - alert: CephClusterNearFull
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75
    for: 5s
    labels:
      severity: warning
    annotations:
      description: >-
        Storage cluster utilization has crossed 75% and will become
        read-only at 85%. Free up some space or expand the storage cluster.
      message: >-
        Storage cluster is nearing full. Data deletion or cluster expansion
        is required.
      severity_level: warning
      storage_type: ceph
  # Fires when used/total raw bytes exceeds 0.80 for 5 seconds.
  - alert: CephClusterCriticallyFull
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80
    for: 5s
    labels:
      severity: critical
    annotations:
      description: >-
        Storage cluster utilization has crossed 80% and will become
        read-only at 85%. Free up some space or expand the storage cluster
        immediately.
      message: >-
        Storage cluster is critically full and needs immediate data
        deletion or cluster expansion.
      severity_level: error
      storage_type: ceph
  # Fires immediately (for: 0s) once used/total raw bytes reaches 0.85.
  - alert: CephClusterReadOnly
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85
    for: 0s
    labels:
      severity: critical
    annotations:
      description: >-
        Storage cluster utilization has crossed 85% and will become
        read-only now. Free up some space or expand the storage cluster
        immediately.
      message: >-
        Storage cluster is read-only now and needs immediate data deletion
        or cluster expansion.
      severity_level: error
      storage_type: ceph
# Byte-quota consumption per storage pool.
- name: pool-quota.rules
  rules:
  # Ratio of ceph_pool_stored_raw to ceph_pool_quota_bytes per pool_id, with
  # the pool name pulled in from ceph_pool_metadata. The "> 0" filter on the
  # denominator drops pools whose quota is 0, so they never divide by zero.
  # Fires above 0.70 for 1 minute.
  - alert: CephPoolQuotaBytesNearExhaustion
    expr: |
      (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.70
    for: 1m
    labels:
      severity: warning
    annotations:
      description: >-
        Storage pool {{ $labels.name }} quota usage has crossed 70%.
      message: Storage pool quota(bytes) is near exhaustion.
      severity_level: warning
      storage_type: ceph
  # Same ratio as CephPoolQuotaBytesNearExhaustion, with the 0.90 threshold.
  - alert: CephPoolQuotaBytesCriticallyExhausted
    expr: |
      (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.90
    for: 1m
    labels:
      severity: critical
    annotations:
      description: >-
        Storage pool {{ $labels.name }} quota usage has crossed 90%.
      message: Storage pool quota(bytes) is critically exhausted.
      severity_level: critical
      storage_type: ceph