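# Prometheus alerting rules for a Rook/Ceph storage cluster: manager, MDS,
# mon quorum, node, OSD, persistent-volume, cluster-state, utilization and
# pool-quota checks. Ceph metrics come from the rook-ceph-mgr scrape job;
# kube_* series come from kube-state-metrics and kubelet_* from the kubelet.
#
# ceph-mgr-status: absent(up{job="rook-ceph-mgr"}) carries no labels, so the
# CephMgrIsAbsent expression wraps both cases in label_replace to pin the
# alert to the "openshift-storage" namespace whether the mgr target is down
# or missing from discovery entirely.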
groups:
- name: ceph-mgr-status
  rules:
  - alert: CephMgrIsAbsent
    annotations:
      description: Ceph Manager has disappeared from Prometheus target discovery.
      message: Storage metrics collector service not available anymore.
      severity_level: critical
      storage_type: ceph
    expr: |
      label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "")
    for: 5m
    labels:
      severity: critical
  - alert: CephMgrIsMissingReplicas
    annotations:
      description: Ceph Manager is missing replicas.
      message: Storage metrics collector service doesn't have the required number of replicas.
      severity_level: warning
      storage_type: ceph
    expr: |
      sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1
    for: 5m
    labels:
      severity: warning
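# ceph-mds-status: each metadata server reports a ceph_mds_metadata series
# with value 1, so the rule below counts MDS daemons per namespace and warns
# when fewer than two (typically an active plus a standby) are reporting.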
- name: ceph-mds-status
  rules:
  - alert: CephMdsMissingReplicas
    annotations:
      description: Minimum required replicas for storage metadata service not available. Might affect the working of the storage cluster.
      message: Insufficient replicas for storage metadata service.
      severity_level: warning
      storage_type: ceph
    expr: |
      sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
    for: 5m
    labels:
      severity: warning
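# quorum-alert.rules: a mon cluster needs a strict majority, floor(n/2) + 1,
# of monitors in quorum. CephMonQuorumAtRisk fires while the number of mons
# in quorum is at or below that majority; e.g. with 3 mons floor(3/2) + 1 = 2,
# so it fires once 2 or fewer are in quorum and losing one more would stall
# the cluster. CephMonHighNumberOfLeaderChanges converts the 5m election rate
# to elections per minute (rate * 60) and warns above 0.95 per minute, using
# ceph_mon_metadata only to attach the ceph_daemon and hostname labels.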
- name: quorum-alert.rules
  rules:
  - alert: CephMonQuorumAtRisk
    annotations:
      description: Storage cluster quorum is low. Contact Support.
      message: Storage quorum at risk
      severity_level: error
      storage_type: ceph
    expr: |
      count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
    for: 15m
    labels:
      severity: critical
  - alert: CephMonQuorumLost
    annotations:
      description: Storage cluster quorum is lost. Contact Support.
      message: Storage quorum is lost
      severity_level: critical
      storage_type: ceph
    expr: |
      count(kube_pod_status_phase{pod=~"rook-ceph-mon-.*", phase=~"Running|running"} == 1) by (namespace) < 2
    for: 5m
    labels:
      severity: critical
  - alert: CephMonHighNumberOfLeaderChanges
    annotations:
      description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
      message: Storage Cluster has seen many leader changes recently.
      severity_level: warning
      storage_type: ceph
    expr: |
      (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
    for: 5m
    labels:
      severity: warning
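# ceph-node-alert.rules: cluster:ceph_node_down:join_kube is a recording rule
# defined outside this file; judging by its name it presumably joins Ceph
# daemon placement with Kubernetes node status, dropping to 0 for a storage
# node that is down, which is the condition the expression below matches.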
- name: ceph-node-alert.rules
  rules:
  - alert: CephNodeDown
    annotations:
      description: Storage node {{ $labels.node }} went down. Please check the node immediately.
      message: Storage node {{ $labels.node }} went down
      severity_level: error
      storage_type: ceph
    expr: |
      cluster:ceph_node_down:join_kube == 0
    for: 30s
    labels:
      severity: critical
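# osd-alert.rules: the utilization alerts compute ceph_osd_stat_bytes_used /
# ceph_osd_stat_bytes per OSD; ceph_osd_metadata (value 1) is multiplied in
# with "on (ceph_daemon) group_right(device_class, hostname)" only to copy the
# device_class and hostname labels onto the ratio. Thresholds are 0.80 for the
# critical alert and 0.75 for the warning variant further below.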
- name: osd-alert.rules
  rules:
  - alert: CephOSDCriticallyFull
    annotations:
      description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class type {{ $labels.device_class }} has crossed 80% on host {{ $labels.hostname }}. Immediately free up some space or add capacity of type {{ $labels.device_class }}.
      message: Back-end storage device is critically full.
      severity_level: error
      storage_type: ceph
    expr: |
      (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80
    for: 40s
    labels:
      severity: critical
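  # CephOSDFlapping: ceph_osd_up toggles between 0 and 1, so one down-and-back-up
  # cycle is two changes; changes(ceph_osd_up[5m]) >= 10 therefore corresponds to
  # roughly five restarts within 5 minutes, matching the alert description.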
  - alert: CephOSDFlapping
    annotations:
      description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times in the last 5 minutes. Please check the pod events or ceph status to find out the cause.
      message: Ceph storage OSD flapping.
      severity_level: error
      storage_type: ceph
    expr: |
      changes(ceph_osd_up[5m]) >= 10
    for: 0s
    labels:
      severity: critical
  - alert: CephOSDNearFull
    annotations:
      description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class type {{ $labels.device_class }} has crossed 75% on host {{ $labels.hostname }}. Immediately free up some space or add capacity of type {{ $labels.device_class }}.
      message: Back-end storage device is nearing full.
      severity_level: warning
      storage_type: ceph
    expr: |
      (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
    for: 40s
    labels:
      severity: warning
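  # Disk alerts: label_replace extracts the numeric id from ceph_daemon
  # ("osd.<n>") into a "disk" label, and the second label_replace renames
  # exported_instance to host on ceph_disk_occupation so the join can attach
  # host and device labels. "in == 1 and up == 0" flags an OSD that is still
  # marked in but not up (not responding); "in == 0 and up == 0" flags one that
  # is both out and down (unavailable).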
  - alert: CephOSDDiskNotResponding
    annotations:
      description: Disk device {{ $labels.device }} not responding on host {{ $labels.host }}.
      message: Disk not responding
      severity_level: error
      storage_type: ceph
    expr: |
      label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
    for: 15m
    labels:
      severity: critical
  - alert: CephOSDDiskUnavailable
    annotations:
      description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
      message: Disk not accessible
      severity_level: error
      storage_type: ceph
    expr: |
      label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
    for: 1m
    labels:
      severity: critical
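  # The remaining OSD rules read Ceph health metrics directly:
  # ceph_healthcheck_slow_ops is the count of requests currently flagged as
  # slow, ceph_pg_undersized counts placement groups that still have fewer
  # replicas than configured (recovery not finished), and ceph_pg_inconsistent
  # counts PGs that failed scrub and need repair.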
  - alert: CephOSDSlowOps
    annotations:
      description: '{{ $value }} Ceph OSD requests are taking too long to process. Please check ceph status to find out the cause.'
      message: OSD requests are taking too long to process.
      severity_level: warning
      storage_type: ceph
    expr: |
      ceph_healthcheck_slow_ops > 0
    for: 30s
    labels:
      severity: warning
  - alert: CephDataRecoveryTakingTooLong
    annotations:
      description: Data recovery has been active for too long. Contact Support.
      message: Data recovery is slow
      severity_level: warning
      storage_type: ceph
    expr: |
      ceph_pg_undersized > 0
    for: 2h
    labels:
      severity: warning
  - alert: CephPGRepairTakingTooLong
    annotations:
      description: Self heal operations taking too long. Contact Support.
      message: Self heal problems detected
      severity_level: warning
      storage_type: ceph
    expr: |
      ceph_pg_inconsistent > 0
    for: 1h
    labels:
      severity: warning
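# persistent-volume-alert.rules: kubelet_volume_stats_* carries no storage
# class information, so each side of the used/capacity ratio is joined with
# kube_persistentvolumeclaim_info and kube_storageclass_info to keep only PVCs
# provisioned by the Ceph CSI drivers (*.rbd.csi.ceph.com or
# *.cephfs.csi.ceph.com) before comparing against the 75% and 85% thresholds.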
- name: persistent-volume-alert.rules
  rules:
  - alert: PersistentVolumeUsageNearFull
    annotations:
      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
      message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
      severity_level: warning
      storage_type: ceph
    expr: |
      (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
    for: 5s
    labels:
      severity: warning
  - alert: PersistentVolumeUsageCritical
    annotations:
      description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
      message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
      severity_level: error
      storage_type: ceph
    expr: |
      (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
    for: 5s
    labels:
      severity: critical
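# cluster-state-alert.rules: the Ceph mgr prometheus module exports
# ceph_health_status as 0 (HEALTH_OK), 1 (HEALTH_WARN) or 2 (HEALTH_ERR), so
# "> 1" below matches an error state and "== 1" a warning state. The version
# mismatch rules count distinct ceph_version values reported by the OSD and
# mon metadata metrics; note the OSD variant keeps (ceph_version, namespace)
# in both the inner and the outer count, while the mon variant counts
# distinct versions directly.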
- name: cluster-state-alert.rules
  rules:
  - alert: CephClusterErrorState
    annotations:
      description: Storage cluster is in error state for more than 10m.
      message: Storage cluster is in error state
      severity_level: error
      storage_type: ceph
    expr: |
      ceph_health_status{job="rook-ceph-mgr"} > 1
    for: 10m
    labels:
      severity: critical
  - alert: CephClusterWarningState
    annotations:
      description: Storage cluster is in warning state for more than 15m.
      message: Storage cluster is in degraded state
      severity_level: warning
      storage_type: ceph
    expr: |
      ceph_health_status{job="rook-ceph-mgr"} == 1
    for: 15m
    labels:
      severity: warning
  - alert: CephOSDVersionMismatch
    annotations:
      description: There are {{ $value }} different versions of Ceph OSD components running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
    expr: |
      count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1
    for: 10m
    labels:
      severity: warning
  - alert: CephMonVersionMismatch
    annotations:
      description: There are {{ $value }} different versions of Ceph Mon components running.
      message: There are multiple versions of storage services running.
      severity_level: warning
      storage_type: ceph
    expr: |
      count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1
    for: 10m
    labels:
      severity: warning
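# cluster-utilization-alert.rules: raw utilization is
# ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes; the alerts
# fire at 75% (near full), 80% (critically full) and 85%, the point at which
# the cluster is expected to become read-only per the alert descriptions.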
- name: cluster-utilization-alert.rules
  rules:
  - alert: CephClusterNearFull
    annotations:
      description: Storage cluster utilization has crossed 75% and will become read-only at 85%. Free up some space or expand the storage cluster.
      message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
      severity_level: warning
      storage_type: ceph
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75
    for: 5s
    labels:
      severity: warning
  - alert: CephClusterCriticallyFull
    annotations:
      description: Storage cluster utilization has crossed 80% and will become read-only at 85%. Free up some space or expand the storage cluster immediately.
      message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
      severity_level: error
      storage_type: ceph
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80
    for: 5s
    labels:
      severity: critical
  - alert: CephClusterReadOnly
    annotations:
      description: Storage cluster utilization has crossed 85% and will become read-only now. Free up some space or expand the storage cluster immediately.
      message: Storage cluster is read-only now and needs immediate data deletion or cluster expansion.
      severity_level: error
      storage_type: ceph
    expr: |
      ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85
    for: 0s
    labels:
      severity: critical
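# pool-quota.rules: both expressions divide ceph_pool_stored_raw by
# ceph_pool_quota_bytes, each joined with ceph_pool_metadata to pick up the
# pool name; the "> 0" filter on the denominator drops pools with no quota
# set (quota_bytes of 0), so only quota-limited pools can fire these alerts.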
- name: pool-quota.rules
  rules:
  - alert: CephPoolQuotaBytesNearExhaustion
    annotations:
      description: Storage pool {{ $labels.name }} quota usage has crossed 70%.
      message: Storage pool quota (bytes) is near exhaustion.
      severity_level: warning
      storage_type: ceph
    expr: |
      (ceph_pool_stored_raw * on (pool_id) group_left(name) ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name) ceph_pool_metadata) > 0) > 0.70
    for: 1m
    labels:
      severity: warning
  - alert: CephPoolQuotaBytesCriticallyExhausted
    annotations:
      description: Storage pool {{ $labels.name }} quota usage has crossed 90%.
      message: Storage pool quota (bytes) is critically exhausted.
      severity_level: critical
      storage_type: ceph
    expr: |
      (ceph_pool_stored_raw * on (pool_id) group_left(name) ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name) ceph_pool_metadata) > 0) > 0.90
    for: 1m
    labels:
      severity: critical