diff --git a/assets/ceph/alerts.yaml b/assets/ceph/alerts.yaml index b47cd0a..83cceca 100644 --- a/assets/ceph/alerts.yaml +++ b/assets/ceph/alerts.yaml @@ -91,6 +91,19 @@ groups: for: 40s labels: severity: critical + - alert: CephOSDFlapping + annotations: + description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times + in last 5 minutes. Please check the pod events or ceph status to find out + the cause. + message: Ceph storage osd flapping. + severity_level: error + storage_type: ceph + expr: | + changes(ceph_osd_up[5m]) >= 10 + for: 0s + labels: + severity: critical - alert: CephOSDNearFull annotations: description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class diff --git a/assets/cortex/alerts.yaml b/assets/cortex/alerts.yaml index ceba1f0..3991407 100644 --- a/assets/cortex/alerts.yaml +++ b/assets/cortex/alerts.yaml @@ -447,7 +447,7 @@ groups: ( sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0"}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) ) / sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) diff --git a/assets/cortex/dashboards/cortex-compactor-resources.json b/assets/cortex/dashboards/cortex-compactor-resources.json index b395a70..2759ad0 100644 --- a/assets/cortex/dashboards/cortex-compactor-resources.json +++ b/assets/cortex/dashboards/cortex-compactor-resources.json @@ -87,9 +87,7 @@ "timeShift": null, "title": "CPU", "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "sort": 2 }, "type": "graph", "xaxis": { @@ -179,9 +177,7 @@ "timeShift": null, "title": "Memory (workingset)", "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "sort": 2 }, "type": "graph", "xaxis": { @@ -256,9 +252,7 @@ "timeShift": null, "title": "Memory (go heap inuse)", "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "sort": 2 }, "type": "graph", "xaxis": { diff --git a/site/content/ceph/_index.md b/site/content/ceph/_index.md index 7f9d21d..d3d5e89 100644 --- a/site/content/ceph/_index.md +++ b/site/content/ceph/_index.md @@ -144,6 +144,23 @@ labels: severity: critical {{< /code >}} +##### CephOSDFlapping + +{{< code lang="yaml" >}} +alert: CephOSDFlapping +annotations: + description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times in last + 5 minutes. Please check the pod events or ceph status to find out the cause. + message: Ceph storage osd flapping. + severity_level: error + storage_type: ceph +expr: | + changes(ceph_osd_up[5m]) >= 10 +for: 0s +labels: + severity: critical +{{< /code >}} + ##### CephOSDNearFull {{< code lang="yaml" >}} diff --git a/site/content/cortex/_index.md b/site/content/cortex/_index.md index 8ba28d4..01d7b71 100644 --- a/site/content/cortex/_index.md +++ b/site/content/cortex/_index.md @@ -682,7 +682,7 @@ expr: | ( sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0"}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) ) / sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))