From e2f1741958657db6559c09a0f63e2e7e2a7dc634 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 9 Nov 2024 03:29:12 +0000 Subject: [PATCH] assets,site/content: daily assets regeneration --- assets/ceph/alerts.yaml | 1385 ++++- .../dashboards/ceph-cluster-advanced.json | 3813 +++++++++++++ assets/ceph/dashboards/cephfs-overview.json | 360 ++ assets/ceph/dashboards/host-details.json | 1434 +++++ assets/ceph/dashboards/hosts-overview.json | 892 ++++ .../dashboards/multi-cluster-overview.json | 2073 ++++++++ .../ceph/dashboards/osd-device-details.json | 914 ++++ assets/ceph/dashboards/osds-overview.json | 1339 +++++ assets/ceph/dashboards/pool-detail.json | 724 +++ assets/ceph/dashboards/pool-overview.json | 1691 ++++++ assets/ceph/dashboards/radosgw-detail.json | 651 +++ assets/ceph/dashboards/radosgw-overview.json | 1336 +++++ .../dashboards/radosgw-sync-overview.json | 614 +++ assets/ceph/dashboards/rbd-details.json | 465 ++ assets/ceph/dashboards/rbd-overview.json | 885 ++++ assets/ceph/dashboards/rgw-s3-analytics.json | 4715 +++++++++++++++++ assets/ceph/rules.yaml | 27 +- site/content/ceph/_index.md | 1860 +++++-- 18 files changed, 24441 insertions(+), 737 deletions(-) create mode 100644 assets/ceph/dashboards/ceph-cluster-advanced.json create mode 100644 assets/ceph/dashboards/cephfs-overview.json create mode 100644 assets/ceph/dashboards/host-details.json create mode 100644 assets/ceph/dashboards/hosts-overview.json create mode 100644 assets/ceph/dashboards/multi-cluster-overview.json create mode 100644 assets/ceph/dashboards/osd-device-details.json create mode 100644 assets/ceph/dashboards/osds-overview.json create mode 100644 assets/ceph/dashboards/pool-detail.json create mode 100644 assets/ceph/dashboards/pool-overview.json create mode 100644 assets/ceph/dashboards/radosgw-detail.json create mode 100644 assets/ceph/dashboards/radosgw-overview.json create mode 100644 assets/ceph/dashboards/radosgw-sync-overview.json create mode 100644 
assets/ceph/dashboards/rbd-details.json create mode 100644 assets/ceph/dashboards/rbd-overview.json create mode 100644 assets/ceph/dashboards/rgw-s3-analytics.json diff --git a/assets/ceph/alerts.yaml b/assets/ceph/alerts.yaml index c1e2f40..61aca65 100644 --- a/assets/ceph/alerts.yaml +++ b/assets/ceph/alerts.yaml @@ -1,329 +1,1140 @@ groups: -- name: ceph-mgr-status +- name: cluster health rules: - - alert: CephMgrIsAbsent + - alert: CephHealthError annotations: - description: Ceph Manager has disappeared from Prometheus target discovery. - message: Storage metrics collector service not available anymore. - severity_level: critical - storage_type: ceph - expr: | - label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "") + description: The cluster state has been HEALTH_ERROR for more than 5 minutes + on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more + information. + summary: Ceph is in the ERROR state on cluster {{ $labels.cluster }} + expr: ceph_health_status == 2 for: 5m labels: + oid: 1.3.6.1.4.1.50495.1.2.1.2.1 severity: critical - - alert: CephMgrIsMissingReplicas + type: ceph_default + - alert: CephHealthWarning annotations: - description: Ceph Manager is missing replicas. - message: Storage metrics collector service doesn't have required no of replicas. - severity_level: warning - storage_type: ceph - expr: | - sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1 - for: 5m - labels: - severity: warning -- name: ceph-mds-status - rules: - - alert: CephMdsMissingReplicas - annotations: - description: Minimum required replicas for storage metadata service not available. - Might affect the working of storage cluster. - message: Insufficient replicas for storage metadata service. 
- severity_level: warning - storage_type: ceph - expr: | - sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2 - for: 5m - labels: - severity: warning -- name: quorum-alert.rules - rules: - - alert: CephMonQuorumAtRisk - annotations: - description: Storage cluster quorum is low. Contact Support. - message: Storage quorum at risk - severity_level: error - storage_type: ceph - expr: | - count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1) + description: The cluster state has been HEALTH_WARN for more than 15 minutes + on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more + information. + summary: Ceph is in the WARNING state on cluster {{ $labels.cluster }} + expr: ceph_health_status == 1 for: 15m - labels: - severity: critical - - alert: CephMonQuorumLost - annotations: - description: Storage cluster quorum is lost. Contact Support. - message: Storage quorum is lost - severity_level: critical - storage_type: ceph - expr: | - count(kube_pod_status_phase{pod=~"rook-ceph-mon-.*", phase=~"Running|running"} == 1) by (namespace) < 2 - for: 5m - labels: - severity: critical - - alert: CephMonHighNumberOfLeaderChanges - annotations: - description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname - }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently. - message: Storage Cluster has seen many leader changes recently. - severity_level: warning - storage_type: ceph - expr: | - (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95 - for: 5m labels: severity: warning -- name: ceph-node-alert.rules + type: ceph_default +- name: mon rules: - - alert: CephNodeDown + - alert: CephMonDownQuorumAtRisk annotations: - description: Storage node {{ $labels.node }} went down. Please check the node - immediately. 
- message: Storage node {{ $labels.node }} went down - severity_level: error - storage_type: ceph + description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=''%s''}) + / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority + of monitors (x {{ $min }}) to be active. Without quorum the cluster will become + inoperable, affecting all services and connected clients. The following monitors + are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} == 0) + + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster + | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: Monitor quorum is at risk on cluster {{ $labels.cluster }} expr: | - cluster:ceph_node_down:join_kube == 0 + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) ( + count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1) + ) + ) == 1 for: 30s labels: + oid: 1.3.6.1.4.1.50495.1.2.1.3.1 severity: critical -- name: osd-alert.rules - rules: - - alert: CephOSDCriticallyFull + type: ceph_default + - alert: CephMonDown annotations: - description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class - type {{$labels.device_class}} has crossed 80% on host {{ $labels.hostname - }}. Immediately free up some space or add capacity of type {{$labels.device_class}}. - message: Back-end storage device is critically full. - severity_level: error - storage_type: ceph + description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=''%s''} + == 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down + 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum + is still intact, but the loss of an additional monitor will make your cluster + inoperable. 
The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} + == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * + 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname + }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: One or more monitors down on cluster {{ $labels.cluster }} expr: | - (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80 - for: 40s - labels: - severity: critical - - alert: CephOSDFlapping - annotations: - description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times - in last 5 minutes. Please check the pod events or ceph status to find out - the cause. - message: Ceph storage osd flapping. - severity_level: error - storage_type: ceph - expr: | - changes(ceph_osd_up[5m]) >= 10 - for: 0s - labels: - severity: critical - - alert: CephOSDNearFull - annotations: - description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class - type {{$labels.device_class}} has crossed 75% on host {{ $labels.hostname - }}. Immediately free up some space or add capacity of type {{$labels.device_class}}. - message: Back-end storage device is nearing full. - severity_level: warning - storage_type: ceph - expr: | - (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75 - for: 40s + (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) + for: 30s labels: severity: warning - - alert: CephOSDDiskNotResponding + type: ceph_default + - alert: CephMonDiskspaceCritical annotations: - description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host - }}. 
- message: Disk not responding - severity_level: error - storage_type: ceph - expr: | - label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") - for: 15m - labels: - severity: critical - - alert: CephOSDDiskUnavailable - annotations: - description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host - }}. - message: Disk not accessible - severity_level: error - storage_type: ceph - expr: | - label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") + description: The free space available to a monitor's store is critically low. + You should increase the space available to the monitor(s). The default directory + is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db + on the mon pod's worker node for Rook. Look for old, rotated versions of *.log + and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories + under /var/lib/rook and other directories on the same filesystem, often /var/log + and /var/tmp are culprits. Your monitor hosts are; {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} {{- end }} + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit + summary: Filesystem space on at least one monitor is critically low on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 for: 1m labels: + oid: 1.3.6.1.4.1.50495.1.2.1.3.2 severity: critical - - alert: CephOSDSlowOps + type: ceph_default + - alert: CephMonDiskspaceLow annotations: - description: '{{ $value }} Ceph OSD requests are taking too long to process. - Please check ceph status to find out the cause.' 
- message: OSD requests are taking too long to process. - severity_level: warning - storage_type: ceph - expr: | - ceph_healthcheck_slow_ops > 0 + description: The space available to a monitor's store is approaching full (>70% + is the default). You should increase the space available to the monitor(s). + The default directory is /var/lib/ceph/mon-*/data/store.db on traditional + deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker + node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do + NOT touch any *.sst files. Also check any other directories under /var/lib/rook + and other directories on the same filesystem, often /var/log and /var/tmp + are culprits. Your monitor hosts are; {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} {{- end }} + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low + summary: Drive space on at least one monitor is approaching full on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + - alert: CephMonClockSkew + annotations: + description: Ceph monitors rely on closely synchronized time to maintain quorum + and cluster consistency. This event indicates that the time on at least one + mon has drifted too far from the lead mon. Review cluster status with ceph + -s. This will show which monitors are affected. Check the time sync status + on each monitor host with 'ceph time-sync-status' and the state and peers + of your ntpd or chrony daemon. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew + summary: Clock skew detected among monitors on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default +- name: osd + rules: + - alert: CephOSDDownHigh + annotations: + description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=''%s''} + == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with + printf "count (ceph_osd_up{cluster=''%s''})" .Labels.cluster | query }}{{ + . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are + down: {{- range printf "(ceph_osd_up{cluster=''%s''} * on(cluster, ceph_daemon) + group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - + {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' + summary: More than 10% of OSDs are down on cluster {{ $labels.cluster }} + expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) + * 100 >= 10 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.1 + severity: critical + type: ceph_default + - alert: CephOSDHostDown + annotations: + description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=''%s''} + * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster + | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}' + summary: An OSD host is offline on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.8 + severity: warning + type: ceph_default + - alert: CephOSDDown + annotations: + description: '{{ $num := printf "count(ceph_osd_up{cluster=''%s''} == 0) " .Labels.cluster + | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ + end }}{{ $num }} OSD{{ $s }} down for over 5mins. 
The following OSD{{ $s }} + {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=''%s''} + * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster + | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down + summary: An OSD has been marked down on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="OSD_DOWN"} == 1 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.2 + severity: warning + type: ceph_default + - alert: CephOSDNearFull + annotations: + description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph + health detail' and 'ceph osd df' to identify the problem. To resolve, add + capacity to the affected OSD's failure domain, restore down/out OSDs, or delete + unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull + summary: OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.3 + severity: warning + type: ceph_default + - alert: CephOSDFull + annotations: + description: An OSD has reached the FULL threshold. Writes to pools that share + the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' + to identify the problem. To resolve, add capacity to the affected OSD's failure + domain, restore down/out OSDs, or delete unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full + summary: OSD full, writes blocked on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="OSD_FULL"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.6 + severity: critical + type: ceph_default + - alert: CephOSDBackfillFull + annotations: + description: An OSD has reached the BACKFILL FULL threshold. 
This will prevent + rebalance operations from completing. Use 'ceph health detail' and 'ceph osd + df' to identify the problem. To resolve, add capacity to the affected OSD's + failure domain, restore down/out OSDs, or delete unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull + summary: OSD(s) too full for backfill operations on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephOSDTooManyRepairs + annotations: + description: Reads from an OSD have used a secondary PG to return data to the + client, indicating a potential failing drive. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs + summary: OSD reports a high number of read errors on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 for: 30s labels: severity: warning - - alert: CephDataRecoveryTakingTooLong + type: ceph_default + - alert: CephOSDTimeoutsPublicNetwork annotations: - description: Data recovery has been active for too long. Contact Support. - message: Data recovery is slow - severity_level: warning - storage_type: ceph - expr: | - ceph_pg_undersized > 0 - for: 2h + description: OSD heartbeats on the cluster's 'public' network (frontend) are + running slow. Investigate the network for latency or loss issues. Use 'ceph + health detail' to show the affected OSDs. + summary: Network issues delaying OSD heartbeats (public network) on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 + for: 1m labels: severity: warning - - alert: CephPGRepairTakingTooLong + type: ceph_default + - alert: CephOSDTimeoutsClusterNetwork annotations: - description: Self heal operations taking too long. Contact Support. 
- message: Self heal problems detected - severity_level: warning - storage_type: ceph + description: OSD heartbeats on the cluster's 'cluster' network (backend) are + slow. Investigate the network for latency issues on this subnet. Use 'ceph + health detail' to show the affected OSDs. + summary: Network issues delaying OSD heartbeats (cluster network) on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephOSDInternalDiskSizeMismatch + annotations: + description: One or more OSDs have an internal inconsistency between metadata + and the size of the device. This could lead to the OSD(s) crashing in future. + You should redeploy the affected OSDs. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch + summary: OSD size inconsistency error on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephDeviceFailurePredicted + annotations: + description: The device health module has determined that one or more devices + will fail soon. To review device status use 'ceph device ls'. To show a specific + device use 'ceph device info <devid>'. Mark the OSD out so that data may + migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace + the device, and redeploy the OSD. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 + summary: Device(s) predicted to fail soon on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephDeviceFailurePredictionTooHigh + annotations: + description: The device health module has determined that devices predicted + to fail can not be remediated automatically, since too many OSDs would be + removed from the cluster to ensure performance and availability. Prevent data + integrity issues by adding new OSDs so that data may be relocated. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany + summary: Too many devices are predicted to fail on cluster {{ $labels.cluster + }}, unable to resolve + expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.7 + severity: critical + type: ceph_default + - alert: CephDeviceFailureRelocationIncomplete + annotations: + description: "The device health module has determined that one or more devices + will fail soon, but the normal process of relocating the data on the device + to other OSDs in the cluster is blocked. \nEnsure that the cluster has available + free space. It may be necessary to add capacity to the cluster to allow data + from the failing device to successfully migrate, or to enable the balancer." + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use + summary: Device failure is predicted, but unable to relocate data on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephOSDFlapping + annotations: + description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked + down and back up {{ $value | humanize }} times once a minute for 5 minutes. 
+ This may indicate a network issue (latency, packet loss, MTU mismatch) on + the cluster network, or the public network if no cluster network is deployed. + Check the network stats on the listed host(s). + documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds + summary: Network issues are causing OSDs to flap (mark each other down) on cluster + {{ $labels.cluster }} + expr: (rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) + * 60 > 1 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.4 + severity: warning + type: ceph_default + - alert: CephOSDReadErrors + annotations: + description: An OSD has encountered read errors, but the OSD has recovered by + retrying the reads. This may indicate an issue with hardware or the kernel. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors + summary: Device read errors detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 + for: 30s + labels: + severity: warning + type: ceph_default + - alert: CephPGImbalance + annotations: + description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates + by more than 30% from average PG count. + summary: PGs are not balanced across OSDs on cluster {{ $labels.cluster }} expr: | - ceph_pg_inconsistent > 0 + abs( + ((ceph_osd_numpg > 0) - on (cluster,job) group_left avg(ceph_osd_numpg > 0) by (cluster,job)) / + on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on (cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.5 + severity: warning + type: ceph_default +- name: mds + rules: + - alert: CephFilesystemDamaged + annotations: + description: Filesystem metadata has been corrupted. Data may be inaccessible. + Analyze metrics from the MDS daemon admin socket, or escalate to support. 
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem is damaged on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.1 + severity: critical + type: ceph_default + - alert: CephFilesystemOffline + annotations: + description: All MDS ranks are unavailable. The MDS daemons managing metadata + are down, rendering the filesystem offline. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down + summary: CephFS filesystem is offline on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.3 + severity: critical + type: ceph_default + - alert: CephFilesystemDegraded + annotations: + description: One or more metadata daemons (MDS ranks) are failed or in a damaged + state. At best the filesystem is partially available, at worst the filesystem + is completely unusable. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded + summary: CephFS filesystem is degraded on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="FS_DEGRADED"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.4 + severity: critical + type: ceph_default + - alert: CephFilesystemMDSRanksLow + annotations: + description: The filesystem's 'max_mds' setting defines the number of MDS ranks + in the filesystem. The current number of active MDS daemons is less than this + value. 
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max + summary: Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephFilesystemInsufficientStandby + annotations: + description: The current number of standby daemons is less than the minimum + required by standby_count_wanted. Adjust the standby count + or increase the number of MDS daemons. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby + summary: Ceph filesystem standby daemons too few on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephFilesystemFailureNoStandby + annotations: + description: An MDS daemon has failed, leaving only one active rank and no available + standby. Investigate the cause of the failure or add a standby MDS. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds + summary: MDS daemon failed, no further standby available on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.5 + severity: critical + type: ceph_default + - alert: CephFilesystemReadOnly + annotations: + description: The filesystem has switched to READ ONLY due to an unexpected error + when writing to the metadata pool. Either analyze the output from the MDS + daemon admin socket, or escalate to support. 
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem in read only mode due to write error(s) on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.2 + severity: critical + type: ceph_default +- name: mgr + rules: + - alert: CephMgrModuleCrash + annotations: + description: One or more mgr modules have crashed and have yet to be acknowledged + by an administrator. A crashed module may impact functionality within the + cluster. Use the 'ceph crash' command to determine which module has failed, + and archive it to acknowledge the failure. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash + summary: A manager module has recently crashed on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.6.1 + severity: critical + type: ceph_default + - alert: CephMgrPrometheusModuleInactive + annotations: + description: The mgr/prometheus module at {{ $labels.instance }} is unreachable. + This could mean that the module has been disabled or the mgr daemon itself + is down. Without the mgr/prometheus module, metrics and alerts will no longer + function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to + determine whether the mgr is active. If the mgr is not active, restart + it, otherwise you can determine module status with 'ceph mgr module ls'. If + it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'. 
+ summary: The mgr/prometheus module is not available + expr: up{job="ceph"} == 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.6.2 + severity: critical + type: ceph_default +- name: pgs + rules: + - alert: CephPGsInactive + annotations: + description: '{{ $value }} PGs have been inactive for more than 5 minutes in + pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write + requests.' + summary: One or more placement groups are inactive on cluster {{ $labels.cluster + }} + expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total + - ceph_pg_active) > 0 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.1 + severity: critical + type: ceph_default + - alert: CephPGsUnclean + annotations: + description: '{{ $value }} PGs have been unclean for more than 15 minutes in + pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.' + summary: One or more placement groups are marked unclean on cluster {{ $labels.cluster + }} + expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total + - ceph_pg_clean) > 0 + for: 15m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.2 + severity: warning + type: ceph_default + - alert: CephPGsDamaged + annotations: + description: During data consistency checks (scrub), at least one PG has been + flagged as being damaged or inconsistent. Check to see which PG is affected, + and attempt a manual repair if necessary. To list problematic placement groups, + use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair + <pgid>' command. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged + summary: Placement group damaged, manual intervention needed on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.4 + severity: critical + type: ceph_default + - alert: CephPGRecoveryAtRisk + annotations: + description: Data redundancy is at risk since one or more OSDs are at or above + the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, + or delete unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full + summary: OSDs are too full for recovery on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.5 + severity: critical + type: ceph_default + - alert: CephPGUnavailableBlockingIO + annotations: + description: Data availability is reduced, impacting the cluster's ability to + service I/O. One or more placement groups (PGs) are in a state that blocks + I/O. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability + summary: PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O + expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) + == 1 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.3 + severity: critical + type: ceph_default + - alert: CephPGBackfillAtRisk + annotations: + description: Data redundancy may be at risk due to lack of free space within + the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add + more capacity, or delete unwanted data. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full + summary: Backfill operations are blocked due to lack of free space on cluster + {{ $labels.cluster }} + expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.6 + severity: critical + type: ceph_default + - alert: CephPGNotScrubbed + annotations: + description: 'One or more PGs have not been scrubbed recently. Scrubs check + metadata integrity, protecting against bit-rot. They check that metadata is + consistent across data replicas. When PGs miss their scrub interval, it may + indicate that the scrub window is too small, or PGs were not in a ''clean'' + state during the scrub window. You can manually initiate a scrub with: ceph + pg scrub ' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed + summary: Placement group(s) have not been scrubbed on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + - alert: CephPGsHighPerOSD + annotations: + description: |- + The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). + Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs + summary: Placement groups per OSD is too high on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: CephPGNotDeepScrubbed + annotations: + description: One or more PGs have not been deep scrubbed recently. Deep scrubs + protect against bit-rot. They compare data replicas to ensure consistency. + When PGs miss their deep scrub interval, it may indicate that the window is + too small or PGs were not in a 'clean' state during the deep-scrub window. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed + summary: Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default +- name: nodes + rules: + - alert: CephNodeRootFilesystemFull + annotations: + description: 'Root volume is dangerously full: {{ $value | humanize }}% free.' + summary: Root filesystem is dangerously full + expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} + * 100 < 5 + for: 5m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.1 + severity: critical + type: ceph_default + - alert: CephNodeNetworkPacketDrops + annotations: + description: Node {{ $labels.instance }} experiences packet drop > 0.5% or > + 10 packets/s on interface {{ $labels.device }}. 
+ summary: One or more NICs reports packet drops + expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.2 + severity: warning + type: ceph_default + - alert: CephNodeNetworkPacketErrors + annotations: + description: Node {{ $labels.instance }} experiences packet errors > 0.01% or + > 10 packets/s on interface {{ $labels.device }}. + summary: One or more NICs reports packet errors on cluster {{ $labels.cluster + }} + expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.3 + severity: warning + type: ceph_default + - alert: CephNodeNetworkBondDegraded + annotations: + description: Bond {{ $labels.master }} is degraded on Node {{ $labels.instance + }}. + summary: Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster + }} + expr: | + node_bonding_slaves - node_bonding_active != 0 + labels: + severity: warning + type: ceph_default + - alert: CephNodeDiskspaceWarning + annotations: + description: Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will + be full in less than 5 days based on the 48 hour trailing fill rate. 
+ summary: Host filesystem free space is getting low on cluster {{ $labels.cluster + }} + expr: predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 + * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.4 + severity: warning + type: ceph_default + - alert: CephNodeInconsistentMTU + annotations: + description: Node {{ $labels.instance }} has a different MTU size ({{ $value + }}) than the median of devices named {{ $labels.device }}. + summary: MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster + }} + expr: node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max + by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min + by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) ) + labels: + severity: warning + type: ceph_default +- name: pools + rules: + - alert: CephPoolGrowthWarning + annotations: + description: Pool '{{ $labels.name }}' will be full in less than 5 days assuming + the average fill-up rate of the past 48 hours. + summary: Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster + }} + expr: (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, + instance) group_right() ceph_pool_metadata) >= 95 + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.9.2 + severity: warning + type: ceph_default + - alert: CephPoolBackfillFull + annotations: + description: A pool is approaching the near full threshold, which will prevent + recovery/backfill operations from completing. Consider adding more capacity. 
+ summary: Free space in a pool is too low for recovery/backfill on cluster {{ + $labels.cluster }} + expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 + labels: + severity: warning + type: ceph_default + - alert: CephPoolFull + annotations: + description: A pool has reached its MAX quota, or OSDs supporting the pool have + reached the FULL threshold. Until this is resolved, writes to the pool will + be blocked. Pool Breakdown (top 5) {{- range printf "topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} + * on(cluster,pool_id) group_right ceph_pool_metadata))" .Labels.cluster | + query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's + quota, or add capacity to the cluster first then increase the pool's quota + (e.g. ceph osd pool set quota max_bytes ) + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full + summary: Pool is full - writes are blocked on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="POOL_FULL"} > 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.9.1 + severity: critical + type: ceph_default + - alert: CephPoolNearFull + annotations: + description: A pool has exceeded the warning (percent full) threshold, or OSDs + supporting the pool have reached the NEARFULL threshold. Writes may continue, + but you are at risk of the pool going read-only if more capacity isn't made + available. Determine the affected pool with 'ceph df detail', looking at QUOTA + BYTES and STORED. Increase the pool's quota, or add capacity to the cluster + first then increase the pool's quota (e.g. ceph osd pool set quota + max_bytes ). Also ensure that the balancer is active. 
+ summary: One or more Ceph pools are nearly full on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 + for: 5m + labels: + severity: warning + type: ceph_default +- name: healthchecks + rules: + - alert: CephSlowOps + annotations: + description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time + exceeded)' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: OSD operations are slow to complete on cluster {{ $labels.cluster }} + expr: ceph_healthcheck_slow_ops > 0 + for: 30s + labels: + severity: warning + type: ceph_default + - alert: CephDaemonSlowOps + annotations: + description: '{{ $labels.ceph_daemon }} operations are taking too long to process + (complaint time exceeded)' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: '{{ $labels.ceph_daemon }} operations are slow to complete on cluster + {{ $labels.cluster }}' + expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0 + for: 30s + labels: + severity: warning + type: ceph_default +- name: cephadm + rules: + - alert: CephadmUpgradeFailed + annotations: + description: The cephadm cluster upgrade process has failed. The cluster remains + in an undetermined state. Please review the cephadm logs, to understand the + nature of the issue + summary: Ceph version upgrade has failed on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.11.2 + severity: critical + type: ceph_default + - alert: CephadmDaemonFailed + annotations: + description: A daemon managed by cephadm is no longer active. Determine, which + daemon is down with 'ceph health detail'. 
you may start daemons with the 'ceph + orch daemon start ' + summary: A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.11.1 + severity: critical + type: ceph_default + - alert: CephadmPaused + annotations: + description: Cluster management has been paused manually. This will prevent + the orchestrator from service management and reconciliation. If this is not + intentional, resume cephadm operations with 'ceph orch resume' + documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused + summary: Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster + }} + expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default +- name: hardware + rules: + - alert: HardwareStorageError + annotations: + description: Some storage devices are in error. Check `ceph health detail`. + summary: Storage devices error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.1 + severity: critical + type: ceph_default + - alert: HardwareMemoryError + annotations: + description: DIMM error(s) detected. Check `ceph health detail`. + summary: DIMM error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.2 + severity: critical + type: ceph_default + - alert: HardwareProcessorError + annotations: + description: Processor error(s) detected. Check `ceph health detail`. 
+ summary: Processor error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.3 + severity: critical + type: ceph_default + - alert: HardwareNetworkError + annotations: + description: Network error(s) detected. Check `ceph health detail`. + summary: Network error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.4 + severity: critical + type: ceph_default + - alert: HardwarePowerError + annotations: + description: Power supply error(s) detected. Check `ceph health detail`. + summary: Power supply error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.5 + severity: critical + type: ceph_default + - alert: HardwareFanError + annotations: + description: Fan error(s) detected. Check `ceph health detail`. + summary: Fan error(s) detected on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.6 + severity: critical + type: ceph_default +- name: PrometheusServer + rules: + - alert: PrometheusJobMissing + annotations: + description: The prometheus job that scrapes from Ceph is no longer defined, + this will effectively mean you'll have no metrics or alerts for the cluster. Please + review the job definitions in the prometheus.yml file of the prometheus instance. + summary: The scrape job for Ceph is missing from Prometheus + expr: absent(up{job="ceph"}) + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.12.1 + severity: critical + type: ceph_default +- name: rados + rules: + - alert: CephObjectMissing + annotations: + description: The latest version of a RADOS object can not be found, even though + all OSDs are up. I/O requests for this object from clients will block (hang). 
+ Resolving this issue may require the object to be rolled back to a prior version + manually, and manually verified. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound + summary: Object(s) marked UNFOUND on cluster {{ $labels.cluster }} + expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster) + (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) + == 1 + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.1 + severity: critical + type: ceph_default +- name: generic + rules: + - alert: CephDaemonCrash + annotations: + description: One or more daemons have crashed recently, and need to be acknowledged. + This notification ensures that software crashes do not go unseen. To acknowledge + a crash, use the 'ceph crash archive ' command. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash + summary: One or more Ceph daemons have crashed, and are pending acknowledgement + on cluster {{ $labels.cluster }} + expr: ceph_health_detail{name="RECENT_CRASH"} == 1 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.1.2 + severity: critical + type: ceph_default +- name: rbdmirror + rules: + - alert: CephRBDMirrorImagesPerDaemonHigh + annotations: + description: Number of image replications per daemon is not supposed to go beyond + threshold 100 + summary: Number of image replications are now above 100 on cluster {{ $labels.cluster + }} + expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) + > 100 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.2 + severity: critical + type: ceph_default + - alert: CephRBDMirrorImagesNotInSync + annotations: + description: Both local and remote RBD mirror images should be in sync. 
+ summary: Some of the RBD mirror images are not in sync with the remote counter + parts on cluster {{ $labels.cluster }} + expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, + ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) + - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) + != 0 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.3 + severity: critical + type: ceph_default + - alert: CephRBDMirrorImagesNotInSyncVeryHigh + annotations: + description: More than 10% of the images have synchronization problems. + summary: Number of unsynchronized images are very high on cluster {{ $labels.cluster + }} + expr: count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, + namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk + by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) + != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1) + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.4 + severity: critical + type: ceph_default + - alert: CephRBDMirrorImageTransferBandwidthHigh + annotations: + description: Detected a heavy increase in bandwidth for rbd replications (over + 80%) in the last 30 min. This might not be a problem, but it is good to review + the number of images being replicated simultaneously + summary: The replication network usage on cluster {{ $labels.cluster }} has + been increased over 80% in the last 30 minutes. Review the number of images + being replicated. 
This alert will be cleaned automatically after 30 minutes + expr: rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80 + for: 1m + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.5 + severity: warning + type: ceph_default +- name: nvmeof + rules: + - alert: NVMeoFSubsystemNamespaceLimit + annotations: + description: Subsystems have a max namespace limit defined at creation time. + This alert means that no more namespaces can be added to {{ $labels.nqn }} + summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces + on cluster {{ $labels.cluster }}' + expr: (count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit + for: 1m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFTooManyGateways + annotations: + description: You may create many gateways, but 4 is the tested limit + summary: Max supported gateways exceeded on cluster {{ $labels.cluster }} + expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFMaxGatewayGroupSize + annotations: + description: You may create many gateways in a gateway group, but 4 is the tested + limit + summary: Max gateways within a gateway group ({{ $labels.group }}) exceeded + on cluster {{ $labels.cluster }} + expr: count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFSingleGatewayGroup + annotations: + description: Although a single member gateway group is valid, it should only + be used for test purposes + summary: The gateway group {{ $labels.group }} consists of a single gateway + - HA is not possible on cluster {{ $labels.cluster }} + expr: count(ceph_nvmeof_gateway_info) by(cluster,group) == 1 + for: 5m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFHighGatewayCPU + annotations: + description: Typically, high CPU may indicate degraded performance. 
Consider + increasing the number of reactor cores + summary: CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster + {{ $labels.cluster }} + expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") + > 80.00 + for: 10m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFGatewayOpenSecurity + annotations: + description: It is good practice to ensure subsystems use host security to reduce + the risk of unexpected data loss + summary: Subsystem {{ $labels.nqn }} has been defined without host level security + on cluster {{ $labels.cluster }} + expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"} + for: 5m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFTooManySubsystems + annotations: + description: Although you may continue to create subsystems in {{ $labels.gateway_host + }}, the configuration may not be supported + summary: The number of subsystems defined to the gateway exceeds supported values + on cluster {{ $labels.cluster }} + expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) + > 16.00 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFVersionMismatch + annotations: + description: This may indicate an issue with deployment. Check cephadm logs + summary: Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster + }} + expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) + > 1 for: 1h labels: severity: warning -- name: persistent-volume-alert.rules - rules: - - alert: PersistentVolumeUsageNearFull + type: ceph_default + - alert: NVMeoFHighClientCount annotations: - description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed - 75%. Free up some space or expand the PVC. - message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. 
Data deletion - or PVC expansion is required. - severity_level: warning - storage_type: ceph - expr: | - (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75 - for: 5s - labels: - severity: warning - - alert: PersistentVolumeUsageCritical - annotations: - description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed - 85%. Free up some space or expand the PVC immediately. - message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion - or PVC expansion is required. - severity_level: error - storage_type: ceph - expr: | - (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85 - for: 5s - labels: - severity: critical -- name: cluster-state-alert.rules - rules: - - alert: CephClusterErrorState - annotations: - description: Storage cluster is in error state for more than 10m. 
- message: Storage cluster is in error state - severity_level: error - storage_type: ceph - expr: | - ceph_health_status{job="rook-ceph-mgr"} > 1 - for: 10m - labels: - severity: critical - - alert: CephClusterWarningState - annotations: - description: Storage cluster is in warning state for more than 10m. - message: Storage cluster is in degraded state - severity_level: warning - storage_type: ceph - expr: | - ceph_health_status{job="rook-ceph-mgr"} == 1 - for: 15m - labels: - severity: warning - - alert: CephOSDVersionMismatch - annotations: - description: There are {{ $value }} different versions of Ceph OSD components - running. - message: There are multiple versions of storage services running. - severity_level: warning - storage_type: ceph - expr: | - count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1 - for: 10m - labels: - severity: warning - - alert: CephMonVersionMismatch - annotations: - description: There are {{ $value }} different versions of Ceph Mon components - running. - message: There are multiple versions of storage services running. - severity_level: warning - storage_type: ceph - expr: | - count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1 - for: 10m - labels: - severity: warning -- name: cluster-utilization-alert.rules - rules: - - alert: CephClusterNearFull - annotations: - description: Storage cluster utilization has crossed 75% and will become read-only - at 85%. Free up some space or expand the storage cluster. - message: Storage cluster is nearing full. Data deletion or cluster expansion - is required. - severity_level: warning - storage_type: ceph - expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75 - for: 5s - labels: - severity: warning - - alert: CephClusterCriticallyFull - annotations: - description: Storage cluster utilization has crossed 80% and will become read-only - at 85%. 
Free up some space or expand the storage cluster immediately. - message: Storage cluster is critically full and needs immediate data deletion - or cluster expansion. - severity_level: error - storage_type: ceph - expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80 - for: 5s - labels: - severity: critical - - alert: CephClusterReadOnly - annotations: - description: Storage cluster utilization has crossed 85% and will become read-only - now. Free up some space or expand the storage cluster immediately. - message: Storage cluster is read-only now and needs immediate data deletion - or cluster expansion. - severity_level: error - storage_type: ceph - expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85 - for: 0s - labels: - severity: critical -- name: pool-quota.rules - rules: - - alert: CephPoolQuotaBytesNearExhaustion - annotations: - description: Storage pool {{ $labels.name }} quota usage has crossed 70%. - message: Storage pool quota(bytes) is near exhaustion. - severity_level: warning - storage_type: ceph - expr: | - (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.70 + description: The supported limit for clients connecting to a subsystem is 32 + summary: The number of clients connected to {{ $labels.nqn }} is too high on + cluster {{ $labels.cluster }} + expr: ceph_nvmeof_subsystem_host_count > 32.00 for: 1m labels: severity: warning - - alert: CephPoolQuotaBytesCriticallyExhausted + type: ceph_default + - alert: NVMeoFHighHostCPU annotations: - description: Storage pool {{ $labels.name }} quota usage has crossed 90%. - message: Storage pool quota(bytes) is critically exhausted. 
- severity_level: critical - storage_type: ceph - expr: | - (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.90 - for: 1m + description: High CPU on a gateway host can lead to CPU contention and performance + degradation + summary: The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host + }}) on cluster {{ $labels.cluster }} + expr: 100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) + * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) + >= 80.00 + for: 10m labels: - severity: critical + severity: warning + type: ceph_default + - alert: NVMeoFInterfaceDown + annotations: + description: A NIC used by one or more subsystems is in a down state + summary: Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster + }} + expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"} + for: 30s + labels: + oid: 1.3.6.1.4.1.50495.1.2.1.14.1 + severity: warning + type: ceph_default + - alert: NVMeoFInterfaceDuplex + annotations: + description: Until this is resolved, performance from the gateway will be degraded + summary: Network interface {{ $labels.device }} is not running in full duplex + mode on cluster {{ $labels.cluster }} + expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"} + for: 30s + labels: + severity: warning + type: ceph_default + - alert: NVMeoFHighReadLatency + annotations: + description: High latencies may indicate a constraint within the cluster e.g. + CPU, network. 
Please investigate + summary: The average read latency over the last 5 mins has reached 10 ms or + more on {{ $labels.gateway }} + expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) + / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") + > 0.01 + for: 5m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFHighWriteLatency + annotations: + description: High latencies may indicate a constraint within the cluster e.g. + CPU, network. Please investigate + summary: The average write latency over the last 5 mins has reached 20 ms or + more on {{ $labels.gateway }} + expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) + / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") + > 0.02 + for: 5m + labels: + severity: warning + type: ceph_default diff --git a/assets/ceph/dashboards/ceph-cluster-advanced.json b/assets/ceph/dashboards/ceph-cluster-advanced.json new file mode 100644 index 0000000..ff31ebf --- /dev/null +++ b/assets/ceph/dashboards/ceph-cluster-advanced.json @@ -0,0 +1,3813 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "heatmap", + "name": "Heatmap", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Ceph cluster overview", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + 
"collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATE", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "0": { + "text": "HEALTHY" + }, + "1": { + "text": "WARNING" + }, + "2": { + "text": "ERROR" + } + }, + "type": "value" + }, + { + "id": 1, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#9ac48a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 2 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 3, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_health_status{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Ceph health status", + "transparent": true, + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"rgba(245, 54, 54, 0.9)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.10000000000000001 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0.29999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 4, + "interval": "1m", + "links": [ ], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_cluster_total_bytes{cluster=~\"$cluster\", }-ceph_cluster_total_used_bytes{cluster=~\"$cluster\", })/ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Available Capacity", + "transparent": false, + "type": "gauge" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 2, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 1 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 5, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": 
"ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Cluster Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write Throughput", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 7, + "links": [ ], + "options": { + "colorMode": "none", + 
"graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read Throughput", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgb(255, 0, 0)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 0, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 8, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "All", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + 
"intervalFactor": 1, + "legendFormat": "All", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "In", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_in{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "In", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Out", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in{cluster=~\"$cluster\", } == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Out", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + }, + { + "aggregation": "Last", + "alias": "Up", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Up", + "refId": "D", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Down", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up{cluster=~\"$cluster\", } == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + } + ], + "title": "OSDs", + "transparent": false, + "type": "stat" + }, + { 
+ "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 9, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~\"$cluster\", } == 1) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status{cluster=~\"$cluster\", } == 0) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Standby", + "refId": "B", + "units": "none", + 
"valueHandler": "Number Threshold" + } + ], + "title": "MGRs", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#987d24", + "mode": "fixed" + } + } + ] + } + ] + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 4 + }, + "id": 10, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"critical\", cluster=~\"$cluster\", }) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 
1, + "legendFormat": "Critical", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"warning\", cluster=~\"$cluster\", }) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Warning", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "Firing Alerts", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 0.10000000000000001 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 4 + }, + "id": 11, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_used_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Used Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, 
+ "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 4 + }, + "id": 12, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write IOPS", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 4 + }, + "id": 13, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": 
"$datasource", + "expr": "sum(irate(ceph_osd_op_r{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read IOPS", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "$datasource", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 4 + }, + "id": 14, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "In Quorum", + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "In Quorum", + "refId": "A", + "units": "none", + "valueHandler": "Text Only" + }, + { + "aggregation": "Last", + "alias": "Total", + "crit": 1, + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": 
"When Alias Displayed", + "expr": "count(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "B", + "units": "none", + "valueHandler": "Text Only", + "warn": 2 + }, + { + "aggregation": "Last", + "alias": "MONs out of Quorum", + "crit": 1.6000000000000001, + "datasource": "$datasource", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Annotation", + "displayValueWithAlias": "Never", + "expr": "count(ceph_mon_quorum_status{cluster=~\"$cluster\", }) - sum(ceph_mon_quorum_status{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MONs out of Quorum", + "range": true, + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1.1000000000000001 + } + ], + "title": "Monitors", + "transparent": false, + "type": "stat" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 15, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 16, + "limit": 10, + "onlyAlertsOnDashboard": true, + "options": { + "alertInstanceLabelFilter": "{alertname=~\"^Ceph.+\", cluster=~\"$cluster\", }", + "alertName": "", + "dashboardAlerts": false, + "groupBy": [ ], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ ], + "title": "Alerts", + "type": "alertlist" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": 
{ + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#c0921f", + "value": 75 + }, + { + "color": "#E02F44", + "value": 85 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total Capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + }, + { + "id": "custom.thresholdsStyle", + "value": { + "mode": "dashed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 17, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_bytes{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total Capacity", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "ceph_cluster_total_used_bytes{cluster=~\"$cluster\", }", + 
"format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "Capacity", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 18, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=~\"$cluster\", }[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + 
], + "title": "Cluster Throughput", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 19, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_w{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_op_r{cluster=~\"$cluster\", }[1m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "IOPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 20, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_bytes_used{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + 
"stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rbd Stored" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "transparent", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 21, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_stored_raw{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used RAW Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" 
+ }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 22, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_quota_objects{cluster=~\"$cluster\", }) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Objects Quota", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 23, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_quota_bytes{cluster=~\"$cluster\", 
}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Quota Bytes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 24, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "(ceph_pool_objects{cluster=~\"$cluster\", }) * on (pool_id) group_left(name)(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Objects Per Pool", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 25, + "panels": [ ], + "repeat": null, + "repeatIteration": null, +
"repeatRowId": null, + "showTitle": true, + "title": "OBJECTS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 6, + "x": 0, + "y": 32 + }, + "id": 26, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pool_objects{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 200 + } + ], + "title": "Total Objects", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": 
"auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 6, + "y": 32 + }, + "id": 27, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_active{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Active", + "range": true, + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_clean{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Clean", + "range": true, + "refId": "B" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_peering{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peering", + "range": true, + "refId": "C" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_degraded{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": 
"$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "D", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_stale{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "E", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_unclean_pgs{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Unclean", + "range": true, + "refId": "F", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_undersized{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "G", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_incomplete{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Incomplete", + "range": true, + "refId": "H" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_forced_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Forced Backfill", + "range": true, + "refId": "I" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_forced_recovery{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Forced Recovery", + "range": true, + "refId": "J" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_creating{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Creating", + "range": true, + "refId": "K" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_wait_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + 
"interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Wait Backfill", + "range": true, + "refId": "L" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_deep{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Deep", + "range": true, + "refId": "M" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_scrubbing{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Scrubbing", + "range": true, + "refId": "N" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_recovering{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Recovering", + "range": true, + "refId": "O" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_repair{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Repair", + "range": true, + "refId": "P" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_down{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Down", + "range": true, + "refId": "Q" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_peered{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peered", + "range": true, + "refId": "R" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_backfill{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill", + "range": true, + "refId": "S" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_remapped{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Remapped", + "range": true, + 
"refId": "T" + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_backfill_toofull{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill Toofull", + "range": true, + "refId": "U" + } + ], + "title": "PGs State", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 32 + }, + "id": 28, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_degraded{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": 
"$datasource", + "expr": "sum(ceph_pg_stale{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "B", + "step": 300 + }, + { + "datasource": "$datasource", + "expr": "sum(ceph_pg_undersized{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "C", + "step": 300 + } + ], + "title": "Stuck PGs", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 38 + }, + "id": 29, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(irate(ceph_osd_recovery_ops{cluster=~\"$cluster\", }[$interval]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "OPS", + "refId": "A", + "step": 300 + } + ], + "title": "Recovery 
Operations", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 30, + "panels": [ + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 31, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + }, + "value": "1" + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_osd_apply_latency_ms{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Apply Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": 
null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#65c5db", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 32, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#65c5db", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "ceph_osd_commit_latency_ms{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Commit Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", 
+ "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#806eb7", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 33, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#806eb7", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + 
"refId": "A" + } + ], + "title": "OSD Read Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#f9934e", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 34, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#f9934e", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[5m]) / 
rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Write Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 35, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "avg(rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[5m]) >= 0)", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "avg(rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[5m]) / rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "B" + } + ], + "title": "AVG OSD Read + Write Op Latency", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 36, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "avg(ceph_osd_apply_latency_ms{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "apply", + "metric": "ceph_osd_perf_apply_latency_seconds", + "refId": "A", + "step": 4 + }, + { + "datasource": "$datasource", + "expr": 
"avg(ceph_osd_commit_latency_ms{cluster=~\"$cluster\", })", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "commit", + "metric": "ceph_osd_perf_commit_latency_seconds", + "refId": "B", + "step": 4 + } + ], + "title": "AVG OSD Apply + Commit Latency", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "LATENCY", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 + }, + "id": 37, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6", + "type": "row" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 38, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "OSD Services", + "range": false, + "refId": "A" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by 
(ceph_version)(ceph_mon_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Mon Services", + "range": false, + "refId": "B" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mds_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "MDS Services", + "range": false, + "refId": "C" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_rgw_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "RGW Services", + "range": false, + "refId": "D" + }, + { + "datasource": "$datasource", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mgr_metadata{cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "MGR Services", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Ceph Versions", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { }, + "renameByName": { + "Time": "", + "Value #A": "OSD Services", + "Value #B": "Mon Services", + "Value #C": "MDS Services", + "Value #D": "RGW Services", + "Value #E": "MGR Services", + "ceph_version": "Ceph Version" + } + } + } + ], + "type": "table" + } + ], + "refresh": "1m", + "rows": [ ], + "schemaVersion": 38, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + 
"allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 10, + "auto_min": "1m", + "current": { + "text": "$__auto_interval_interval", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "5s", + "value": "5s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "type": "interval", + "valuelabels": { } + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Cluster - 
Advanced", + "uid": "dn13KBeTv", + "version": 0 +} diff --git a/assets/ceph/dashboards/cephfs-overview.json b/assets/ceph/dashboards/cephfs-overview.json new file mode 100644 index 0000000..f65ce4d --- /dev/null +++ b/assets/ceph/dashboards/cephfs-overview.json @@ -0,0 +1,360 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "MDS Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": 
"/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_objecter_op_r{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Ops", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_objecter_op_w{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Ops", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "MDS Workload - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Reads(-) / Writes (+)", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + 
{ + "expr": "ceph_mds_server_handle_client_request{ceph_daemon=~\"($mds_servers).*\", cluster=~\"$cluster\", }", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Client Request Load - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Client Requests", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "MDS Server", + "multi": false, + "name": "mds_servers", + "options": [ ], + "query": "label_values(ceph_mds_inodes{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + 
"from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "MDS Performance", + "uid": "tbO9LAiZz", + "version": 0 +} diff --git a/assets/ceph/dashboards/host-details.json b/assets/ceph/dashboards/host-details.json new file mode 100644 index 0000000..ef357d3 --- /dev/null +++ b/assets/ceph/dashboards/host-details.json @@ -0,0 +1,1434 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "$ceph_hosts System Overview", + "titleSize": "h6", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 1 + }, 
+ "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{cluster=~\"$cluster\", }))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSDs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + "interrupt": "#447EBC", + "steal": "#6D1F62", + "system": "#890F02", + "user": "#3F6833", + "wait": "#C15C17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the CPU breakdown. 
When multiple servers are selected, only the first host's cpu data is shown", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 3, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode) (\n rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval])\n) / (\n scalar(\n sum(rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]))\n ) * 100\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mode}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "% Utilization", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + 
"aliasColors": { + "Available": "#508642", + "Free": "#508642", + "Total": "#bf1b00", + "Used": "#bf1b00", + "total": "#bf1b00", + "used": "#0a50a1" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 9, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "total", + "color": "#bf1b00", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "A" + }, + { + "expr": "node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "total", + "refId": "B" + }, + { + "expr": "(\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n 
node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "buffers/cache", + "refId": "C" + }, + { + "expr": "(\n node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) - (\n (\n node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) +\n (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "used", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "RAM Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "RAM used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "decbytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 15, 
+ "y": 1 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (device) (\n rate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "sum by (device) (\n rate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval])\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "pps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network drop rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 6 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_osd_stat_bytes{cluster=~\"$cluster\", } and\n on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "pps" + } + }, + "fill": 1, + "fillGradient": 
0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 6 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network error rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [ ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Disk Performance Statistics", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) writes", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n 
rate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{cluster=~\"$cluster\", },\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. 
Each device is shown by device name, and corresponding OSD id", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 12 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*read/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) write", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", 
\"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Throughput by Disk", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 21 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by(instance, device) (label_replace(\n (rate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n 
clamp_min(rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001) or\n (rate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 21 + }, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n (rate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) / 10) or\n rate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) * 100\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "%Util", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + 
"options": "instance" + }, + "properties": [ + { + "id": "displayName", + "value": "Instance" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Slow Ops" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 15, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\", cluster=~\"$cluster\", }))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops per Host", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ 
], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "ceph_hosts", + "options": [ ], + "query": "label_values({__name__=~\"ceph_.+_metadata\", cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "([^.]*).*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Host Details", + "uid": "rtOg0AiWz", + "version": 0 +} diff --git a/assets/ceph/dashboards/hosts-overview.json b/assets/ceph/dashboards/hosts-overview.json new file mode 100644 index 0000000..adbf676 --- /dev/null +++ b/assets/ceph/dashboards/hosts-overview.json @@ -0,0 +1,892 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + 
"colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (hostname) (ceph_osd_metadata{cluster=~\"$cluster\", }))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSD Hosts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + 
"name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG CPU Busy", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" 
+ } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg ((\n (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n) / (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG RAM Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "IOPS Load at the device as reported by the OS on all OSD hosts", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + 
"show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum ((\n rate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n) + (\n rate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Physical IOPS", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Average Disk utilization for all OSD data devices (i.e. 
excludes journal/WAL)", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg (\n label_replace(\n (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or\n (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($osd_hosts).*\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG Disk Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total send/receive network load across all hosts in the ceph cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + 
"minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum (\n (\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Network Load", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "description": "Show the top 10 busiest hosts by cpu", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percent" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n )\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Busy - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Top 10 hosts by network load", + "fieldConfig": { + "defaults": { + "custom": { + 
"fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10, (sum by(instance) (\n(\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) +\n(\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) unless on (device, instance)\n label_replace((node_bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "osd_hosts", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "([^.]*).*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mon_hosts", + "options": [ ], + "query": "label_values(ceph_mon_metadata{cluster=~\"$cluster\", }, hostname)", + "refresh": 1, + "regex": "mon.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mds_hosts", + "options": [ ], + "query": "label_values(ceph_mds_inodes{hostname, cluster=~\"$cluster\", })", + "refresh": 1, + "regex": "mds.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": 
[ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_hosts", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{hostname, cluster=~\"$cluster\", })", + "refresh": 1, + "regex": "rgw.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Host Overview", + "uid": "y0KGL0iZz", + "version": 0 +} diff --git a/assets/ceph/dashboards/multi-cluster-overview.json b/assets/ceph/dashboards/multi-cluster-overview.json new file mode 100644 index 0000000..ba6d29c --- /dev/null +++ b/assets/ceph/dashboards/multi-cluster-overview.json @@ -0,0 +1,2073 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ ], + "targetBlank": false, + "title": "Browse Dashboards", + "tooltip": "", + "type": "dashboards", + "url": "" + } + ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + 
"showTitle": true, + "title": "Clusters", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Healthy" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-green", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 2 + }, + "id": 3, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "count(ceph_health_status==0) or vector(0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Healthy", + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "count(ceph_health_status==1)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Warning", + "refId": "B" + }, + { + "datasource": "$datasource", + "expr": 
"count(ceph_health_status==2)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Error", + "refId": "C" + } + ], + "title": "Status", + "transparent": false, + "type": "stat" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "semi-dark-green", + "index": 2, + "text": "Healthy" + }, + "1": { + "color": "semi-dark-yellow", + "index": 0, + "text": "Warning" + }, + "2": { + "color": "semi-dark-red", + "index": 1, + "text": "Error" + } + }, + "type": "value" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Capacity Used" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cluster" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "/d/edtb0oxdq/ceph-cluster?var-cluster=${__data.fields.Cluster}&${DS_PROMETHEUS:queryparam}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Alerts" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "match": null, + "result": { + "index": 0, + "text": "0" + } + }, + "type": "special" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 19, + "x": 5, + "y": 2 + }, + "id": 4, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + 
}, + "exemplar": false, + "expr": "ceph_health_status", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ceph_mgr_metadata", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ALERTS{alertstate=\"firing\", cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ceph_cluster_by_class_total_used_bytes", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Details", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "cluster", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Value #B": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "ceph_daemon": true, + "device_class": true, + "hostname": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "job 1": true, + "job 2": true, + "job 3": true, + "replica 1": true, + "replica 2": true, + "replica 3": true + }, + "indexByName": { + "Time 1": 8, + "Time 2": 13, + "Time 3": 21, + "Time 4": 7, + "Time 5": 22, + "Time 6": 23, + "Value #A": 1, + "Value #B": 20, + "Value 
#C": 3, + "Value #D": 6, + "__name__ 1": 9, + "__name__ 2": 14, + "__name__ 3": 24, + "ceph_daemon": 15, + "ceph_version": 2, + "cluster": 0, + "device_class": 25, + "hostname": 16, + "instance 1": 10, + "instance 2": 17, + "instance 3": 26, + "job 1": 11, + "job 2": 18, + "job 3": 27, + "replica 1": 12, + "replica 2": 19, + "replica 3": 28 + }, + "renameByName": { + "Value #A": "Status", + "Value #C": "Alerts", + "Value #D": "Capacity Used", + "ceph_version": "Version", + "cluster": "Cluster" + } + } + } + ], + "type": "table" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 5, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Overview", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 10 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_health_status{cluster=~\"$cluster\"}) or vector(0)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Cluster Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "", + 
"fieldConfig": { + "defaults": { + "links": [ ], + "mappings": [ ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 0.75 + }, + { + "color": "red", + "value": 0.84999999999999998 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 3, + "y": 10 + }, + "id": 7, + "interval": "1m", + "links": [ ], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "sum(ceph_cluster_total_used_bytes{cluster=~\"$cluster\"}) / sum(ceph_cluster_total_bytes{cluster=~\"$cluster\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + } + ], + "title": "Capacity Used", + "transparent": false, + "type": "gauge" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 7, + "y": 10 + }, + "id": 8, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(ceph_cluster_total_bytes{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + 
"range": true, + "refId": "A" + } + ], + "title": "Total Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 10, + "y": 10 + }, + "id": 9, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_osd_metadata{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "OSDs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 13, + "y": 10 + }, + "id": 10, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(sum by (hostname) (ceph_osd_metadata{cluster=~\"$cluster\"}))", + 
"format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Hosts", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 10 + }, + "id": 11, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_wr{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_rd{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Client IOPS", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 10 + }, + "id": 12, + "links": [ ], + "options": { + "colorMode": 
"none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "avg(ceph_osd_apply_latency_ms{cluster=~\"$cluster\"})", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Apply", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "avg(ceph_osd_commit_latency_ms{cluster=~\"$cluster\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Commit", + "range": true, + "refId": "B" + } + ], + "title": "OSD Latencies", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 14 + }, + "id": 13, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ALERTS{alertstate=\"firing\", cluster=~\"$cluster\"}) or vector(0)", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Alert 
Count", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 7, + "y": 14 + }, + "id": 14, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(ceph_cluster_total_used_bytes{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Used", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 10, + "y": 14 + }, + "id": 15, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "predict_linear(avg(increase(ceph_cluster_total_used_bytes{cluster=~\"${Cluster}\"}[1d]))[7d:1h],120)", + "format": 
"time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Capacity Prediction", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 13, + "y": 14 + }, + "id": 16, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "count(ceph_pool_metadata{cluster=~\"$cluster\"})", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pools", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 14 + }, + "id": 17, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": 
false, + "expr": "sum(irate(ceph_pool_rd_bytes{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_pool_wr_bytes{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B" + } + ], + "title": "Client Bandwidth", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 14 + }, + "id": 18, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(irate(ceph_osd_recovery_ops{cluster=~\"$cluster\"}[$__interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A" + } + ], + "title": "Recovery Rate", + "transparent": false, + "type": "stat" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 19, + "panels": [ + { + "colors": null, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "semi-dark-yellow", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 19 + }, + "id": 20, + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "$datasource", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"critical\", cluster=~\"$cluster\"}) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Critical", + "range": false, + "refId": "A" + }, + { + "datasource": "$datasource", + "expr": "count(ALERTS{alertstate=\"firing\",severity=\"warning\", cluster=~\"$cluster\"}) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Warning", + "range": false, + "refId": "B" + } + ], + "title": "Status", + "transparent": false, + "type": "stat" + }, + { + "columns": [ ], + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": 
"red", + "value": 80 + } + ] + } + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 19, + "x": 5, + "y": 19 + }, + "id": 21, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Severity" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "ALERTS{alertstate=\"firing\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "cluster", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "instance": true, + "job": true, + "oid": true, + "replica": true, + "type": true + }, + "indexByName": { + "Time": 0, + "Value": 9, + "__name__": 1, + "alertname": 2, + "alertstate": 4, + "cluster": 3, + "instance": 6, + "job": 7, + "severity": 5, + "type": 8 + }, + "renameByName": { + "alertname": "Name", + "alertstate": "State", + "cluster": "Cluster", + "severity": "Severity" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 22, + "limit": 10, + "onlyAlertsOnDashboard": true, + "options": { + "alertName": "", + "dashboardAlerts": false, + "groupBy": [ ], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + 
"show": "current", + "sortOrder": 1, + "stateFilter": [ ], + "title": "Alerts(Grouped)", + "type": "alertlist" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 23, + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 30 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(5, ceph_cluster_total_used_bytes/ceph_cluster_total_bytes)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 5 - Capacity Utilization(%)", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 30 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(10, sum by (cluster) (irate(ceph_osd_op_w[$__interval])) \n+ sum by (cluster) (irate(ceph_osd_op_r[$__interval])) )", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 5 - Cluster IOPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 30 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "topk(10, ceph_pool_bytes_used{cluster=~\"$cluster\", }/ceph_pool_max_avail{cluster=~\"$cluster\", } * on(pool_id, cluster) group_left(instance, name) ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{cluster}} - {{name}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Top 10 - Capacity Utilization(%) by Pool", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cluster Stats", + "titleSize": "h6", + "type": "row" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": 
"", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph - Multi-cluster", + "uid": "BnxelG7Sx", + "version": 0 +} diff --git a/assets/ceph/dashboards/osd-device-details.json b/assets/ceph/dashboards/osd-device-details.json new file mode 100644 index 0000000..60f1ecc --- /dev/null +++ b/assets/ceph/dashboards/osd-device-details.json @@ -0,0 +1,914 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": 
{ + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_latency_sum{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_latency_sum{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + 
"fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": 
false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Read Bytes", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_out_bytes{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Bytes", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_in_bytes{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Bytes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 6, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Physical Device Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + 
"custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(\n label_replace(\n rate(node_disk_read_time_seconds_total[$__rate_interval]) /\n rate(node_disk_reads_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Reads", + "refId": "A" + }, + { + "expr": "(\n label_replace(\n rate(node_disk_write_time_seconds_total[$__rate_interval]) /\n rate(node_disk_writes_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": 
"Physical Device Latency for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_writes_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Writes", + "refId": "A" + }, + { + "expr": "label_replace(\n 
rate(node_disk_reads_completed_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W IOPS for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 11 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"label_replace(\n rate(node_disk_read_bytes_total[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Reads", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(node_disk_written_bytes_total[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", },\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W Bytes for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": 
false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_io_time_seconds_total[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{ceph_daemon=~\"$osd\", cluster=~\"$cluster\", }, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device Util% for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + 
"label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "OSD", + "multi": false, + "name": "osd", + "options": [ ], + "query": "label_values(ceph_osd_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD device details", + "uid": "CrAHE0iZz", + "version": 0 +} diff --git a/assets/ceph/dashboards/osds-overview.json b/assets/ceph/dashboards/osds-overview.json new file mode 100644 index 0000000..948f0d7 --- /dev/null +++ b/assets/ceph/dashboards/osds-overview.json @@ -0,0 +1,1339 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "grafana-piechart-panel", + "name": "Pie Chart", + "type": "panel", + "version": "1.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + 
"editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { + "@95%ile": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG read", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX read", + "refId": "B" + }, + { + "expr": "quantile(0.95,\n (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n * 1000\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + 
"timeShift": null, + "title": "OSD Read Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency (ms)" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sort(\n (\n rate(ceph_osd_op_r_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n 
)\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest READ Latencies", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { + "@95%ile write": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval])\n * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG write", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX write", + "refId": "B" + }, + 
{ + "expr": "quantile(0.95, (\n rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000\n))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile write", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "OSD Write Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "This table shows the osd's that are delivering the 10 highest write latencies within the cluster", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency (ms)" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + 
"properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "NaN": { + "index": 0, + "text": "0.00" + } + }, + "type": "value" + } + ] + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 5, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (sort(\n (rate(ceph_osd_op_w_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n 1000)\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest WRITE Latencies", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "cluster": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + 
}, + "targets": [ + { + "expr": "count by (device_class) (ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device_class}}", + "refId": "A" + } + ], + "title": "OSD Types Summary", + "type": "piechart" + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 4, + "y": 8 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "count(ceph_bluefs_wal_total_bytes{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "bluestore", + "refId": "A" + }, + { + "expr": "absent(ceph_bluefs_wal_total_bytes{cluster=~\"$cluster\", }) * count(ceph_osd_metadata{cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "filestore", + "refId": "B" + } + ], + "title": "OSD Objectstore Types", + "type": "piechart" + }, + { + "datasource": "$datasource", + "description": "The pie chart shows the various OSD sizes used within the cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 8 + }, + "id": 8, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ 
+ "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } < 1099511627776)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<1TB", + "refId": "A" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 1099511627776 < 2199023255552)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<2TB", + "refId": "B" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 2199023255552 < 3298534883328)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<3TB", + "refId": "C" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 3298534883328 < 4398046511104)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<4TB", + "refId": "D" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 4398046511104 < 6597069766656)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<6TB", + "refId": "E" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 6597069766656 < 8796093022208)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<8TB", + "refId": "F" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 8796093022208 < 10995116277760)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<10TB", + "refId": "G" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 10995116277760 < 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB", + "refId": "H" + }, + { + "expr": "count(ceph_osd_stat_bytes{cluster=~\"$cluster\", } >= 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB+", + "refId": "I" + } + ], + "title": "OSD Size Summary", + "type": "piechart" + }, + { + "aliasColors": { }, + 
"bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 8 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_osd_numpg{cluster=~\"$cluster\", }", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "PGs per OSD", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Distribution of PGs per OSD", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": 20, + "mode": "histogram", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "# of OSDs", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, 
+ "thresholdMarkers": true + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 8 + }, + "id": 10, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_bluestore_onode_hits{cluster=~\"$cluster\", }) / (\n sum(ceph_bluestore_onode_hits{cluster=~\"$cluster\", }) +\n sum(ceph_bluestore_onode_misses{cluster=~\"$cluster\", })\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".75", + "title": "OSD onode Hits Ratio", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 11, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "R/W Profile", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show the read/write workload profile overtime", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": 
false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Read/Write Profile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "This table shows the 10 OSDs with the highest number of slow ops", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ceph_daemon" + }, + "properties": [ + { + "id": "displayName", + "value": "OSD ID" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + 
"value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Slow Ops" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 25 + }, + "id": 13, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "cluster": true, + "instance": true, + "job": true, + "type": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + 
"sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD Overview", + "uid": "lo02I1Aiz", + "version": 0 +} diff --git a/assets/ceph/dashboards/pool-detail.json b/assets/ceph/dashboards/pool-detail.json new file mode 100644 index 0000000..5e5bf6e --- /dev/null +++ b/assets/ceph/dashboards/pool-detail.json @@ -0,0 +1,724 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 
2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_stored{cluster=~\"$cluster\", } / (ceph_pool_stored{cluster=~\"$cluster\", } + ceph_pool_max_avail{cluster=~\"$cluster\", })) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".7,.8", + "title": "Capacity used", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": 100, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Time till pool is full assuming the average fill rate of the last 6 hours", + "format": "s", + "gauge": { + "maxValue": false, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 7, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + 
"lineColor": "rgb(31, 120, 193)", + "show": "" + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_max_avail{cluster=~\"$cluster\", } / deriv(ceph_pool_stored{cluster=~\"$cluster\", }[6h])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", } > 0\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "current", + "title": "Time till full", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": false + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "deriv(ceph_pool_objects{cluster=~\"$cluster\", }[1m]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Objects per second", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Object Ingress/Egress", + "tooltip": { + "shared": true, + "sort": 0, 
+ "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Objects out(-) / in(+) ", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "iops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + 
"timeFrom": null, + "timeShift": null, + "title": "$pool_name Client IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance, name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", 
}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Client Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_objects{cluster=~\"$cluster\", } *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{name=~\"$pool_name\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of Objects", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Objects", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Pool Name", + "multi": false, + "name": "pool_name", + "options": [ ], + "query": "label_values(ceph_pool_metadata{cluster=~\"$cluster\", }, name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pool Details", + "uid": "-xyV8KCiz", + "version": 0 +} diff --git 
a/assets/ceph/dashboards/pool-overview.json b/assets/ceph/dashboards/pool-overview.json new file mode 100644 index 0000000..fa32b33 --- /dev/null +++ b/assets/ceph/dashboards/pool-overview.json @@ -0,0 +1,1691 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{cluster=~\"$cluster\", })", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], 
+ "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Count of the pools that have compression enabled", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{compression_mode!=\"none\", cluster=~\"$cluster\", })", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools with Compression", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total raw capacity available to the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + 
"mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_osd_stat_bytes{cluster=~\"$cluster\", })", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Total Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total raw capacity consumed by user data and associated overheads (metadata + redundancy)", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + 
"tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_bytes_used{cluster=~\"$cluster\", })", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity Consumed", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Total of client data stored in the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_stored{cluster=~\"$cluster\", })", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Logical Stored ", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + 
"#d44a3a" + ], + "datasource": "$datasource", + "description": "A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } -\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", }\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Savings", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + 
"y": 0 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(\n sum(ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n sum(ceph_pool_stored_raw{cluster=~\"$cluster\", } and ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0)\n) * 100\n", + "format": "table", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Eligibility", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "description": "This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. 
It does not account for data written that was ineligible for compression (too small, or compression yield too low)", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 9, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0)\n / sum(ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Factor", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + 
"properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool Name" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pool_id" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool ID" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Factor" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #D" + }, + "properties": [ + { + "id": "displayName", + "value": "% Used" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #B" + }, + "properties": [ + { + "id": "displayName", + "value": "Usable Free" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #C" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Eligibility" + }, + { + "id": "unit", + "value": "percent" + 
}, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #E" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Savings" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #F" + }, + "properties": [ + { + "id": "displayName", + "value": "Growth (5d)" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #G" + }, + "properties": [ + { + "id": "displayName", + "value": "IOPS" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #H" + }, + "properties": [ + { + "id": "displayName", + "value": "Bandwidth" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "compression_mode" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "description" + }, + "properties": [ + { + "id": "displayName", + 
"value": "Type" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #J" + }, + "properties": [ + { + "id": "displayName", + "value": "Stored" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #I" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #K" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 10, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "(\n ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } /\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n) and on(pool_id) (\n (\n (ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n ceph_pool_stored_raw{cluster=~\"$cluster\", }\n ) * 100 > 0.5\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "A", + "refId": "A" + }, + { + "expr": "ceph_pool_max_avail{cluster=~\"$cluster\", } *\n on(pool_id) group_left(name) ceph_pool_metadata{cluster=~\"$cluster\", }\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "B", + "refId": "B" + }, + { + "expr": "(\n (ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } > 0) /\n ceph_pool_stored_raw{cluster=~\"$cluster\", }\n) * 100\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "C", + "refId": 
"C" + }, + { + "expr": "ceph_pool_percent_used{cluster=~\"$cluster\", } *\n on(pool_id) group_left(name) ceph_pool_metadata{cluster=~\"$cluster\", }\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "D", + "refId": "D" + }, + { + "expr": "ceph_pool_compress_under_bytes{cluster=~\"$cluster\", } -\n ceph_pool_compress_bytes_used{cluster=~\"$cluster\", } > 0\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "E", + "refId": "E" + }, + { + "expr": "delta(ceph_pool_stored{cluster=~\"$cluster\", }[5d])", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "F", + "refId": "F" + }, + { + "expr": "rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval])\n + rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "G", + "refId": "G" + }, + { + "expr": "rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "H", + "refId": "H" + }, + { + "expr": "ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "I", + "refId": "I" + }, + { + "expr": "ceph_pool_stored{cluster=~\"$cluster\", } * on(pool_id) group_left ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "J", + "refId": "J" + }, + { + "expr": "ceph_pool_metadata{compression_mode!=\"none\", cluster=~\"$cluster\", }", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "K", + "refId": "K" + }, + { + "expr": "", + "format": "", + "intervalFactor": "", + "legendFormat": "L", + "refId": "L" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Overview", + "transformations": [ + { + 
"id": "merge", + "options": { } + }, + { + "id": "seriesToRows", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value #A": true, + "Value #B": false, + "Value #C": true, + "Value #D": false, + "Value #E": true, + "Value #I": true, + "Value #K": true, + "__name__": true, + "cluster": true, + "compression_mode": true, + "instance": true, + "job": true, + "pool_id": true, + "type": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "This chart shows the sum of read and write IOPS from all clients by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n round(\n (\n rate(ceph_pool_rd{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval])\n ), 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{cluster=~\"$cluster\", })\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} ", + "refId": "A" + }, + { + "expr": "topk($topk,\n rate(ceph_pool_wr{cluster=~\"$cluster\", }[$__rate_interval]) +\n on(pool_id) group_left(instance,name) 
ceph_pool_metadata{cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} - write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client IOPS by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "IOPS", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "The chart shows the sum of read and write bytes from all clients, by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n (\n rate(ceph_pool_rd_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{cluster=~\"$cluster\", }\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": 
"A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client Bandwidth by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Throughput", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_bytes_used{cluster=~\"$cluster\", } * on(pool_id) group_right ceph_pool_metadata{cluster=~\"$cluster\", }", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Capacity Usage (RAW)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Capacity Used", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "15", + "value": "15" + }, + "hide": 0, + "includeAll": false, + "label": "TopK", + "multi": false, + "name": "topk", + "options": [ + { + "text": "15", + "value": "15" + } + ], + "query": "15", + "refresh": 0, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pools Overview", + "uid": "z99hzWtmk", + "version": 0 +} diff --git a/assets/ceph/dashboards/radosgw-detail.json b/assets/ceph/dashboards/radosgw-detail.json new file mode 100644 index 0000000..35de6b0 --- /dev/null +++ b/assets/ceph/dashboards/radosgw-detail.json @@ -0,0 +1,651 @@ +{ + "__inputs": [ ], + 
"__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "grafana-piechart-panel", + "name": "Pie Chart", + "type": "panel", + "version": "1.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Host Detail : $rgw_servers", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n 
rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$rgw_servers GET/PUT Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by HTTP Operation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + "GETs": "#7eb26d", + "Other": "#447ebc", + "PUTs": "#eab839", + "Requests": "#3f2b5b", + "Requests Failed": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 13, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + 
"sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests Failed {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) -\n (\n rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other {{ceph_daemon}}", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Breakdown", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [ ] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#bf1b00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "GETs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7eb26d", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Other (HEAD,POST,DELETE)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#447ebc", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PUTs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#eab839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#3f2b5b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "options": { + "displayLabels": [ ], + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": [ + "percent", + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": 
"rate(ceph_rgw_failed_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failures {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) -\n (\n rate(ceph_rgw_get{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_put{cluster=~\"$cluster\", }[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", + "refId": "D" + } + ], + "title": "Workload Breakdown", + "type": "piechart" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + 
"multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Instance Detail", + "uid": "x5ARzZtmk", + "version": 0 +} diff --git a/assets/ceph/dashboards/radosgw-overview.json b/assets/ceph/dashboards/radosgw-overview.json new file mode 100644 index 0000000..5e185b6 --- /dev/null +++ b/assets/ceph/dashboards/radosgw-overview.json @@ -0,0 +1,1336 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 
24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Overview - All Gateways", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{rgw_host}}", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{rgw_host}}", + "refId": "B" + } + ], + 
"thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average GET/PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "none" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 7, + "x": 8, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total Requests/sec by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_get_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_get_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GET Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Total bytes transferred in/out of all radosgw instances within the cluster", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth Consumed by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": 
null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 7, + "x": 8, + "y": 8 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_op_get_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rgw_op_put_obj_bytes{cluster=~\"$cluster\", }[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "s" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 15, + "y": 8 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_op_put_obj_lat_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rgw_op_put_obj_lat_count{cluster=~\"$cluster\", }[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{cluster=~\"$cluster\", },\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 12, + "w": 9, + "x": 0, + "y": 12 + }, + "id": 9, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Overview - HAProxy Metrics", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 0, + "y": 12 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Back.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*1.*/" + }, + { + "alias": "/.*2.*/" + }, + { + "alias": "/.*3.*/" + }, + { + "alias": "/.*4.*/" + }, + { + "alias": "/.*5.*/" + }, + { + "alias": "/.*other.*/" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", proxy=~\"frontend\"}[$__rate_interval]\n )\n) by (code)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Frontend {{ code }}", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_http_responses_total{code=~\"$code\", job=~\"$job_haproxy\", instance=~\"$ingress_service\", 
proxy=~\"backend\"}[$__rate_interval]\n )\n) by (code)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Backend {{ code }}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total responses by HTTP code", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 5, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Response.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Backend.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_http_requests_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests", + "refId": "A" + 
}, + { + "expr": "sum(\n rate(\n haproxy_backend_response_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Response errors", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_request_errors_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests errors", + "refId": "C" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_redispatch_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Backend redispatch", + "refId": "D" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_retry_warnings_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Backend retry", + "refId": "E" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_requests_denied_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request denied", + "refId": "F" + }, + { + "expr": "sum(\n haproxy_backend_current_queue{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Backend Queued", + "refId": "G" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total requests / responses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 5, + "x": 10, + "y": 12 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*Back.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_connections_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Front", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_connection_attempts_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Back", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_connection_errors_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n )\n) by (instance)\n", + "format": "time_series", + 
"intervalFactor": 1, + "legendFormat": "Back errors", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total number of connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 6, + "x": 15, + "y": 12 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + [ + { + "alias": "/.*OUT.*/", + "transform": "negative-Y" + } + ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(\n rate(\n haproxy_frontend_bytes_in_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "IN Front", + "refId": "A" + }, + { + "expr": "sum(\n rate(\n haproxy_frontend_bytes_out_total{proxy=~\"frontend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 
8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OUT Front", + "refId": "B" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_bytes_in_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "IN Back", + "refId": "C" + }, + { + "expr": "sum(\n rate(\n haproxy_backend_bytes_out_total{proxy=~\"backend\", job=~\"$job_haproxy\", instance=~\"$ingress_service\"}[$__rate_interval]\n ) * 8\n) by (instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OUT Back", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Current total of incoming / outgoing bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": 
"query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": ".*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "HTTP Code", + "multi": false, + "name": "code", + "options": [ ], + "query": "label_values(haproxy_server_http_responses_total{job=~\"$job_haproxy\", instance=~\"$ingress_service\"}, code)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job haproxy", + "multi": true, + "name": "job_haproxy", + "options": [ ], + "query": "label_values(haproxy_server_status, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Ingress Service", + "multi": false, + "name": "ingress_service", + "options": [ ], + "query": "label_values(haproxy_server_status{job=~\"$job_haproxy\"}, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + 
"2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Overview", + "uid": "WAkugZpiz", + "version": 0 +} diff --git a/assets/ceph/dashboards/radosgw-sync-overview.json b/assets/ceph/dashboards/radosgw-sync-overview.json new file mode 100644 index 0000000..a7550d2 --- /dev/null +++ b/assets/ceph/dashboards/radosgw-sync-overview.json @@ -0,0 +1,614 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (throughput) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (objects) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Polling Request Latency from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": 
{ }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors{cluster=~\"$cluster\", }[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Unsuccessful Object Replications from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Count/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 16, + "x": 8, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "$datasource", + "expr": "rate(ceph_rgw_sync_delta_sync_delta[$__rate_interval])", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance_id}} - {{shard_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Replication(Time) Delta per shard", + "type": "timeseries" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + 
{ + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 1, + "regex": "rgw.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Sync Overview", + "uid": "rgw-sync-overview", + "version": 0 +} diff --git a/assets/ceph/dashboards/rbd-details.json b/assets/ceph/dashboards/rbd-details.json new file mode 100644 index 0000000..500c51f --- /dev/null +++ b/assets/ceph/dashboards/rbd-details.json @@ -0,0 +1,465 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Detailed Performance of RBD Images (IOPS/Throughput/Latency)", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for 
information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "iops" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_ops{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_ops{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to 
https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_bytes{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_bytes{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by 
default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ns" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_latency_sum{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rbd_write_latency_count{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_latency_sum{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval]) /\n rate(ceph_rbd_read_latency_count{pool=\"$pool\", image=\"$image\", cluster=~\"$cluster\", }[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "pool", + "options": [ ], + "query": "label_values(ceph_rbd_read_ops{cluster=~\"$cluster\", }, pool)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "image", + "options": [ ], + "query": "label_values(ceph_rbd_read_ops{cluster=~\"$cluster\", , pool=\"$pool\"}, image)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": 
"", + "title": "RBD Details", + "uid": "YhCYGcuZz", + "version": 0 +} diff --git a/assets/ceph/dashboards/rbd-overview.json b/assets/ceph/dashboards/rbd-overview.json new file mode 100644 index 0000000..34666c6 --- /dev/null +++ b/assets/ceph/dashboards/rbd-overview.json @@ -0,0 +1,885 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.4.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_ops{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_ops{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_bytes{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_bytes{cluster=~\"$cluster\", }[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "ns" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + 
"seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(\n sum(rate(ceph_rbd_write_latency_sum{cluster=~\"$cluster\", }[$__rate_interval])) /\n sum(rate(ceph_rbd_write_latency_count{cluster=~\"$cluster\", }[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(\n sum(rate(ceph_rbd_read_latency_sum{cluster=~\"$cluster\", }[$__rate_interval])) /\n sum(rate(ceph_rbd_read_latency_count{cluster=~\"$cluster\", }[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": 
"decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "IOPS" + }, + { + "id": "unit", + "value": "iops" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n (\n sort((\n rate(ceph_rbd_write_ops{cluster=~\"$cluster\", }[$__rate_interval]) +\n on (image, pool, namespace) rate(ceph_rbd_read_ops{cluster=~\"$cluster\", }[$__rate_interval])\n ))\n )\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest IOPS", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Throughput" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 6, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n sort(\n sum(\n rate(ceph_rbd_read_bytes{cluster=~\"$cluster\", }[$__rate_interval]) +\n rate(ceph_rbd_write_bytes{cluster=~\"$cluster\", }[$__rate_interval])\n ) by (pool, image, namespace)\n )\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Throughput", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for 
information about how to enable those optionally.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "null", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "pool" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "image" + }, + "properties": [ + { + "id": "displayName", + "value": "Image" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Latency" + }, + { + "id": "unit", + "value": "ns" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 7, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "topk(10,\n sum(\n rate(ceph_rbd_write_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_write_latency_count{cluster=~\"$cluster\", }[$__rate_interval]), 1) +\n rate(ceph_rbd_read_latency_sum{cluster=~\"$cluster\", }[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_read_latency_count{cluster=~\"$cluster\", }[$__rate_interval]), 1)\n ) by (pool, image, namespace)\n)\n", + "format": "table", + "instant": true, 
+ "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Latency", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [ ] + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RBD Overview", + "uid": "41FrpeUiz", + "version": 0 +} diff --git a/assets/ceph/dashboards/rgw-s3-analytics.json b/assets/ceph/dashboards/rgw-s3-analytics.json new file mode 100644 index 0000000..397279f --- /dev/null +++ b/assets/ceph/dashboards/rgw-s3-analytics.json @@ -0,0 +1,4715 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": 
null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Overview", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total PUTs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 4, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum\n(ceph_rgw_op_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total GETs", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Objects", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" 
+ } + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum\n((sum by(instance_id)(ceph_rgw_op_put_obj_bytes) > 0) / (sum by(instance_id)(ceph_rgw_op_put_obj_ops) > 0) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average Object Size", + "transparent": false, + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 7, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "List Objects", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_buckets_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "List Buckets", + "range": true, + "refId": "B" + }, + { + 
"datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Objects", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "D" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "E" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_bucket_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Buckets", + "range": false, + "refId": "F" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": true, + "refId": "G" + } + ], + "title": "Total Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + 
"overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 4 + }, + "id": 8, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Objects", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": true, + "refId": "D" + } + ], + "title": "Total Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 4 + }, + "id": 9, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "List Object", + "range": false, + "refId": "A" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_list_buckets_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "List Bucket", + "range": true, + "refId": "B" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Put Object", + "range": false, + "refId": "C" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Get Object", + "range": false, + "refId": "D" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": 
"time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Object", + "range": false, + "refId": "E" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_del_bucket_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Delete Bucket", + "range": false, + "refId": "F" + }, + { + "datasource": "${datasource}", + "expr": "sum(ceph_rgw_op_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Copy Object", + "range": true, + "refId": "G" + } + ], + "title": "Total Latencies", + "type": "bargauge" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-text" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 10, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_bytes *\n on (instance_id) group_left (ceph_daemon) 
ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per Bucket by Bandwidth", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, 
+ "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": false, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #D": 4, + "Value #F": 5, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon", + "ceph_daemon 1": "Daemon" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + 
}, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 11, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "List Objects", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": 
"ceph_rgw_op_per_bucket_del_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_bucket_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency(ms) Per Bucket", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "Bucket", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + 
"ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "ceph_daemon 5": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 12, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_put_obj_bytes 
*\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Upload Objects", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_get_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Get Objects", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_del_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Delete Objects", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Copy Objects", + "range": false, + "refId": "D" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per User By Bandwidth", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": 
"groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + }, + "instance": { + "aggregations": [ ] + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #D": 4, + "Value #F": 5, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": 
"byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 13, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_list_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_put_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_get_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_del_obj_lat_sum *\n on (instance_id) group_left 
(ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "ceph_rgw_op_per_user_copy_obj_lat_sum *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", }", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency(ms) Per User", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "User", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "ceph_daemon 5": true, + 
"instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 14, + "panels": [ + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 29 + }, + "id": 15, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": 
"{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Bucket PUTs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 29 + }, + "id": 16, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Bucket GETs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 29 + }, + "id": 17, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": 
"time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Buckets PUTs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 29 + }, + "id": 18, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5,\n sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Buckets GETs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": 
"number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 37 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket PUTs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { 
+ "h": 8, + "w": 6, + "x": 6, + "y": 37 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket GETs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 37 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Copy by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 37 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + 
"pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Delete by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 45 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_get_obj_ops) *\n 
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket GETs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 45 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": 
false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket PUTs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 45 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket 
List by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 45 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Delete by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (bucket, ceph_daemon) ((ceph_rgw_op_per_bucket_copy_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{bucket}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Bucket Copy by Operations", + "type": "timeseries" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 28, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { 
+ "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_copy_obj_bytes *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (bucket, ceph_daemon) (ceph_rgw_op_per_bucket_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per Bucket by Operations", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "Bucket", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "Bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "bucket": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "__name__": true, + 
"__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "ceph_daemon 1": true, + "ceph_daemon 2": true, + "ceph_daemon 3": true, + "ceph_daemon 4": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "instance_id 1": true, + "instance_id 2": true, + "instance_id 3": true, + "instance_id 4": true, + "instance_id 5": true, + "instance_id 6": true, + "instance_id 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "bucket": 1, + "ceph_daemon": 0 + }, + "renameByName": { + "Bucket": "", + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "List", + "Value #D": "Delete", + "Value #E": "Copy", + "Value #F": "Copy", + "Value #G": "", + "bucket": "Bucket", + "ceph_daemon": "Daemon" + } + } + } + ], + "type": "table" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Buckets", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 29, + "panels": [ + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 62 + }, + "id": 30, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users PUTs By Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 62 + }, + "id": 31, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops ) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users GETs by Operations", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 62 + }, + "id": 
32, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users PUTs by Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 62 + }, + "id": 33, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ ] + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "topk(5, \n sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })\n)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": false, + "refId": "A" + } + ], + "title": "Top 5 Users GETs By Size", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 70 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User PUTs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": 
{ + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 70 + }, + "id": 35, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User GETs by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + 
"overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 70 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User Delete by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": 
"palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 70 + }, + "id": 37, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_bytes) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User COPY by Size", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 78 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + 
"lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_get_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User GETs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 78 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": 
"single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_put_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User PUTs by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 78 + }, + "id": 40, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) 
((ceph_rgw_op_per_user_list_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User List by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 78 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_del_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + 
"format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "User Delete by Operations", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "unit": "number" + }, + "properties": [ + { + "id": "color" + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 86 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (user, ceph_daemon) ((ceph_rgw_op_per_user_copy_obj_ops) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}} - {{user}}", + "range": true, + "refId": "A", + "step": 300 
+ } + ], + "title": "User Copy by Operations", + "type": "timeseries" + }, + { + "columns": [ ], + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "number" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 86 + }, + "id": 43, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "PUTs" + } + ] + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_put_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_get_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) 
(ceph_rgw_op_per_user_del_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_copy_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": false, + "expr": "sum by (user, ceph_daemon) (ceph_rgw_op_per_user_list_obj_ops *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~\"$rgw_servers\", cluster=~\"$cluster\", })", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Summary Per User By Operations", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "joinByField", + "options": { + "byField": "User", + "mode": "outer" + } + }, + { + "id": "groupBy", + "options": { + "fields": { + "User": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #A": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #B": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #C": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #D": { + "aggregations": [ ], + "operation": "groupby" + }, + "Value #F": { + "aggregations": [ ], + "operation": "groupby" + }, + "ceph_daemon": { + "aggregations": [ ], + "operation": 
"groupby" + }, + "user": { + "aggregations": [ ], + "operation": "groupby" + } + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #F": 6, + "ceph_daemon": 0, + "user": 1 + }, + "renameByName": { + "Value #A": "PUTs", + "Value #B": "GETs", + "Value #C": "LIST", + "Value #D": "DELETE", + "Value #F": "COPY", + "ceph_daemon": "Daemon", + "user": "User" + } + } + } + ], + "type": "table" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Users", + "titleSize": "h6", + "type": "row" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_health_status, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{cluster=~\"$cluster\", }, ceph_daemon)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "datasource": "$datasource", + "hide": 2, + "label": "filters", + "name": "Filters", + "type": "adhoc" + } + ] + }, + "time": { + "from": 
"now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW S3 Analytics", + "uid": "BnxelG7Sz", + "version": 0 +} diff --git a/assets/ceph/rules.yaml b/assets/ceph/rules.yaml index 793e45a..19765bd 100644 --- a/assets/ceph/rules.yaml +++ b/assets/ceph/rules.yaml @@ -1,26 +1 @@ -groups: -- name: ceph.rules - rules: - - expr: | - kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace) - record: cluster:ceph_node_down:join_kube - - expr: | - avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m]))))) - record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m -- name: telemeter.rules - rules: - - expr: | - count(ceph_osd_metadata{job="rook-ceph-mgr"}) - record: job:ceph_osd_metadata:count - - expr: | - count(kube_persistentvolume_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"}) - record: job:kube_pv:count - - expr: | - sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"}) - record: job:ceph_pools_iops:total - - expr: | - sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"}) - record: job:ceph_pools_iops_bytes:total - - 
expr: | - count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version)) - record: job:ceph_versions_running:count +null diff --git a/site/content/ceph/_index.md b/site/content/ceph/_index.md index b2f7995..1d8458e 100644 --- a/site/content/ceph/_index.md +++ b/site/content/ceph/_index.md @@ -9,7 +9,7 @@ A set of Prometheus alerts for Ceph. The scope of this project is to provide Ceph specific Prometheus rule files using Prometheus Mixins. {{< panel style="danger" >}} -Jsonnet source code is available at [github.com/ceph/ceph-mixins](https://github.com/ceph/ceph-mixins) +Jsonnet source code is available at [github.com/ceph/ceph](https://github.com/ceph/ceph/tree/master/monitoring/ceph-mixin) {{< /panel >}} ## Alerts @@ -18,163 +18,188 @@ Jsonnet source code is available at [github.com/ceph/ceph-mixins](https://github Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/alerts.yaml). {{< /panel >}} -### ceph-mgr-status +### cluster -##### CephMgrIsAbsent +### health + +### mon + +##### CephMonDownQuorumAtRisk {{< code lang="yaml" >}} -alert: CephMgrIsAbsent +alert: CephMonDownQuorumAtRisk annotations: - description: Ceph Manager has disappeared from Prometheus target discovery. - message: Storage metrics collector service not available anymore. - severity_level: critical - storage_type: ceph + description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=''%s''}) + / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority + of monitors (x {{ $min }}) to be active. Without quorum the cluster will become + inoperable, affecting all services and connected clients. 
The following monitors + are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} == 0) + on(cluster,ceph_daemon) + group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ + .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: Monitor quorum is at risk on cluster {{ $labels.cluster }} expr: | - label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "") -for: 5m -labels: - severity: critical -{{< /code >}} - -##### CephMgrIsMissingReplicas - -{{< code lang="yaml" >}} -alert: CephMgrIsMissingReplicas -annotations: - description: Ceph Manager is missing replicas. - message: Storage metrics collector service doesn't have required no of replicas. - severity_level: warning - storage_type: ceph -expr: | - sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1 -for: 5m -labels: - severity: warning -{{< /code >}} - -### ceph-mds-status - -##### CephMdsMissingReplicas - -{{< code lang="yaml" >}} -alert: CephMdsMissingReplicas -annotations: - description: Minimum required replicas for storage metadata service not available. - Might affect the working of storage cluster. - message: Insufficient replicas for storage metadata service. - severity_level: warning - storage_type: ceph -expr: | - sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2 -for: 5m -labels: - severity: warning -{{< /code >}} - -### quorum-alert.rules - -##### CephMonQuorumAtRisk - -{{< code lang="yaml" >}} -alert: CephMonQuorumAtRisk -annotations: - description: Storage cluster quorum is low. Contact Support. 
- message: Storage quorum at risk - severity_level: error - storage_type: ceph -expr: | - count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1) -for: 15m -labels: - severity: critical -{{< /code >}} - -##### CephMonQuorumLost - -{{< code lang="yaml" >}} -alert: CephMonQuorumLost -annotations: - description: Storage cluster quorum is lost. Contact Support. - message: Storage quorum is lost - severity_level: critical - storage_type: ceph -expr: | - count(kube_pod_status_phase{pod=~"rook-ceph-mon-.*", phase=~"Running|running"} == 1) by (namespace) < 2 -for: 5m -labels: - severity: critical -{{< /code >}} - -##### CephMonHighNumberOfLeaderChanges - -{{< code lang="yaml" >}} -alert: CephMonHighNumberOfLeaderChanges -annotations: - description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname - }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently. - message: Storage Cluster has seen many leader changes recently. - severity_level: warning - storage_type: ceph -expr: | - (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95 -for: 5m -labels: - severity: warning -{{< /code >}} - -### ceph-node-alert.rules - -##### CephNodeDown - -{{< code lang="yaml" >}} -alert: CephNodeDown -annotations: - description: Storage node {{ $labels.node }} went down. Please check the node immediately. 
- message: Storage node {{ $labels.node }} went down - severity_level: error - storage_type: ceph -expr: | - cluster:ceph_node_down:join_kube == 0 + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) ( + count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1) + ) + ) == 1 for: 30s labels: + oid: 1.3.6.1.4.1.50495.1.2.1.3.1 severity: critical + type: ceph_default {{< /code >}} -### osd-alert.rules - -##### CephOSDCriticallyFull +##### CephMonDown {{< code lang="yaml" >}} -alert: CephOSDCriticallyFull +alert: CephMonDown annotations: - description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class - type {{$labels.device_class}} has crossed 80% on host {{ $labels.hostname }}. - Immediately free up some space or add capacity of type {{$labels.device_class}}. - message: Back-end storage device is critically full. - severity_level: error - storage_type: ceph + description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=''%s''} == + 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down 1.0 + }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is + still intact, but the loss of an additional monitor will make your cluster inoperable. 
+ The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} + == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" + .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname + }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down + summary: One or more monitors down on cluster {{ $labels.cluster }} expr: | - (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80 -for: 40s + (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) +for: 30s labels: - severity: critical + severity: warning + type: ceph_default {{< /code >}} -##### CephOSDFlapping +##### CephMonDiskspaceCritical {{< code lang="yaml" >}} -alert: CephOSDFlapping +alert: CephMonDiskspaceCritical annotations: - description: Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times in last - 5 minutes. Please check the pod events or ceph status to find out the cause. - message: Ceph storage osd flapping. - severity_level: error - storage_type: ceph -expr: | - changes(ceph_osd_up[5m]) >= 10 -for: 0s + description: The free space available to a monitor's store is critically low. You + should increase the space available to the monitor(s). The default directory is + /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db + on the mon pod's worker node for Rook. Look for old, rotated versions of *.log + and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories + under /var/lib/rook and other directories on the same filesystem, often /var/log + and /var/tmp are culprits. 
Your monitor hosts are; {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} {{- end }} + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit + summary: Filesystem space on at least one monitor is critically low on cluster {{ + $labels.cluster }} +expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 +for: 1m labels: + oid: 1.3.6.1.4.1.50495.1.2.1.3.2 severity: critical + type: ceph_default +{{< /code >}} + +##### CephMonDiskspaceLow + +{{< code lang="yaml" >}} +alert: CephMonDiskspaceLow +annotations: + description: The space available to a monitor's store is approaching full (>70% + is the default). You should increase the space available to the monitor(s). The + default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, + and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look + for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. + Also check any other directories under /var/lib/rook and other directories on + the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts + are; {{- range query "ceph_mon_metadata"}} - {{ .Labels.hostname }} {{- end }} + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low + summary: Drive space on at least one monitor is approaching full on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephMonClockSkew + +{{< code lang="yaml" >}} +alert: CephMonClockSkew +annotations: + description: Ceph monitors rely on closely synchronized time to maintain quorum + and cluster consistency. This event indicates that the time on at least one mon + has drifted too far from the lead mon. Review cluster status with ceph -s. This + will show which monitors are affected. 
Check the time sync status on each monitor + host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony + daemon. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew + summary: Clock skew detected among monitors on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### osd + +##### CephOSDDownHigh + +{{< code lang="yaml" >}} +alert: CephOSDDownHigh +annotations: + description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=''%s''} + == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf + "count (ceph_osd_up{cluster=''%s''})" .Labels.cluster | query }}{{ . | first | + value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range + printf "(ceph_osd_up{cluster=''%s''} * on(cluster, ceph_daemon) group_left(hostname) + ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} + on {{ .Labels.hostname }} {{- end }}' + summary: More than 10% of OSDs are down on cluster {{ $labels.cluster }} +expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 + >= 10 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephOSDHostDown + +{{< code lang="yaml" >}} +alert: CephOSDHostDown +annotations: + description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=''%s''} + * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster + | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}' + summary: An OSD host is offline on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.8 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephOSDDown + +{{< code lang="yaml" >}} 
+alert: CephOSDDown +annotations: + description: '{{ $num := printf "count(ceph_osd_up{cluster=''%s''} == 0) " .Labels.cluster + | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end + }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if + eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=''%s''} + * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster + | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down + summary: An OSD has been marked down on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="OSD_DOWN"} == 1 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.2 + severity: warning + type: ceph_default {{< /code >}} ##### CephOSDNearFull @@ -182,359 +207,1346 @@ labels: {{< code lang="yaml" >}} alert: CephOSDNearFull annotations: - description: Utilization of storage device {{ $labels.ceph_daemon }} of device_class - type {{$labels.device_class}} has crossed 75% on host {{ $labels.hostname }}. - Immediately free up some space or add capacity of type {{$labels.device_class}}. - message: Back-end storage device is nearing full. - severity_level: warning - storage_type: ceph -expr: | - (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75 -for: 40s + description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health + detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to + the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull + summary: OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 +for: 5m labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.3 severity: warning + type: ceph_default {{< /code >}} -##### CephOSDDiskNotResponding +##### CephOSDFull {{< code lang="yaml" >}} -alert: CephOSDDiskNotResponding +alert: CephOSDFull annotations: - description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host - }}. - message: Disk not responding - severity_level: error - storage_type: ceph -expr: | - label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") -for: 15m -labels: - severity: critical -{{< /code >}} - -##### CephOSDDiskUnavailable - -{{< code lang="yaml" >}} -alert: CephOSDDiskUnavailable -annotations: - description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host - }}. - message: Disk not accessible - severity_level: error - storage_type: ceph -expr: | - label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)") + description: An OSD has reached the FULL threshold. Writes to pools that share the + affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify + the problem. To resolve, add capacity to the affected OSD's failure domain, restore + down/out OSDs, or delete unwanted data. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full + summary: OSD full, writes blocked on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="OSD_FULL"} > 0 for: 1m labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.6 severity: critical + type: ceph_default {{< /code >}} -##### CephOSDSlowOps +##### CephOSDBackfillFull {{< code lang="yaml" >}} -alert: CephOSDSlowOps +alert: CephOSDBackfillFull annotations: - description: '{{ $value }} Ceph OSD requests are taking too long to process. Please - check ceph status to find out the cause.' - message: OSD requests are taking too long to process. - severity_level: warning - storage_type: ceph -expr: | - ceph_healthcheck_slow_ops > 0 + description: An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance + operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify + the problem. To resolve, add capacity to the affected OSD's failure domain, restore + down/out OSDs, or delete unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull + summary: OSD(s) too full for backfill operations on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephOSDTooManyRepairs + +{{< code lang="yaml" >}} +alert: CephOSDTooManyRepairs +annotations: + description: Reads from an OSD have used a secondary PG to return data to the client, + indicating a potential failing drive. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs + summary: OSD reports a high number of read errors on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 for: 30s labels: severity: warning + type: ceph_default {{< /code >}} -##### CephDataRecoveryTakingTooLong +##### CephOSDTimeoutsPublicNetwork {{< code lang="yaml" >}} -alert: CephDataRecoveryTakingTooLong +alert: CephOSDTimeoutsPublicNetwork annotations: - description: Data recovery has been active for too long. Contact Support. - message: Data recovery is slow - severity_level: warning - storage_type: ceph -expr: | - ceph_pg_undersized > 0 -for: 2h + description: OSD heartbeats on the cluster's 'public' network (frontend) are running + slow. Investigate the network for latency or loss issues. Use 'ceph health detail' + to show the affected OSDs. + summary: Network issues delaying OSD heartbeats (public network) on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 +for: 1m labels: severity: warning + type: ceph_default {{< /code >}} -##### CephPGRepairTakingTooLong +##### CephOSDTimeoutsClusterNetwork {{< code lang="yaml" >}} -alert: CephPGRepairTakingTooLong +alert: CephOSDTimeoutsClusterNetwork annotations: - description: Self heal operations taking too long. Contact Support. - message: Self heal problems detected - severity_level: warning - storage_type: ceph + description: OSD heartbeats on the cluster's 'cluster' network (backend) are slow. + Investigate the network for latency issues on this subnet. Use 'ceph health detail' + to show the affected OSDs. 
+ summary: Network issues delaying OSD heartbeats (cluster network) on cluster {{ + $labels.cluster }} +expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephOSDInternalDiskSizeMismatch + +{{< code lang="yaml" >}} +alert: CephOSDInternalDiskSizeMismatch +annotations: + description: One or more OSDs have an internal inconsistency between metadata and + the size of the device. This could lead to the OSD(s) crashing in future. You + should redeploy the affected OSDs. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch + summary: OSD size inconsistency error on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephDeviceFailurePredicted + +{{< code lang="yaml" >}} +alert: CephDeviceFailurePredicted +annotations: + description: The device health module has determined that one or more devices will + fail soon. To review device status use 'ceph device ls'. To show a specific device + use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to + other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and + redeploy the OSD. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 + summary: Device(s) predicted to fail soon on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephDeviceFailurePredictionTooHigh + +{{< code lang="yaml" >}} +alert: CephDeviceFailurePredictionTooHigh +annotations: + description: The device health module has determined that devices predicted to fail + can not be remediated automatically, since too many OSDs would be removed from + the cluster to ensure performance and availability.
Prevent data integrity issues + by adding new OSDs so that data may be relocated. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany + summary: Too many devices are predicted to fail on cluster {{ $labels.cluster }}, + unable to resolve +expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.7 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephDeviceFailureRelocationIncomplete + +{{< code lang="yaml" >}} +alert: CephDeviceFailureRelocationIncomplete +annotations: + description: "The device health module has determined that one or more devices will + fail soon, but the normal process of relocating the data on the device to other + OSDs in the cluster is blocked. +Ensure that the cluster has available free space. + It may be necessary to add capacity to the cluster to allow data from the failing + device to successfully migrate, or to enable the balancer." + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use + summary: Device failure is predicted, but unable to relocate data on cluster {{ + $labels.cluster }} +expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephOSDFlapping + +{{< code lang="yaml" >}} +alert: CephOSDFlapping +annotations: + description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked + down and back up {{ $value | humanize }} times once a minute for 5 minutes. This + may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster + network, or the public network if no cluster network is deployed. Check the network + stats on the listed host(s). 
+ documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds + summary: Network issues are causing OSDs to flap (mark each other down) on cluster + {{ $labels.cluster }} +expr: (rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) + * 60 > 1 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.4 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephOSDReadErrors + +{{< code lang="yaml" >}} +alert: CephOSDReadErrors +annotations: + description: An OSD has encountered read errors, but the OSD has recovered by retrying + the reads. This may indicate an issue with hardware or the kernel. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors + summary: Device read errors detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 +for: 30s +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPGImbalance + +{{< code lang="yaml" >}} +alert: CephPGImbalance +annotations: + description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by + more than 30% from average PG count. + summary: PGs are not balanced across OSDs on cluster {{ $labels.cluster }} expr: | - ceph_pg_inconsistent > 0 + abs( + ((ceph_osd_numpg > 0) - on (cluster,job) group_left avg(ceph_osd_numpg > 0) by (cluster,job)) / + on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on (cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.4.5 + severity: warning + type: ceph_default +{{< /code >}} + +### mds + +##### CephFilesystemDamaged + +{{< code lang="yaml" >}} +alert: CephFilesystemDamaged +annotations: + description: Filesystem metadata has been corrupted. Data may be inaccessible. Analyze + metrics from the MDS daemon admin socket, or escalate to support. 
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem is damaged on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephFilesystemOffline + +{{< code lang="yaml" >}} +alert: CephFilesystemOffline +annotations: + description: All MDS ranks are unavailable. The MDS daemons managing metadata are + down, rendering the filesystem offline. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down + summary: CephFS filesystem is offline on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.3 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephFilesystemDegraded + +{{< code lang="yaml" >}} +alert: CephFilesystemDegraded +annotations: + description: One or more metadata daemons (MDS ranks) are failed or in a damaged + state. At best the filesystem is partially available, at worst the filesystem + is completely unusable. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded + summary: CephFS filesystem is degraded on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="FS_DEGRADED"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.4 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephFilesystemMDSRanksLow + +{{< code lang="yaml" >}} +alert: CephFilesystemMDSRanksLow +annotations: + description: The filesystem's 'max_mds' setting defines the number of MDS ranks + in the filesystem. The current number of active MDS daemons is less than this + value. 
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max + summary: Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephFilesystemInsufficientStandby + +{{< code lang="yaml" >}} +alert: CephFilesystemInsufficientStandby +annotations: + description: The current number of standby daemons is less than the minimum number + required by standby_count_wanted. Adjust the standby count or + increase the number of MDS daemons. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby + summary: Ceph filesystem standby daemons too few on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephFilesystemFailureNoStandby + +{{< code lang="yaml" >}} +alert: CephFilesystemFailureNoStandby +annotations: + description: An MDS daemon has failed, leaving only one active rank and no available + standby. Investigate the cause of the failure or add a standby MDS. + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds + summary: MDS daemon failed, no further standby available on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.5 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephFilesystemReadOnly + +{{< code lang="yaml" >}} +alert: CephFilesystemReadOnly +annotations: + description: The filesystem has switched to READ ONLY due to an unexpected error + when writing to the metadata pool. Either analyze the output from the MDS daemon + admin socket, or escalate to support.
+ documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages + summary: CephFS filesystem in read only mode due to write error(s) on cluster {{ + $labels.cluster }} +expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.5.2 + severity: critical + type: ceph_default +{{< /code >}} + +### mgr + +##### CephMgrModuleCrash + +{{< code lang="yaml" >}} +alert: CephMgrModuleCrash +annotations: + description: One or more mgr modules have crashed and have yet to be acknowledged + by an administrator. A crashed module may impact functionality within the cluster. + Use the 'ceph crash' command to determine which module has failed, and archive + it to acknowledge the failure. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash + summary: A manager module has recently crashed on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.6.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephMgrPrometheusModuleInactive + +{{< code lang="yaml" >}} +alert: CephMgrPrometheusModuleInactive +annotations: + description: The mgr/prometheus module at {{ $labels.instance }} is unreachable. + This could mean that the module has been disabled or the mgr daemon itself is + down. Without the mgr/prometheus module, metrics and alerts will no longer function. + Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine + whether the mgr is active. If the mgr is not active, restart it, otherwise you + can determine module status with 'ceph mgr module ls'. If it is not listed as + enabled, enable it with 'ceph mgr module enable prometheus'.
+ summary: The mgr/prometheus module is not available +expr: up{job="ceph"} == 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.6.2 + severity: critical + type: ceph_default +{{< /code >}} + +### pgs + +##### CephPGsInactive + +{{< code lang="yaml" >}} +alert: CephPGsInactive +annotations: + description: '{{ $value }} PGs have been inactive for more than 5 minutes in pool + {{ $labels.name }}. Inactive placement groups are not able to serve read/write + requests.' + summary: One or more placement groups are inactive on cluster {{ $labels.cluster + }} +expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total + - ceph_pg_active) > 0 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPGsUnclean + +{{< code lang="yaml" >}} +alert: CephPGsUnclean +annotations: + description: '{{ $value }} PGs have been unclean for more than 15 minutes in pool + {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.' + summary: One or more placement groups are marked unclean on cluster {{ $labels.cluster + }} +expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total + - ceph_pg_clean) > 0 +for: 15m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.2 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPGsDamaged + +{{< code lang="yaml" >}} +alert: CephPGsDamaged +annotations: + description: During data consistency checks (scrub), at least one PG has been flagged + as being damaged or inconsistent. Check to see which PG is affected, and attempt + a manual repair if necessary. To list problematic placement groups, use 'rados + list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' + command.
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged + summary: Placement group damaged, manual intervention needed on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.4 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPGRecoveryAtRisk + +{{< code lang="yaml" >}} +alert: CephPGRecoveryAtRisk +annotations: + description: Data redundancy is at risk since one or more OSDs are at or above the + 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or + delete unwanted data. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full + summary: OSDs are too full for recovery on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.5 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPGUnavailableBlockingIO + +{{< code lang="yaml" >}} +alert: CephPGUnavailableBlockingIO +annotations: + description: Data availability is reduced, impacting the cluster's ability to service + I/O. One or more placement groups (PGs) are in a state that blocks I/O. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability + summary: PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O +expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) + == 1 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.3 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPGBackfillAtRisk + +{{< code lang="yaml" >}} +alert: CephPGBackfillAtRisk +annotations: + description: Data redundancy may be at risk due to lack of free space within the + cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more + capacity, or delete unwanted data. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full + summary: Backfill operations are blocked due to lack of free space on cluster {{ + $labels.cluster }} +expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.7.6 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPGNotScrubbed + +{{< code lang="yaml" >}} +alert: CephPGNotScrubbed +annotations: + description: 'One or more PGs have not been scrubbed recently. Scrubs check metadata + integrity, protecting against bit-rot. They check that metadata is consistent + across data replicas. When PGs miss their scrub interval, it may indicate that + the scrub window is too small, or PGs were not in a ''clean'' state during the + scrub window. You can manually initiate a scrub with: ceph pg scrub ' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed + summary: Placement group(s) have not been scrubbed on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPGsHighPerOSD + +{{< code lang="yaml" >}} +alert: CephPGsHighPerOSD +annotations: + description: |- + The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). + Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools. 
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs + summary: Placement groups per OSD is too high on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPGNotDeepScrubbed + +{{< code lang="yaml" >}} +alert: CephPGNotDeepScrubbed +annotations: + description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect + against bit-rot. They compare data replicas to ensure consistency. When PGs miss + their deep scrub interval, it may indicate that the window is too small or PGs + were not in a 'clean' state during the deep-scrub window. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed + summary: Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### nodes + +##### CephNodeRootFilesystemFull + +{{< code lang="yaml" >}} +alert: CephNodeRootFilesystemFull +annotations: + description: 'Root volume is dangerously full: {{ $value | humanize }}% free.' + summary: Root filesystem is dangerously full +expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} + * 100 < 5 +for: 5m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephNodeNetworkPacketDrops + +{{< code lang="yaml" >}} +alert: CephNodeNetworkPacketDrops +annotations: + description: Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 + packets/s on interface {{ $labels.device }}. 
+ summary: One or more NICs reports packet drops +expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.2 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephNodeNetworkPacketErrors + +{{< code lang="yaml" >}} +alert: CephNodeNetworkPacketErrors +annotations: + description: Node {{ $labels.instance }} experiences packet errors > 0.01% or > + 10 packets/s on interface {{ $labels.device }}. + summary: One or more NICs reports packet errors on cluster {{ $labels.cluster }} +expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.3 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephNodeNetworkBondDegraded + +{{< code lang="yaml" >}} +alert: CephNodeNetworkBondDegraded +annotations: + description: Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}. 
+ summary: Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster + }} +expr: | + node_bonding_slaves - node_bonding_active != 0 +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephNodeDiskspaceWarning + +{{< code lang="yaml" >}} +alert: CephNodeDiskspaceWarning +annotations: + description: Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will + be full in less than 5 days based on the 48 hour trailing fill rate. + summary: Host filesystem free space is getting low on cluster {{ $labels.cluster + }} +expr: predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) + * on(cluster, instance) group_left(nodename) node_uname_info < 0 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.8.4 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephNodeInconsistentMTU + +{{< code lang="yaml" >}} +alert: CephNodeInconsistentMTU +annotations: + description: Node {{ $labels.instance }} has a different MTU size ({{ $value }}) + than the median of devices named {{ $labels.device }}. 
+ summary: MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster + }} +expr: node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max + by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min + by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} + > 0)) ) +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### pools + +##### CephPoolGrowthWarning + +{{< code lang="yaml" >}} +alert: CephPoolGrowthWarning +annotations: + description: Pool '{{ $labels.name }}' will be full in less than 5 days assuming + the average fill-up rate of the past 48 hours. + summary: Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster + }} +expr: (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, + instance) group_right() ceph_pool_metadata) >= 95 +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.9.2 + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPoolBackfillFull + +{{< code lang="yaml" >}} +alert: CephPoolBackfillFull +annotations: + description: A pool is approaching the near full threshold, which will prevent recovery/backfill + operations from completing. Consider adding more capacity. + summary: Free space in a pool is too low for recovery/backfill on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephPoolFull + +{{< code lang="yaml" >}} +alert: CephPoolFull +annotations: + description: A pool has reached its MAX quota, or OSDs supporting the pool have + reached the FULL threshold. 
Until this is resolved, writes to the pool will be + blocked. Pool Breakdown (top 5) {{- range printf "topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} + * on(cluster,pool_id) group_right ceph_pool_metadata))" .Labels.cluster | query + }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, + or add capacity to the cluster first then increase the pool's quota (e.g. ceph + osd pool set quota <pool_name> max_bytes <bytes>) + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full + summary: Pool is full - writes are blocked on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="POOL_FULL"} > 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.9.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephPoolNearFull + +{{< code lang="yaml" >}} +alert: CephPoolNearFull +annotations: + description: A pool has exceeded the warning (percent full) threshold, or OSDs supporting + the pool have reached the NEARFULL threshold. Writes may continue, but you are + at risk of the pool going read-only if more capacity isn't made available. Determine + the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase + the pool's quota, or add capacity to the cluster first then increase the pool's + quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure + that the balancer is active.
+ summary: One or more Ceph pools are nearly full on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### healthchecks + +##### CephSlowOps + +{{< code lang="yaml" >}} +alert: CephSlowOps +annotations: + description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time + exceeded)' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: OSD operations are slow to complete on cluster {{ $labels.cluster }} +expr: ceph_healthcheck_slow_ops > 0 +for: 30s +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### CephDaemonSlowOps + +{{< code lang="yaml" >}} +alert: CephDaemonSlowOps +annotations: + description: '{{ $labels.ceph_daemon }} operations are taking too long to process + (complaint time exceeded)' + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops + summary: '{{ $labels.ceph_daemon }} operations are slow to complete on cluster {{ + $labels.cluster }}' +expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0 +for: 30s +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### cephadm + +##### CephadmUpgradeFailed + +{{< code lang="yaml" >}} +alert: CephadmUpgradeFailed +annotations: + description: The cephadm cluster upgrade process has failed. The cluster remains + in an undetermined state. Please review the cephadm logs, to understand the nature + of the issue + summary: Ceph version upgrade has failed on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.11.2 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephadmDaemonFailed + +{{< code lang="yaml" >}} +alert: CephadmDaemonFailed +annotations: + description: A daemon managed by cephadm is no longer active. 
Determine which daemon + is down with 'ceph health detail'. You may start daemons with the 'ceph orch daemon + start <daemon_id>' command. + summary: A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.11.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephadmPaused + +{{< code lang="yaml" >}} +alert: CephadmPaused +annotations: + description: Cluster management has been paused manually. This will prevent the + orchestrator from service management and reconciliation. If this is not intentional, + resume cephadm operations with 'ceph orch resume' + documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused + summary: Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster + }} +expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +### hardware + +##### HardwareStorageError + +{{< code lang="yaml" >}} +alert: HardwareStorageError +annotations: + description: Some storage devices are in error. Check `ceph health detail`. + summary: Storage devices error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.1 + severity: critical + type: ceph_default +{{< /code >}} + +##### HardwareMemoryError + +{{< code lang="yaml" >}} +alert: HardwareMemoryError +annotations: + description: DIMM error(s) detected. Check `ceph health detail`. + summary: DIMM error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.2 + severity: critical + type: ceph_default +{{< /code >}} + +##### HardwareProcessorError + +{{< code lang="yaml" >}} +alert: HardwareProcessorError +annotations: + description: Processor error(s) detected.
Check `ceph health detail`. + summary: Processor error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.3 + severity: critical + type: ceph_default +{{< /code >}} + +##### HardwareNetworkError + +{{< code lang="yaml" >}} +alert: HardwareNetworkError +annotations: + description: Network error(s) detected. Check `ceph health detail`. + summary: Network error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.4 + severity: critical + type: ceph_default +{{< /code >}} + +##### HardwarePowerError + +{{< code lang="yaml" >}} +alert: HardwarePowerError +annotations: + description: Power supply error(s) detected. Check `ceph health detail`. + summary: Power supply error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.5 + severity: critical + type: ceph_default +{{< /code >}} + +##### HardwareFanError + +{{< code lang="yaml" >}} +alert: HardwareFanError +annotations: + description: Fan error(s) detected. Check `ceph health detail`. + summary: Fan error(s) detected on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.13.6 + severity: critical + type: ceph_default +{{< /code >}} + +### PrometheusServer + +##### PrometheusJobMissing + +{{< code lang="yaml" >}} +alert: PrometheusJobMissing +annotations: + description: The prometheus job that scrapes from Ceph is no longer defined, this + will effectively mean you'll have no metrics or alerts for the cluster. Please + review the job definitions in the prometheus.yml file of the prometheus instance. 
+ summary: The scrape job for Ceph is missing from Prometheus +expr: absent(up{job="ceph"}) +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.12.1 + severity: critical + type: ceph_default +{{< /code >}} + +### rados + +##### CephObjectMissing + +{{< code lang="yaml" >}} +alert: CephObjectMissing +annotations: + description: The latest version of a RADOS object can not be found, even though + all OSDs are up. I/O requests for this object from clients will block (hang). + Resolving this issue may require the object to be rolled back to a prior version + manually, and manually verified. + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound + summary: Object(s) marked UNFOUND on cluster {{ $labels.cluster }} +expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster) + (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) + == 1 +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.1 + severity: critical + type: ceph_default +{{< /code >}} + +### generic + +##### CephDaemonCrash + +{{< code lang="yaml" >}} +alert: CephDaemonCrash +annotations: + description: One or more daemons have crashed recently, and need to be acknowledged. + This notification ensures that software crashes do not go unseen. To acknowledge + a crash, use the 'ceph crash archive <id>' command.
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash + summary: One or more Ceph daemons have crashed, and are pending acknowledgement + on cluster {{ $labels.cluster }} +expr: ceph_health_detail{name="RECENT_CRASH"} == 1 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.1.2 + severity: critical + type: ceph_default +{{< /code >}} + +### rbdmirror + +##### CephRBDMirrorImagesPerDaemonHigh + +{{< code lang="yaml" >}} +alert: CephRBDMirrorImagesPerDaemonHigh +annotations: + description: Number of image replications per daemon is not supposed to go beyond + threshold 100 + summary: Number of image replications are now above 100 on cluster {{ $labels.cluster + }} +expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) + > 100 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.2 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephRBDMirrorImagesNotInSync + +{{< code lang="yaml" >}} +alert: CephRBDMirrorImagesNotInSync +annotations: + description: Both local and remote RBD mirror images should be in sync. + summary: Some of the RBD mirror images are not in sync with the remote counter parts + on cluster {{ $labels.cluster }} +expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, + image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk + by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) + != 0 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.3 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephRBDMirrorImagesNotInSyncVeryHigh + +{{< code lang="yaml" >}} +alert: CephRBDMirrorImagesNotInSyncVeryHigh +annotations: + description: More than 10% of the images have synchronization problems. 
+ summary: Number of unsynchronized images are very high on cluster {{ $labels.cluster + }} +expr: count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, + pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, + image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != + 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1) +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.4 + severity: critical + type: ceph_default +{{< /code >}} + +##### CephRBDMirrorImageTransferBandwidthHigh + +{{< code lang="yaml" >}} +alert: CephRBDMirrorImageTransferBandwidthHigh +annotations: + description: Detected a heavy increase in bandwidth for rbd replications (over 80%) + in the last 30 min. This might not be a problem, but it is good to review the + number of images being replicated simultaneously + summary: The replication network usage on cluster {{ $labels.cluster }} has been + increased over 80% in the last 30 minutes. Review the number of images being replicated. + This alert will be cleaned automatically after 30 minutes +expr: rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80 +for: 1m +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.10.5 + severity: warning + type: ceph_default +{{< /code >}} + +### nvmeof + +##### NVMeoFSubsystemNamespaceLimit + +{{< code lang="yaml" >}} +alert: NVMeoFSubsystemNamespaceLimit +annotations: + description: Subsystems have a max namespace limit defined at creation time. 
This + alert means that no more namespaces can be added to {{ $labels.nqn }} + summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces + on cluster {{ $labels.cluster }}' +expr: (count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFTooManyGateways + +{{< code lang="yaml" >}} +alert: NVMeoFTooManyGateways +annotations: + description: You may create many gateways, but 4 is the tested limit + summary: Max supported gateways exceeded on cluster {{ $labels.cluster }} +expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFMaxGatewayGroupSize + +{{< code lang="yaml" >}} +alert: NVMeoFMaxGatewayGroupSize +annotations: + description: You may create many gateways in a gateway group, but 4 is the tested + limit + summary: Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster + {{ $labels.cluster }} +expr: count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFSingleGatewayGroup + +{{< code lang="yaml" >}} +alert: NVMeoFSingleGatewayGroup +annotations: + description: Although a single member gateway group is valid, it should only be + used for test purposes + summary: The gateway group {{ $labels.group }} consists of a single gateway - HA + is not possible on cluster {{ $labels.cluster }} +expr: count(ceph_nvmeof_gateway_info) by(cluster,group) == 1 +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFHighGatewayCPU + +{{< code lang="yaml" >}} +alert: NVMeoFHighGatewayCPU +annotations: + description: Typically, high CPU may indicate degraded performance. 
Consider increasing + the number of reactor cores + summary: CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster {{ + $labels.cluster }} +expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") + > 80.00 +for: 10m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFGatewayOpenSecurity + +{{< code lang="yaml" >}} +alert: NVMeoFGatewayOpenSecurity +annotations: + description: It is good practice to ensure subsystems use host security to reduce + the risk of unexpected data loss + summary: Subsystem {{ $labels.nqn }} has been defined without host level security + on cluster {{ $labels.cluster }} +expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"} +for: 5m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFTooManySubsystems + +{{< code lang="yaml" >}} +alert: NVMeoFTooManySubsystems +annotations: + description: Although you may continue to create subsystems in {{ $labels.gateway_host + }}, the configuration may not be supported + summary: The number of subsystems defined to the gateway exceeds supported values + on cluster {{ $labels.cluster }} +expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) + > 16.00 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFVersionMismatch + +{{< code lang="yaml" >}} +alert: NVMeoFVersionMismatch +annotations: + description: This may indicate an issue with deployment. 
Check cephadm logs + summary: Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster + }} +expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > + 1 for: 1h labels: severity: warning + type: ceph_default {{< /code >}} -### persistent-volume-alert.rules - -##### PersistentVolumeUsageNearFull +##### NVMeoFHighClientCount {{< code lang="yaml" >}} -alert: PersistentVolumeUsageNearFull +alert: NVMeoFHighClientCount annotations: - description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. - Free up some space or expand the PVC. - message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion - or PVC expansion is required. - severity_level: warning - storage_type: ceph -expr: | - (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75 -for: 5s -labels: - severity: warning -{{< /code >}} - -##### PersistentVolumeUsageCritical - -{{< code lang="yaml" >}} -alert: PersistentVolumeUsageCritical -annotations: - description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. - Free up some space or expand the PVC immediately. - message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion - or PVC expansion is required. 
- severity_level: error - storage_type: ceph -expr: | - (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85 -for: 5s -labels: - severity: critical -{{< /code >}} - -### cluster-state-alert.rules - -##### CephClusterErrorState - -{{< code lang="yaml" >}} -alert: CephClusterErrorState -annotations: - description: Storage cluster is in error state for more than 10m. - message: Storage cluster is in error state - severity_level: error - storage_type: ceph -expr: | - ceph_health_status{job="rook-ceph-mgr"} > 1 -for: 10m -labels: - severity: critical -{{< /code >}} - -##### CephClusterWarningState - -{{< code lang="yaml" >}} -alert: CephClusterWarningState -annotations: - description: Storage cluster is in warning state for more than 10m. - message: Storage cluster is in degraded state - severity_level: warning - storage_type: ceph -expr: | - ceph_health_status{job="rook-ceph-mgr"} == 1 -for: 15m -labels: - severity: warning -{{< /code >}} - -##### CephOSDVersionMismatch - -{{< code lang="yaml" >}} -alert: CephOSDVersionMismatch -annotations: - description: There are {{ $value }} different versions of Ceph OSD components running. - message: There are multiple versions of storage services running. 
- severity_level: warning - storage_type: ceph -expr: | - count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (ceph_version, namespace) > 1 -for: 10m -labels: - severity: warning -{{< /code >}} - -##### CephMonVersionMismatch - -{{< code lang="yaml" >}} -alert: CephMonVersionMismatch -annotations: - description: There are {{ $value }} different versions of Ceph Mon components running. - message: There are multiple versions of storage services running. - severity_level: warning - storage_type: ceph -expr: | - count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1 -for: 10m -labels: - severity: warning -{{< /code >}} - -### cluster-utilization-alert.rules - -##### CephClusterNearFull - -{{< code lang="yaml" >}} -alert: CephClusterNearFull -annotations: - description: Storage cluster utilization has crossed 75% and will become read-only - at 85%. Free up some space or expand the storage cluster. - message: Storage cluster is nearing full. Data deletion or cluster expansion is - required. - severity_level: warning - storage_type: ceph -expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75 -for: 5s -labels: - severity: warning -{{< /code >}} - -##### CephClusterCriticallyFull - -{{< code lang="yaml" >}} -alert: CephClusterCriticallyFull -annotations: - description: Storage cluster utilization has crossed 80% and will become read-only - at 85%. Free up some space or expand the storage cluster immediately. - message: Storage cluster is critically full and needs immediate data deletion or - cluster expansion. - severity_level: error - storage_type: ceph -expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80 -for: 5s -labels: - severity: critical -{{< /code >}} - -##### CephClusterReadOnly - -{{< code lang="yaml" >}} -alert: CephClusterReadOnly -annotations: - description: Storage cluster utilization has crossed 85% and will become read-only - now. 
Free up some space or expand the storage cluster immediately. - message: Storage cluster is read-only now and needs immediate data deletion or cluster - expansion. - severity_level: error - storage_type: ceph -expr: | - ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85 -for: 0s -labels: - severity: critical -{{< /code >}} - -### pool-quota.rules - -##### CephPoolQuotaBytesNearExhaustion - -{{< code lang="yaml" >}} -alert: CephPoolQuotaBytesNearExhaustion -annotations: - description: Storage pool {{ $labels.name }} quota usage has crossed 70%. - message: Storage pool quota(bytes) is near exhaustion. - severity_level: warning - storage_type: ceph -expr: | - (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.70 + description: The supported limit for clients connecting to a subsystem is 32 + summary: The number of clients connected to {{ $labels.nqn }} is too high on cluster + {{ $labels.cluster }} +expr: ceph_nvmeof_subsystem_host_count > 32.00 for: 1m labels: severity: warning + type: ceph_default {{< /code >}} -##### CephPoolQuotaBytesCriticallyExhausted +##### NVMeoFHighHostCPU {{< code lang="yaml" >}} -alert: CephPoolQuotaBytesCriticallyExhausted +alert: NVMeoFHighHostCPU annotations: - description: Storage pool {{ $labels.name }} quota usage has crossed 90%. - message: Storage pool quota(bytes) is critically exhausted. 
- severity_level: critical - storage_type: ceph -expr: | - (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.90 -for: 1m + description: High CPU on a gateway host can lead to CPU contention and performance + degradation + summary: The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host + }}) on cluster {{ $labels.cluster }} +expr: 100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) + * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) + >= 80.00 +for: 10m labels: - severity: critical + severity: warning + type: ceph_default {{< /code >}} -## Recording rules - -{{< panel style="warning" >}} -Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/rules.yaml). 
-{{< /panel >}} - -### ceph.rules - -##### cluster:ceph_node_down:join_kube +##### NVMeoFInterfaceDown {{< code lang="yaml" >}} -expr: | - kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace) -record: cluster:ceph_node_down:join_kube +alert: NVMeoFInterfaceDown +annotations: + description: A NIC used by one or more subsystems is in a down state + summary: Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster + }} +expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"} +for: 30s +labels: + oid: 1.3.6.1.4.1.50495.1.2.1.14.1 + severity: warning + type: ceph_default {{< /code >}} -##### cluster:ceph_disk_latency:join_ceph_node_disk_irate1m +##### NVMeoFInterfaceDuplex {{< code lang="yaml" >}} -expr: | - avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m]))))) -record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m +alert: NVMeoFInterfaceDuplex +annotations: + description: Until this is resolved, performance from the gateway will be degraded + summary: Network interface {{ $labels.device }} is not running in full duplex mode + on cluster {{ $labels.cluster }} +expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"} +for: 30s +labels: + severity: warning + type: ceph_default {{< /code >}} -### telemeter.rules - -##### job:ceph_osd_metadata:count +##### NVMeoFHighReadLatency {{< code lang="yaml" >}} -expr: | - 
count(ceph_osd_metadata{job="rook-ceph-mgr"}) -record: job:ceph_osd_metadata:count +alert: NVMeoFHighReadLatency +annotations: + description: High latencies may indicate a constraint within the cluster e.g. CPU, + network. Please investigate + summary: The average read latency over the last 5 mins has reached 10 ms or more + on {{ $labels.gateway }} +expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) + / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") + > 0.01 +for: 5m +labels: + severity: warning + type: ceph_default {{< /code >}} -##### job:kube_pv:count +##### NVMeoFHighWriteLatency {{< code lang="yaml" >}} -expr: | - count(kube_persistentvolume_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"}) -record: job:kube_pv:count +alert: NVMeoFHighWriteLatency +annotations: + description: High latencies may indicate a constraint within the cluster e.g. CPU, + network. 
Please investigate + summary: The average write latency over the last 5 mins has reached 20 ms or more + on {{ $labels.gateway }} +expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) + / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") + > 0.02 +for: 5m +labels: + severity: warning + type: ceph_default {{< /code >}} -##### job:ceph_pools_iops:total +## Dashboards +Following dashboards are generated from mixins and hosted on github: -{{< code lang="yaml" >}} -expr: | - sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"}) -record: job:ceph_pools_iops:total -{{< /code >}} - -##### job:ceph_pools_iops_bytes:total -{{< code lang="yaml" >}} -expr: | - sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"}) -record: job:ceph_pools_iops_bytes:total -{{< /code >}} - -##### job:ceph_versions_running:count - -{{< code lang="yaml" >}} -expr: | - count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version)) -record: job:ceph_versions_running:count -{{< /code >}} - +- [ceph-cluster-advanced](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-cluster-advanced.json) +- [cephfs-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/cephfs-overview.json) +- [host-details](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/host-details.json) +- [hosts-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/hosts-overview.json) +- [multi-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/multi-cluster-overview.json) +- 
[osd-device-details](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/osd-device-details.json) +- [osds-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/osds-overview.json) +- [pool-detail](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/pool-detail.json) +- [pool-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/pool-overview.json) +- [radosgw-detail](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/radosgw-detail.json) +- [radosgw-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/radosgw-overview.json) +- [radosgw-sync-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/radosgw-sync-overview.json) +- [rbd-details](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/rbd-details.json) +- [rbd-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/rbd-overview.json) +- [rgw-s3-analytics](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/rgw-s3-analytics.json)