diff --git a/assets/ceph/alerts.yaml b/assets/ceph/alerts.yaml index 16b942f..a9511b8 100644 --- a/assets/ceph/alerts.yaml +++ b/assets/ceph/alerts.yaml @@ -1051,8 +1051,20 @@ groups: }}, the configuration may not be supported summary: The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }} - expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) - > 16.00 + expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) + > 128.00 + for: 1m + labels: + severity: warning + type: ceph_default + - alert: NVMeoFTooManyNamespaces + annotations: + description: Although you may continue to create namespaces in {{ $labels.gateway_host + }}, the configuration may not be supported + summary: The number of namespaces defined to the gateway exceeds supported values + on cluster {{ $labels.cluster }} + expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) + > 1024.00 for: 1m labels: severity: warning diff --git a/assets/prometheus/alerts.yaml b/assets/prometheus/alerts.yaml index 40463db..d5cb945 100644 --- a/assets/prometheus/alerts.yaml +++ b/assets/prometheus/alerts.yaml @@ -52,10 +52,10 @@ groups: severity: warning - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus - {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a - specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus {{$labels.instance}} + to Alertmanager {{$labels.alertmanager}} were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. expr: | ( rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) diff --git a/site/content/ceph/_index.md b/site/content/ceph/_index.md index fa50875..550f969 100644 --- a/site/content/ceph/_index.md +++ b/site/content/ceph/_index.md @@ -1407,8 +1407,25 @@ annotations: }}, the configuration may not be supported summary: The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }} -expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) - > 16.00 +expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) + > 128.00 +for: 1m +labels: + severity: warning + type: ceph_default +{{< /code >}} + +##### NVMeoFTooManyNamespaces + +{{< code lang="yaml" >}} +alert: NVMeoFTooManyNamespaces +annotations: + description: Although you may continue to create namespaces in {{ $labels.gateway_host + }}, the configuration may not be supported + summary: The number of namespaces defined to the gateway exceeds supported values + on cluster {{ $labels.cluster }} +expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) + > 1024.00 for: 1m labels: severity: warning diff --git a/site/content/prometheus/_index.md b/site/content/prometheus/_index.md index 00d41fb..ccb7621 100644 --- a/site/content/prometheus/_index.md +++ b/site/content/prometheus/_index.md @@ -89,17 +89,17 @@ labels: {{< /code >}} ##### PrometheusErrorSendingAlertsToSomeAlertmanagers -'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus +'{{ printf "%.1f" $value }}% of alerts sent by Prometheus {{$labels.instance}} -Prometheus has encountered more than 1% errors sending alerts to a specific +More than 1% of alerts sent by Prometheus to a specific Alertmanager were {{< code lang="yaml" >}} alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus - {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific - Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus {{$labels.instance}} + to Alertmanager {{$labels.alertmanager}} were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager were + affected by errors. expr: | ( rate(prometheus_notifications_errors_total{job="prometheus"}[5m])