diff --git a/assets/kube-cockroachdb/alerts.yaml b/assets/kube-cockroachdb/alerts.yaml index dd88ef9..0b377f3 100644 --- a/assets/kube-cockroachdb/alerts.yaml +++ b/assets/kube-cockroachdb/alerts.yaml @@ -3,8 +3,9 @@ groups: rules: - alert: CockroachInstanceFlapping annotations: - message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted - {{ $value }} time(s) in 10m' + description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted + {{ $value }} time(s) in 10m.' + summary: CockroachDB instances have restarted in the last 10 minutes. expr: | resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5 for: 1m @@ -12,7 +13,8 @@ groups: severity: warning - alert: CockroachLivenessMismatch annotations: - message: Liveness mismatch for {{ $labels.instance }} + description: Liveness mismatch for {{ $labels.instance }} + summary: CockroachDB has liveness mismatches. expr: | (cockroachdb_liveness_livenodes{job="cockroachdb-public"}) != @@ -22,7 +24,8 @@ groups: severity: warning - alert: CockroachVersionMismatch annotations: - message: Cluster {{ $labels.cluster }} running {{ $value }} different versions + description: Cluster {{ $labels.cluster }} running {{ $value }} different versions + summary: CockroachDB cluster is running different versions. expr: | count by(cluster) (count_values by(tag, cluster) ("version", cockroachdb_build_timestamp{job="cockroachdb-public"})) > 1 for: 1h @@ -30,8 +33,9 @@ groups: severity: warning - alert: CockroachStoreDiskLow annotations: - message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value - }} available disk fraction + description: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ + $value }} available disk fraction + summary: CockroachDB is at low disk capacity. expr: | :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15 for: 30m @@ -39,7 +43,8 @@ groups: severity: critical - alert: CockroachClusterDiskLow annotations: - message: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction + description: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction + summary: CockroachDB cluster is at critically low disk capacity. expr: | cluster:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.2 for: 30m @@ -47,7 +52,8 @@ groups: severity: critical - alert: CockroachUnavailableRanges annotations: - message: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges + description: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges + summary: CockroachDB has unavailable ranges. expr: | (sum by(instance, cluster) (cockroachdb_ranges_unavailable{job="cockroachdb-public"})) > 0 for: 10m @@ -55,7 +61,9 @@ groups: severity: critical - alert: CockroachNoLeaseRanges annotations: - message: Instance {{ $labels.instance }} has {{ $value }} ranges without leases + description: Instance {{ $labels.instance }} has {{ $value }} ranges without + leases + summary: CockroachDB has ranges without leases. expr: | (sum by(instance, cluster) (cockroachdb_replicas_leaders_not_leaseholders{job="cockroachdb-public"})) > 0 for: 10m @@ -63,8 +71,9 @@ groups: severity: warning - alert: CockroachHighOpenFDCount annotations: - message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value + description: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used' + summary: CockroachDB has too many open file descriptors. expr: | cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8 for: 10m diff --git a/site/content/kube-cockroachdb/_index.md b/site/content/kube-cockroachdb/_index.md index 52dab92..4e2e4cb 100644 --- a/site/content/kube-cockroachdb/_index.md +++ b/site/content/kube-cockroachdb/_index.md @@ -23,8 +23,9 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni {{< code lang="yaml" >}} alert: CockroachInstanceFlapping annotations: - message: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ - $value }} time(s) in 10m' + description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted + {{ $value }} time(s) in 10m.' + summary: CockroachDB instances have restarted in the last 10 minutes. expr: | resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5 for: 1m @@ -37,7 +38,8 @@ labels: {{< code lang="yaml" >}} alert: CockroachLivenessMismatch annotations: - message: Liveness mismatch for {{ $labels.instance }} + description: Liveness mismatch for {{ $labels.instance }} + summary: CockroachDB has liveness mismatches. expr: | (cockroachdb_liveness_livenodes{job="cockroachdb-public"}) != @@ -52,7 +54,8 @@ labels: {{< code lang="yaml" >}} alert: CockroachVersionMismatch annotations: - message: Cluster {{ $labels.cluster }} running {{ $value }} different versions + description: Cluster {{ $labels.cluster }} running {{ $value }} different versions + summary: CockroachDB cluster is running different versions. expr: | count by(cluster) (count_values by(tag, cluster) ("version", cockroachdb_build_timestamp{job="cockroachdb-public"})) > 1 for: 1h @@ -65,8 +68,9 @@ labels: {{< code lang="yaml" >}} alert: CockroachStoreDiskLow annotations: - message: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} - available disk fraction + description: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value + }} available disk fraction + summary: CockroachDB is at low disk capacity. expr: | :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15 for: 30m @@ -79,7 +83,8 @@ labels: {{< code lang="yaml" >}} alert: CockroachClusterDiskLow annotations: - message: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction + description: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction + summary: CockroachDB cluster is at critically low disk capacity. expr: | cluster:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.2 for: 30m @@ -92,7 +97,8 @@ labels: {{< code lang="yaml" >}} alert: CockroachUnavailableRanges annotations: - message: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges + description: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges + summary: CockroachDB has unavailable ranges. expr: | (sum by(instance, cluster) (cockroachdb_ranges_unavailable{job="cockroachdb-public"})) > 0 for: 10m @@ -105,7 +111,8 @@ labels: {{< code lang="yaml" >}} alert: CockroachNoLeaseRanges annotations: - message: Instance {{ $labels.instance }} has {{ $value }} ranges without leases + description: Instance {{ $labels.instance }} has {{ $value }} ranges without leases + summary: CockroachDB has ranges without leases. expr: | (sum by(instance, cluster) (cockroachdb_replicas_leaders_not_leaseholders{job="cockroachdb-public"})) > 0 for: 10m @@ -118,8 +125,9 @@ labels: {{< code lang="yaml" >}} alert: CockroachHighOpenFDCount annotations: - message: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value }} - fraction used' + description: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value + }} fraction used' + summary: CockroachDB has too many open file descriptors. expr: | cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8 for: 10m