mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
96 lines
4.6 KiB
YAML
96 lines
4.6 KiB
YAML
|
# Alerting rule file: one rule group, named "aerospike"; its `rules` list holds the alert definitions that follow.
groups:
|
||
|
- name: aerospike
|
||
|
rules:
|
||
|
# Node memory pressure: used memory is derived as 100 minus the reported free
# percentage (with the `service` label aggregated away) and must stay at or
# above 80 for 5 minutes before the alert fires.
- alert: AerospikeNodeHighMemoryUsage
  expr: |
    100 - sum without (service) (aerospike_node_stats_system_free_mem_pct) >= 80
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: There is a limited amount of memory available for a node.
    description: '{{ printf "%.0f" $value }} percent of system memory used on node {{$labels.instance}} on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 80.'
|
||
|
# Namespace disk pressure: the expression converts the reported free-device
# percentage into a used percentage (100 - free, `service` label aggregated
# away) and fires once that stays at or above 80 for 5 minutes.
- alert: AerospikeNamespaceHighDiskUsage
  expr: |
    100 - sum without (service) (aerospike_namespace_device_free_pct) >= 80
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: There is a limited amount of disk space available for a node.
    # $value is the USED percentage (100 - free), so the text says "used"
    # rather than "available" to match what the number actually measures.
    description: '{{ printf "%.0f" $value }} percent of disk space used for namespace {{$labels.ns}} on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 80.'
|
||
|
# Fires when the unavailable-partition count (summed with the `service` label
# aggregated away) has been greater than zero for 5 minutes.
- alert: AerospikeUnavailablePartitions
  expr: |
    sum without(service) (aerospike_namespace_unavailable_partitions) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: There are unavailable partitions in the Aerospike cluster.
    description: '{{ printf "%.0f" $value }} unavailable partition(s) in namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 0.'
|
||
|
# Fires when the dead-partition count (summed with the `service` label
# aggregated away) has been greater than zero for 5 minutes.
- alert: AerospikeDeadPartitions
  expr: |
    sum without(service) (aerospike_namespace_dead_partitions) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: There are dead partitions in the Aerospike cluster.
    description: '{{ printf "%.0f" $value }} dead partition(s) in namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 0.'
|
||
|
# Fires when the sum of the stop_writes and clock_skew_stop_writes metrics
# (with the `service` label aggregated away) has been positive for 5 minutes.
- alert: AerospikeNamespaceRejectingWrites
  expr: |
    sum without(service) (aerospike_namespace_stop_writes + aerospike_namespace_clock_skew_stop_writes) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'A namespace is currently rejecting all writes. Check for unavailable/dead partitions, clock skew, or nodes running out of memory/disk.'
    description: 'Namespace {{$labels.ns}} on node {{$labels.instance}} on cluster {{$labels.aerospike_cluster}} is currently rejecting all client-originated writes.'
|
||
|
# Client read error percentage per series (the `service` label is aggregated away).
#
# The value is 100 * errors / max(errors + successes, 1), using 5m rates.
#   * The factor of 100 turns the ratio into a percentage so that it can be
#     compared with the threshold of 25 and rendered as "percent" in the
#     description. Without it the ratio is at most 1 and the alert could never
#     fire.
#   * clamp_min(..., 1) keeps the denominator at 1 or more, which avoids a
#     division by zero when there is no traffic and reduces the computed
#     percentage when the total rate is below 1 op/s.
- alert: AerospikeHighClientReadErrorRate
  expr: |
    100 * sum without(service) (rate(aerospike_namespace_client_read_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_read_error[5m])) + sum without(service) (rate(aerospike_namespace_client_read_success[5m])), 1)) > 25
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: There is a high rate of errors for client read transactions.
    description: '{{ printf "%.0f" $value }} percent of client read transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 25.'
|
||
|
# Client write error percentage per series (the `service` label is aggregated away).
#
# The value is 100 * errors / max(errors + successes, 1), using 5m rates.
#   * The factor of 100 turns the ratio into a percentage so that it can be
#     compared with the threshold of 25 and rendered as "percent" in the
#     description. Without it the ratio is at most 1 and the alert could never
#     fire.
#   * clamp_min(..., 1) keeps the denominator at 1 or more, which avoids a
#     division by zero when there is no traffic and reduces the computed
#     percentage when the total rate is below 1 op/s.
- alert: AerospikeHighClientWriteErrorRate
  expr: |
    100 * sum without(service) (rate(aerospike_namespace_client_write_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_write_error[5m])) + sum without(service) (rate(aerospike_namespace_client_write_success[5m])), 1)) > 25
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: There is a high rate of errors for client write transactions.
    description: '{{ printf "%.0f" $value }} percent of client write transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 25.'
|
||
|
# Client UDF error percentage per series (the `service` label is aggregated away).
#
# The value is 100 * errors / max(errors + completions, 1), using 5m rates.
#   * The factor of 100 turns the ratio into a percentage so that it can be
#     compared with the threshold of 25 and rendered as "percent" in the
#     description. Without it the ratio is at most 1 and the alert could never
#     fire.
#   * clamp_min(..., 1) keeps the denominator at 1 or more, which avoids a
#     division by zero when there is no traffic and reduces the computed
#     percentage when the total rate is below 1 op/s.
- alert: AerospikeHighClientUDFErrorRate
  expr: |
    100 * sum without(service) (rate(aerospike_namespace_client_udf_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_udf_error[5m])) + sum without(service) (rate(aerospike_namespace_client_udf_complete[5m])), 1)) > 25
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: There is a high rate of errors for client UDF transactions.
    description: '{{ printf "%.0f" $value }} percent of client UDF transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, which is above the threshold of 25.'
|