mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
b3b400137a
Add blackbox exporter Add mysql exporter
90 lines
4.1 KiB
YAML
90 lines
4.1 KiB
YAML
# Prometheus alerting rules for Apache Hadoop.
# A single rule group, "apache-hadoop", holds every alert listed under `rules`.
groups:
- name: "apache-hadoop"
  rules:
|
|
# Warning: remaining NameNode capacity, as a percentage of total capacity,
# has been below 20 for 5 minutes.
- alert: ApacheHadoopLowHDFSCapacity
  # clamp_min(..., 1) keeps the denominator at 1 or above, so a total of 0
  # cannot cause a division by zero. `min without(job, name)` drops the
  # job/name labels and keeps the lowest value per remaining label set.
  expr: |
    min without(job, name) (100 * hadoop_namenode_capacityremaining / clamp_min(hadoop_namenode_capacitytotal, 1)) < 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      Remaining HDFS cluster capacity is low which may result in DataNode
      failures or prevent DataNodes from writing data.
    description: >-
      {{ printf "%.0f" $value }} percent remaining HDFS usage on
      {{$labels.hadoop_cluster}} - {{$labels.instance}}, which is below the
      threshold of 20.
|
|
# Critical: the NameNode missing-blocks gauge has been above 0 for 5 minutes.
- alert: ApacheHadoopHDFSMissingBlocks
  # `max without(job, name)` drops the job/name labels and keeps the highest
  # value per remaining label set.
  expr: |
    max without(job, name) (hadoop_namenode_missingblocks) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      There are missing blocks in the HDFS cluster which may indicate
      potential data loss.
    description: >-
      {{ printf "%.0f" $value }} HDFS missing blocks on
      {{$labels.hadoop_cluster}} - {{$labels.instance}}, which is above the
      threshold of 0.
|
|
# Critical: the NameNode volume-failure total has been above 0 for 5 minutes.
- alert: ApacheHadoopHDFSHighVolumeFailures
  # `max without(job, name)` drops the job/name labels and keeps the highest
  # value per remaining label set.
  expr: |
    max without(job, name) (hadoop_namenode_volumefailurestotal) > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      A volume failure in HDFS cluster may indicate hardware failures.
    description: >-
      {{ printf "%.0f" $value }} HDFS volume failures on
      {{$labels.hadoop_cluster}} - {{$labels.instance}}, which is above the
      threshold of 0.
|
|
# Critical: the NameNode dead-DataNode count has been above 0 for 5 minutes.
- alert: ApacheHadoopHighDeadDataNodes
  annotations:
    # The description previously read "dead HDFS volume failures", carried over
    # from the volume-failure alert. The value reported here comes from
    # hadoop_namenode_numdeaddatanodes, so it counts dead DataNodes.
    description: '{{ printf "%.0f" $value }} dead DataNodes on {{$labels.hadoop_cluster}}
      - {{$labels.instance}}, which is above the threshold of 0.'
    summary: Number of dead DataNodes has increased, which could result in data
      loss and increased network activity.
  # `max without(job, name)` drops the job/name labels and keeps the highest
  # value per remaining label set.
  expr: |
    max without(job, name) (hadoop_namenode_numdeaddatanodes) > 0
  for: 5m
  labels:
    severity: critical
|
|
# Critical: NodeManager CPU utilization, scaled by 100, has been above 80
# for 5 minutes.
- alert: ApacheHadoopHighNodeManagerCPUUsage
  annotations:
    # "percent" added: the expression multiplies the utilization by 100 before
    # comparing with 80, and the sibling memory alerts already state the unit.
    description: '{{ printf "%.0f" $value }} percent CPU usage on {{$labels.hadoop_cluster}}
      - {{$labels.instance}}, which is above the threshold of 80.'
    summary: A NodeManager has a CPU usage higher than the configured threshold.
  # `max without(job, name)` drops the job/name labels and keeps the highest
  # value per remaining label set.
  expr: |
    max without(job, name) (100 * hadoop_nodemanager_nodecpuutilization) > 80
  for: 5m
  labels:
    severity: critical
|
|
# Critical: NodeManager allocated memory, as a percentage of allocated plus
# available memory, has been above 80 for 5 minutes.
- alert: ApacheHadoopHighNodeManagerMemoryUsage
  # clamp_min(..., 1) keeps the denominator at 1 or above, so a sum of 0
  # cannot cause a division by zero. `max without(job, name)` drops the
  # job/name labels and keeps the highest value per remaining label set.
  expr: |
    max without(job, name) (100 * hadoop_nodemanager_allocatedgb / clamp_min(hadoop_nodemanager_availablegb + hadoop_nodemanager_allocatedgb,1)) > 80
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      A NodeManager has a higher memory utilization than the configured
      threshold.
    description: >-
      {{ printf "%.0f" $value}} percent NodeManager memory usage on
      {{$labels.hadoop_cluster}} - {{$labels.instance}}, which is above the
      threshold of 80.
|
|
# Critical: ResourceManager allocated vcores, as a percentage of allocated
# plus available vcores, has been above 80 for 5 minutes.
- alert: ApacheHadoopHighResourceManagerVirtualCoreCPUUsage
  annotations:
    # "percent" added: the expression yields a 0-100 ratio compared with 80,
    # and the sibling memory alerts already state the unit.
    description: '{{ printf "%.0f" $value }} percent virtual core CPU usage on {{$labels.hadoop_cluster}}
      - {{$labels.instance}}, which is above the threshold of 80.'
    summary: A ResourceManager has a virtual core CPU usage higher than the configured
      threshold.
  # clamp_min(..., 1) keeps the denominator at 1 or above, so a sum of 0
  # cannot cause a division by zero. `max without(job, name)` drops the
  # job/name labels and keeps the highest value per remaining label set.
  expr: |
    max without(job, name) (100 * hadoop_resourcemanager_allocatedvcores / clamp_min(hadoop_resourcemanager_availablevcores + hadoop_resourcemanager_allocatedvcores,1)) > 80
  for: 5m
  labels:
    severity: critical
|
|
# Critical: ResourceManager allocated memory, as a percentage of allocated
# plus available memory, has been above 80 for 5 minutes.
- alert: ApacheHadoopHighResourceManagerMemoryUsage
  # clamp_min(..., 1) keeps the denominator at 1 or above, so a sum of 0
  # cannot cause a division by zero. `max without(job, name)` drops the
  # job/name labels and keeps the highest value per remaining label set.
  expr: |
    max without(job, name) (100 * hadoop_resourcemanager_allocatedmb / clamp_min(hadoop_resourcemanager_availablemb + hadoop_resourcemanager_allocatedmb,1)) > 80
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: >-
      A ResourceManager has a higher memory utilization than the configured
      threshold.
    description: >-
      {{ printf "%.0f" $value}} percent ResourceManager memory usage on
      {{$labels.hadoop_cluster}} - {{$labels.instance}}, which is above the
      threshold of 80.
|