Mirror of https://github.com/monitoring-mixins/website.git, synced 2024-12-14 11:37:31 +00:00
assets,site/content: daily assets regeneration

parent 5328885dcc
commit b4693f5dbf

16 changed files with 26327 additions and 3 deletions
1123  assets/docker/dashboards/docker.json  (new file; diff suppressed because it is too large)
assets/jvm/alerts.yaml
@@ -1 +1,25 @@
-null
+groups:
+- name: jvm-jvm-alerts
+  rules:
+  - alert: JvmMemoryFillingUp
+    annotations:
+      description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
+        last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
+      summary: JVM heap memory filling up.
+    expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
+      (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
+    for: 5m
+    keep_firing_for: 5m
+    labels:
+      severity: warning
+  - alert: JvmThreadsDeadlocked
+    annotations:
+      description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
+        are in a cyclic dependency with each other. The restart is required to resolve
+        the deadlock.'
+      summary: JVM deadlock detected.
+    expr: (jvm_threads_deadlocked{job!=""}) > 0
+    for: 2m
+    keep_firing_for: 5m
+    labels:
+      severity: critical
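A regenerated rule file like the one above can be exercised before it ships. Below is a minimal promtool unit-test sketch for the JvmMemoryFillingUp rule; the test file name and the job and instance values are hypothetical, and it assumes the rules were saved locally as assets/jvm/alerts.yaml.

{{< code lang="yaml" >}}
# jvm-alerts-test.yaml (hypothetical) -- run with: promtool test rules jvm-alerts-test.yaml
rule_files:
  - assets/jvm/alerts.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # 900 MiB used of a 1 GiB heap, i.e. about 88%, held for the whole window.
      - series: 'jvm_memory_used_bytes{area="heap", job="demo", instance="app:8080"}'
        values: '943718400x10'
      - series: 'jvm_memory_max_bytes{area="heap", job="demo", instance="app:8080"}'
        values: '1073741824x10'
    alert_rule_test:
      - eval_time: 6m  # past the 5m "for" clause, so the alert should be firing
        alertname: JvmMemoryFillingUp
        exp_alerts:
          - exp_labels:
              severity: warning
              area: heap
              job: demo
              instance: app:8080
            exp_annotations:
              summary: JVM heap memory filling up.
              description: JVM heap memory usage is at 88% over the last 5 minutes
                on app:8080, which is above the threshold of 80%.
{{< /code >}}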
2317  assets/jvm/dashboards/jvm-dashboard.json  (new file; diff suppressed because it is too large)
assets/kafka/alerts.yaml
@@ -1 +1,176 @@
-null
+groups:
+- name: kafka-kafka-alerts
+  rules:
+  - alert: KafkaLagKeepsIncreasing
+    annotations:
+      description: 'Kafka lag keeps increasing over the last 15 minutes for consumer
+        group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
+      summary: Kafka lag keeps increasing.
+    expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
+      > 0
+    for: 15m
+    keep_firing_for: 10m
+    labels:
+      severity: warning
+  - alert: KafkaLagIsTooHigh
+    annotations:
+      description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
+        $value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
+      summary: Kafka lag is too high.
+    expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
+      > 100
+    for: 15m
+    keep_firing_for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaISRExpandRate
+    annotations:
+      description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
+        }} In-Sync Replica (ISR) is expanding by {{ $value }} per second. If a broker
+        goes down, ISR for some of the partitions shrink. When that broker is up again,
+        ISRs are expanded once the replicas are fully caught up. Other than that,
+        the expected value for ISR expansion rate is 0. If ISR is expanding and shrinking
+        frequently, adjust Allowed replica lag.
+      summary: Kafka ISR expansion rate is expanding.
+    expr: |
+      sum by (job,kafka_cluster,instance) (sum by (job,kafka_cluster,instance) (kafka_server_replicamanager_isrexpandspersec{job="integrations/kafka"})) != 0
+    for: 5m
+    keep_firing_for: 15m
+    labels:
+      severity: warning
+  - alert: KafkaISRShrinkRate
+    annotations:
+      description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
+        }} In-Sync Replica (ISR) is shrinking by {{ $value }} per second. If a broker
+        goes down, ISR for some of the partitions shrink. When that broker is up again,
+        ISRs are expanded once the replicas are fully caught up. Other than that,
+        the expected value for ISR shrink rate is 0. If ISR is expanding and shrinking
+        frequently, adjust Allowed replica lag.
+      summary: Kafka ISR shrink rate is shrinking.
+    expr: |
+      sum by (job,kafka_cluster,instance) (sum by (job,kafka_cluster,instance) (kafka_server_replicamanager_isrshrinkspersec{job="integrations/kafka"})) != 0
+    for: 5m
+    keep_firing_for: 15m
+    labels:
+      severity: warning
+  - alert: KafkaOfflinePartitonCount
+    annotations:
+      description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} offline
+        partitions. After successful leader election, if the leader for partition
+        dies, then the partition moves to the OfflinePartition state. Offline partitions
+        are not available for reading and writing. Restart the brokers, if needed,
+        and check the logs for errors.
+      summary: Kafka has offline partitions.
+    expr: |
+      sum by (job,kafka_cluster) (kafka_controller_kafkacontroller_offlinepartitionscount{job="integrations/kafka"}) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaUnderReplicatedPartitionCount
+    annotations:
+      description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
+        }} has {{ $value }} under-replicated partitions.
+      summary: Kafka has under-replicated partitions.
+    expr: |
+      sum by (job,kafka_cluster,instance) (kafka_cluster_partition_underreplicated{job="integrations/kafka"}) > 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaNoActiveController
+    annotations:
+      description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} broker(s)
+        reporting as the active controller in the last 5 minute interval. During steady
+        state there should be only one active controller per cluster.
+      summary: Kafka has no active controller.
+    expr: sum by(job,kafka_cluster) (kafka_controller_kafkacontroller_activecontrollercount{job="integrations/kafka"})
+      != 1
+    for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaUncleanLeaderElection
+    annotations:
+      description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} unclean
+        partition leader elections reported in the last 5 minute interval. When unclean
+        leader election is held among out-of-sync replicas, there is a possibility
+        of data loss if any messages were not synced prior to the loss of the former
+        leader. So if the number of unclean elections is greater than 0, investigate
+        broker logs to determine why leaders were re-elected, and look for WARN or
+        ERROR messages. Consider setting the broker configuration parameter unclean.leader.election.enable
+        to false so that a replica outside of the set of in-sync replicas is never
+        elected leader.
+      summary: Kafka has unclean leader elections.
+    expr: (sum by (job,kafka_cluster,instance) (kafka_controller_controllerstats_uncleanleaderelectionspersec{job="integrations/kafka"}))
+      != 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaBrokerCount
+    annotations:
+      description: Kafka cluster {{ $labels.kafka_cluster }} broker count is 0.
+      summary: Kafka has no brokers online.
+    expr: count by(job,kafka_cluster) (kafka_server_kafkaserver_brokerstate{job="integrations/kafka"})
+      == 0
+    for: 5m
+    labels:
+      severity: critical
+  - alert: KafkaZookeeperSyncConnect
+    annotations:
+      description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
+        }} has disconnected from Zookeeper.
+      summary: Kafka Zookeeper sync disconnected.
+    expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
+      quantile="0.95"}[5m])) < 0
+    for: 5m
+    labels:
+      severity: critical
+- name: kafka-jvm-alerts
+  rules:
+  - alert: JvmMemoryFillingUp
+    annotations:
+      description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
+        last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
+      summary: JVM heap memory filling up.
+    expr: ((sum without (id) (jvm_memory_bytes_used{area="heap", job="integrations/kafka"}))/(sum
+      without (id) (jvm_memory_bytes_max{area="heap", job="integrations/kafka"} !=
+      -1))) * 100 > 80
+    for: 5m
+    keep_firing_for: 5m
+    labels:
+      severity: warning
+  - alert: JvmThreadsDeadlocked
+    annotations:
+      description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
+        are in a cyclic dependency with each other. The restart is required to resolve
+        the deadlock.'
+      summary: JVM deadlock detected.
+    expr: (jvm_threads_deadlocked{job="integrations/kafka"}) > 0
+    for: 2m
+    keep_firing_for: 5m
+    labels:
+      severity: critical
+- name: kafka-zookeeper-jvm-alerts
+  rules:
+  - alert: JvmMemoryFillingUp
+    annotations:
+      description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
+        last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
+      summary: JVM heap memory filling up.
+    expr: ((sum without (id) (jvm_memory_bytes_used{area="heap", job=~"integrations/kafka-zookeeper|integrations/kafka"}))/(sum
+      without (id) (jvm_memory_bytes_max{area="heap", job=~"integrations/kafka-zookeeper|integrations/kafka"}
+      != -1))) * 100 > 80
+    for: 5m
+    keep_firing_for: 5m
+    labels:
+      severity: warning
+  - alert: JvmThreadsDeadlocked
+    annotations:
+      description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
+        are in a cyclic dependency with each other. The restart is required to resolve
+        the deadlock.'
+      summary: JVM deadlock detected.
+    expr: (jvm_threads_deadlocked{job=~"integrations/kafka-zookeeper|integrations/kafka"})
+      > 0
+    for: 2m
+    keep_firing_for: 5m
+    labels:
+      severity: critical
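The severity labels attached above (warning and critical) are what Alertmanager routes on. A minimal routing sketch, assuming receivers named chat and pager are defined elsewhere in the same configuration:

{{< code lang="yaml" >}}
# alertmanager.yml fragment (a sketch; receiver names are hypothetical)
route:
  receiver: chat              # default route: warnings and anything unmatched
  routes:
    - receiver: pager         # page only for critical Kafka alerts
      matchers:
        - severity = critical
        - alertname =~ "Kafka.*"
{{< /code >}}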
6265  assets/kafka/dashboards/connect-overview.json  (new file; diff suppressed because it is too large)
4419  assets/kafka/dashboards/kafka-ksqldb-overview.json  (new file; diff suppressed because it is too large)
3759  assets/kafka/dashboards/kafka-overview-dashboard.json  (new file; diff suppressed because it is too large)
827   assets/kafka/dashboards/kafka-topic-dashboard.json  (new file)
assets/kafka/dashboards/kafka-topic-dashboard.json
@@ -0,0 +1,827 @@
{
  "links": [
    {
      "asDropdown": false,
      "includeVars": true,
      "keepTime": true,
      "tags": [
        "kafka-integration"
      ],
      "title": "All Kafka dashboards",
      "type": "dashboards"
    }
  ],
  "panels": [
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 0,
        "x": 0,
        "y": 0
      },
      "panels": [ ],
      "title": "Topics",
      "type": "row"
    },
    {
      "datasource": {
        "type": "datasource",
        "uid": "-- Mixed --"
      },
      "fieldConfig": {
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Topic start offset"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "none"
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Topic end offset"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "none"
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Messages in per second"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "mps"
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Topic log size"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "decbytes"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 1
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Topic start offset"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_logendoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Topic end offset"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,topic,partition) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Messages in per second"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "max by (job,kafka_cluster,topic,partition) (\n kafka_log_log_size{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Topic log size"
        }
      ],
      "title": "Topic overview",
      "transformations": [
        {
          "id": "timeSeriesTable"
        },
        {
          "id": "merge"
        },
        {
          "id": "renameByRegex",
          "options": {
            "regex": "Trend #(.*)",
            "renamePattern": "$1"
          }
        },
        {
          "id": "filterByValue",
          "options": {
            "filters": [
              {
                "config": {
                  "id": "isNull",
                  "options": { }
                },
                "fieldName": "Messages in per second"
              }
            ],
            "match": "all",
            "type": "exclude"
          }
        }
      ],
      "type": "table"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Messages in per second.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Messages in per second"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "mps"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 9
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_topic_partition_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Messages in per second"
        }
      ],
      "title": "Messages in per second",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Topic bytes in rate.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          },
          "decimals": 1,
          "unit": "bps"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Topic bytes in"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "Bps"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 6,
        "w": 12,
        "x": 0,
        "y": 15
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesinpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Topic bytes in"
        }
      ],
      "title": "Topic bytes in",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Topic bytes out rate.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          },
          "decimals": 1,
          "unit": "bps"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Topic bytes out"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "Bps"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 6,
        "w": 12,
        "x": 12,
        "y": 15
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,topic) (\n rate(kafka_server_brokertopicmetrics_bytesoutpersec{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ topic }}",
          "refId": "Topic bytes out"
        }
      ],
      "title": "Topic bytes out",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 0,
        "x": 0,
        "y": 23
      },
      "panels": [ ],
      "title": "Consumer groups",
      "type": "row"
    },
    {
      "datasource": {
        "type": "datasource",
        "uid": "-- Mixed --"
      },
      "fieldConfig": {
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Consumer group consume rate"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "mps"
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Consumer group lag"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "short"
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Consumer group lag in ms"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "ms"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 24
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group consume rate"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group lag"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group lag in ms"
        }
      ],
      "title": "Consumer group overview",
      "transformations": [
        {
          "id": "timeSeriesTable"
        },
        {
          "id": "merge"
        },
        {
          "id": "renameByRegex",
          "options": {
            "regex": "Trend #(.*)",
            "renamePattern": "$1"
          }
        },
        {
          "id": "filterByValue",
          "options": {
            "filters": [
              {
                "config": {
                  "id": "isNotNull",
                  "options": { }
                },
                "fieldName": "Consumer group consume rate"
              }
            ],
            "match": "all",
            "type": "include"
          }
        }
      ],
      "type": "table"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Consumer group consume rate.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Consumer group consume rate"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "mps"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 0,
        "y": 32
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n rate(kafka_consumergroup_current_offset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}[$__rate_interval])\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group consume rate"
        }
      ],
      "title": "Consumer group consume rate",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Current approximate lag of a ConsumerGroup at Topic/Partition.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Consumer group lag"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "short"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 8,
        "y": 32
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "sum by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group lag"
        }
      ],
      "title": "Consumer group lag",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "description": "Current approximate lag of a ConsumerGroup at Topic/Partition.",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "lineInterpolation": "smooth",
            "lineWidth": 2,
            "showPoints": "never"
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byFrameRefID",
              "options": "Consumer group lag in ms"
            },
            "properties": [
              {
                "id": "mappings",
                "value": [ ]
              },
              {
                "id": "unit",
                "value": "ms"
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 16,
        "y": 32
      },
      "options": {
        "legend": {
          "calcs": [ ],
          "displayMode": "list"
        },
        "tooltip": {
          "mode": "multi",
          "sort": "desc"
        }
      },
      "pluginVersion": "v11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "expr": "avg by (job,kafka_cluster,consumergroup,topic) (\n kafka_consumer_lag_millis{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\",consumergroup=~\"$consumergroup\"}\n)",
          "format": "time_series",
          "instant": false,
          "legendFormat": "{{ consumergroup }} ({{ topic }})",
          "refId": "Consumer group lag in ms"
        }
      ],
      "title": "Consumer group lag in ms",
      "type": "timeseries"
    }
  ],
  "schemaVersion": 39,
  "tags": [
    "kafka-integration"
  ],
  "templating": {
    "list": [
      {
        "label": "Data source",
        "name": "datasource",
        "query": "prometheus",
        "regex": "",
        "type": "datasource"
      },
      {
        "allValue": ".+",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "includeAll": true,
        "label": "Job",
        "multi": true,
        "name": "job",
        "query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\"}, job)",
        "refresh": 2,
        "sort": 1,
        "type": "query"
      },
      {
        "allValue": ".+",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "includeAll": true,
        "label": "Kafka_cluster",
        "multi": true,
        "name": "kafka_cluster",
        "query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\"}, kafka_cluster)",
        "refresh": 2,
        "sort": 1,
        "type": "query"
      },
      {
        "allValue": ".+",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "includeAll": true,
        "label": "Topic",
        "multi": true,
        "name": "topic",
        "query": "label_values(kafka_log_log_logstartoffset{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\"}, topic)",
        "refresh": 2,
        "sort": 1,
        "type": "query"
      },
      {
        "allValue": ".+",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "includeAll": true,
        "label": "Consumergroup",
        "multi": true,
        "name": "consumergroup",
        "query": "label_values(kafka_consumergroup_uncommitted_offsets{job=\"integrations/kafka\",topic!=\"__consumer_offsets\",consumergroup!=\"\",job=~\"$job\",kafka_cluster=~\"$kafka_cluster\",topic=~\"$topic\"}, consumergroup)",
        "refresh": 2,
        "sort": 1,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timezone": "utc",
  "title": "Kafka topic overview",
  "uid": "kafka-kafka-topic-dashboard"
}
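Dashboards under assets/ are plain Grafana JSON, so a checkout of this repository can be loaded directly with Grafana's file-based provisioning. A minimal sketch with a hypothetical path; note the dashboard above selects its Prometheus connection through its own datasource template variable, so nothing needs to be hard-coded here:

{{< code lang="yaml" >}}
# /etc/grafana/provisioning/dashboards/kafka-mixin.yaml (a sketch)
apiVersion: 1
providers:
  - name: kafka-mixin
    type: file
    allowUiUpdates: false
    options:
      # Point this at a copy of assets/kafka/dashboards from this repository.
      path: /var/lib/grafana/dashboards/kafka
{{< /code >}}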
1147  assets/kafka/dashboards/schema-registry-overview.json  (new file; diff suppressed because it is too large)
2974  assets/kafka/dashboards/zookeeper-overview.json  (new file; diff suppressed because it is too large)
assets/spring-boot/alerts.yaml
@@ -1 +1,14 @@
-null
+groups:
+- name: jvm-micrometer-jvm-alerts
+  rules:
+  - alert: JvmMemoryFillingUp
+    annotations:
+      description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the
+        last 5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
+      summary: JVM heap memory filling up.
+    expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
+      (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
+    for: 5m
+    keep_firing_for: 5m
+    labels:
+      severity: warning
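The jvm-micrometer rules above match any non-empty job label, so they cover whatever scrape job exposes Micrometer's jvm_memory_used_bytes. A minimal Prometheus scrape sketch for a Spring Boot service (the target address is hypothetical; the app must expose the Micrometer Prometheus registry at /actuator/prometheus):

{{< code lang="yaml" >}}
# prometheus.yml fragment (a sketch)
scrape_configs:
  - job_name: spring-boot
    metrics_path: /actuator/prometheus  # Micrometer's Prometheus endpoint
    static_configs:
      - targets: ['app.example.com:8080']
{{< /code >}}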
2941  assets/spring-boot/dashboards/jvm-dashboard.json  (new file; diff suppressed because it is too large)
@@ -10,3 +10,8 @@ A set of Grafana dashboards for Docker (based on cadvisor).

Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/docker-mixin)
{{< /panel >}}

## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:

- [docker](https://github.com/monitoring-mixins/website/blob/master/assets/docker/dashboards/docker.json)
@@ -10,3 +10,48 @@ title: jvm

Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jvm-mixin)
{{< /panel >}}

## Alerts

{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/alerts.yaml).
{{< /panel >}}

### jvm-jvm-alerts

##### JvmMemoryFillingUp

{{< code lang="yaml" >}}
alert: JvmMemoryFillingUp
annotations:
  description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
    5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
  summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
  (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
  severity: warning
{{< /code >}}

##### JvmThreadsDeadlocked

{{< code lang="yaml" >}}
alert: JvmThreadsDeadlocked
annotations:
  description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
    are in a cyclic dependency with each other. The restart is required to resolve
    the deadlock.'
  summary: JVM deadlock detected.
expr: (jvm_threads_deadlocked{job!=""}) > 0
for: 2m
keep_firing_for: 5m
labels:
  severity: critical
{{< /code >}}

## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:

- [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/jvm/dashboards/jvm-dashboard.json)
@@ -10,3 +10,264 @@ title: kafka

Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/kafka-mixin)
{{< /panel >}}

## Alerts

{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/alerts.yaml).
{{< /panel >}}

### kafka-kafka-alerts

##### KafkaLagKeepsIncreasing

{{< code lang="yaml" >}}
alert: KafkaLagKeepsIncreasing
annotations:
  description: 'Kafka lag keeps increasing over the last 15 minutes for consumer group:
    {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
  summary: Kafka lag keeps increasing.
expr: sum by (job,kafka_cluster, topic, consumergroup) (delta(kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""}[5m]))
  > 0
for: 15m
keep_firing_for: 10m
labels:
  severity: warning
{{< /code >}}

##### KafkaLagIsTooHigh

{{< code lang="yaml" >}}
alert: KafkaLagIsTooHigh
annotations:
  description: 'Total kafka lag across all partitions is too high ({{ printf "%.0f"
    $value }}) for consumer group: {{$labels.consumergroup}}, topic: {{$labels.topic}}.'
  summary: Kafka lag is too high.
expr: sum by (job,kafka_cluster, topic, consumergroup) (kafka_consumergroup_uncommitted_offsets{job="integrations/kafka",topic!="__consumer_offsets",consumergroup!=""})
  > 100
for: 15m
keep_firing_for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaISRExpandRate

{{< code lang="yaml" >}}
alert: KafkaISRExpandRate
annotations:
  description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
    }} In-Sync Replica (ISR) is expanding by {{ $value }} per second. If a broker
    goes down, ISR for some of the partitions shrink. When that broker is up again,
    ISRs are expanded once the replicas are fully caught up. Other than that, the
    expected value for ISR expansion rate is 0. If ISR is expanding and shrinking
    frequently, adjust Allowed replica lag.
  summary: Kafka ISR expansion rate is expanding.
expr: |
  sum by (job,kafka_cluster,instance) (sum by (job,kafka_cluster,instance) (kafka_server_replicamanager_isrexpandspersec{job="integrations/kafka"})) != 0
for: 5m
keep_firing_for: 15m
labels:
  severity: warning
{{< /code >}}

##### KafkaISRShrinkRate

{{< code lang="yaml" >}}
alert: KafkaISRShrinkRate
annotations:
  description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
    }} In-Sync Replica (ISR) is shrinking by {{ $value }} per second. If a broker
    goes down, ISR for some of the partitions shrink. When that broker is up again,
    ISRs are expanded once the replicas are fully caught up. Other than that, the
    expected value for ISR shrink rate is 0. If ISR is expanding and shrinking frequently,
    adjust Allowed replica lag.
  summary: Kafka ISR shrink rate is shrinking.
expr: |
  sum by (job,kafka_cluster,instance) (sum by (job,kafka_cluster,instance) (kafka_server_replicamanager_isrshrinkspersec{job="integrations/kafka"})) != 0
for: 5m
keep_firing_for: 15m
labels:
  severity: warning
{{< /code >}}

##### KafkaOfflinePartitonCount

{{< code lang="yaml" >}}
alert: KafkaOfflinePartitonCount
annotations:
  description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} offline
    partitions. After successful leader election, if the leader for partition dies,
    then the partition moves to the OfflinePartition state. Offline partitions are
    not available for reading and writing. Restart the brokers, if needed, and check
    the logs for errors.
  summary: Kafka has offline partitions.
expr: |
  sum by (job,kafka_cluster) (kafka_controller_kafkacontroller_offlinepartitionscount{job="integrations/kafka"}) > 0
for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaUnderReplicatedPartitionCount

{{< code lang="yaml" >}}
alert: KafkaUnderReplicatedPartitionCount
annotations:
  description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
    }} has {{ $value }} under-replicated partitions.
  summary: Kafka has under-replicated partitions.
expr: |
  sum by (job,kafka_cluster,instance) (kafka_cluster_partition_underreplicated{job="integrations/kafka"}) > 0
for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaNoActiveController

{{< code lang="yaml" >}}
alert: KafkaNoActiveController
annotations:
  description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} broker(s)
    reporting as the active controller in the last 5 minute interval. During steady
    state there should be only one active controller per cluster.
  summary: Kafka has no active controller.
expr: sum by(job,kafka_cluster) (kafka_controller_kafkacontroller_activecontrollercount{job="integrations/kafka"})
  != 1
for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaUncleanLeaderElection

{{< code lang="yaml" >}}
alert: KafkaUncleanLeaderElection
annotations:
  description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} unclean
    partition leader elections reported in the last 5 minute interval. When unclean
    leader election is held among out-of-sync replicas, there is a possibility of
    data loss if any messages were not synced prior to the loss of the former leader.
    So if the number of unclean elections is greater than 0, investigate broker logs
    to determine why leaders were re-elected, and look for WARN or ERROR messages.
    Consider setting the broker configuration parameter unclean.leader.election.enable
    to false so that a replica outside of the set of in-sync replicas is never elected
    leader.
  summary: Kafka has unclean leader elections.
expr: (sum by (job,kafka_cluster,instance) (kafka_controller_controllerstats_uncleanleaderelectionspersec{job="integrations/kafka"}))
  != 0
for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaBrokerCount

{{< code lang="yaml" >}}
alert: KafkaBrokerCount
annotations:
  description: Kafka cluster {{ $labels.kafka_cluster }} broker count is 0.
  summary: Kafka has no brokers online.
expr: count by(job,kafka_cluster) (kafka_server_kafkaserver_brokerstate{job="integrations/kafka"})
  == 0
for: 5m
labels:
  severity: critical
{{< /code >}}

##### KafkaZookeeperSyncConnect

{{< code lang="yaml" >}}
alert: KafkaZookeeperSyncConnect
annotations:
  description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster
    }} has disconnected from Zookeeper.
  summary: Kafka Zookeeper sync disconnected.
expr: avg by(job,kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnectspersec{job="integrations/kafka",
  quantile="0.95"}[5m])) < 0
for: 5m
labels:
  severity: critical
{{< /code >}}

### kafka-jvm-alerts

##### JvmMemoryFillingUp

{{< code lang="yaml" >}}
alert: JvmMemoryFillingUp
annotations:
  description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
    5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
  summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_bytes_used{area="heap", job="integrations/kafka"}))/(sum
  without (id) (jvm_memory_bytes_max{area="heap", job="integrations/kafka"} != -1)))
  * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
  severity: warning
{{< /code >}}

##### JvmThreadsDeadlocked

{{< code lang="yaml" >}}
alert: JvmThreadsDeadlocked
annotations:
  description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
    are in a cyclic dependency with each other. The restart is required to resolve
    the deadlock.'
  summary: JVM deadlock detected.
expr: (jvm_threads_deadlocked{job="integrations/kafka"}) > 0
for: 2m
keep_firing_for: 5m
labels:
  severity: critical
{{< /code >}}

### kafka-zookeeper-jvm-alerts

##### JvmMemoryFillingUp

{{< code lang="yaml" >}}
alert: JvmMemoryFillingUp
annotations:
  description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
    5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
  summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_bytes_used{area="heap", job=~"integrations/kafka-zookeeper|integrations/kafka"}))/(sum
  without (id) (jvm_memory_bytes_max{area="heap", job=~"integrations/kafka-zookeeper|integrations/kafka"}
  != -1))) * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
  severity: warning
{{< /code >}}

##### JvmThreadsDeadlocked

{{< code lang="yaml" >}}
alert: JvmThreadsDeadlocked
annotations:
  description: 'JVM deadlock detected: Threads in the JVM application {{$labels.instance}}
    are in a cyclic dependency with each other. The restart is required to resolve
    the deadlock.'
  summary: JVM deadlock detected.
expr: (jvm_threads_deadlocked{job=~"integrations/kafka-zookeeper|integrations/kafka"})
  > 0
for: 2m
keep_firing_for: 5m
labels:
  severity: critical
{{< /code >}}

## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:

- [connect-overview](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/connect-overview.json)
- [kafka-ksqldb-overview](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/kafka-ksqldb-overview.json)
- [kafka-overview-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/kafka-overview-dashboard.json)
- [kafka-topic-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/kafka-topic-dashboard.json)
- [schema-registry-overview](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/schema-registry-overview.json)
- [zookeeper-overview](https://github.com/monitoring-mixins/website/blob/master/assets/kafka/dashboards/zookeeper-overview.json)
@@ -10,3 +10,32 @@ title: spring-boot

Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/spring-boot-mixin)
{{< /panel >}}

## Alerts

{{< panel style="warning" >}}
The complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/alerts.yaml).
{{< /panel >}}

### jvm-micrometer-jvm-alerts

##### JvmMemoryFillingUp

{{< code lang="yaml" >}}
alert: JvmMemoryFillingUp
annotations:
  description: JVM heap memory usage is at {{ printf "%.0f" $value }}% over the last
    5 minutes on {{$labels.instance}}, which is above the threshold of 80%.
  summary: JVM heap memory filling up.
expr: ((sum without (id) (jvm_memory_used_bytes{area="heap", job!=""}))/(sum without
  (id) (jvm_memory_max_bytes{area="heap", job!=""} != -1))) * 100 > 80
for: 5m
keep_firing_for: 5m
labels:
  severity: warning
{{< /code >}}

## Dashboards
The following dashboards are generated from mixins and hosted on GitHub:

- [jvm-dashboard](https://github.com/monitoring-mixins/website/blob/master/assets/spring-boot/dashboards/jvm-dashboard.json)