Mirror of https://github.com/prometheus-operator/prometheus-operator.git, synced 2025-04-21 11:48:53 +00:00
Commit 8cd343a913: 29 changed files with 497 additions and 5107 deletions
Changed paths:
contrib/kube-prometheus
    assets
        alertmanager
        grafana
            _grafanalib.py
            deployment.dashboard.py
            generated
            kubernetes-capacity-planning.dashboard.py
            kubernetes-cluster-health.dashboard.py
            kubernetes-cluster-status.dashboard.py
            kubernetes-control-plane-status.dashboard.py
            kubernetes-resource-requests.dashboard.py
            nodes.dashboard.py
            pods.dashboard.py
            prometheus-datasource.json
            raw-json-dashboards
            statefulset.dashboard.py
        prometheus/rules
    jsonnet/kube-prometheus
    manifests
contrib/kube-prometheus/assets/alertmanager (Alertmanager configuration):
@@ -1,14 +0,0 @@
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'
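The removed config routes everything to a 'null' receiver, with a dedicated route so the DeadMansSwitch watchdog alert never pages anyone. A minimal sanity-check sketch for such a config, assuming PyYAML is installed and the file is saved locally as alertmanager.yaml (both assumptions, not part of this commit):

# Sketch: every route must reference a declared receiver.
# Assumes PyYAML and a local alertmanager.yaml; neither comes from the diff.
import yaml

with open("alertmanager.yaml") as f:
    cfg = yaml.safe_load(f)

receivers = {r["name"] for r in cfg["receivers"]}
assert cfg["route"]["receiver"] in receivers
for route in cfg["route"].get("routes", []):
    assert route["receiver"] in receivers, route
print("route tree references only declared receivers")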
contrib/kube-prometheus/assets/grafana/_grafanalib.py:
@@ -1,91 +0,0 @@
from grafanalib import core
from grafanalib.core import Graph, Time, SparkLine, \
    Gauge, Templating, XAxis, YAxes


def Dashboard(
    title, version, time, rows, graphTooltip=0, templating=None,
):
    optional_args = {}
    if templating is not None:
        optional_args['templating'] = templating
    return core.Dashboard(
        title=title, refresh=None, schemaVersion=14,
        version=version, time=time, timezone='browser', inputs=[
            {
                'name': 'prometheus',
                'label': 'prometheus',
                'description': '',
                'type': 'datasource',
                'pluginId': 'prometheus',
                'pluginName': 'Prometheus'
            },
        ], rows=rows, graphTooltip=graphTooltip, editable=False, **optional_args,
    )


def Row(
    panels, height=None, title='Dashboard Row', showTitle=False
):
    assert isinstance(height, (type(None), int))
    return core.Row(
        panels=panels, height=height, title=title, showTitle=showTitle,
        titleSize='h6', editable=False,
    )


def SingleStat(
    title, id, targets, colorValue=False, gauge=Gauge(show=True),
    valueFontSize='80%', thresholds=None, valueName='avg', valueMaps=None,
    rangeMaps=None, mappingTypes=None, mappingType=None, postfix=None,
    sparkline=SparkLine(), prefixFontSize='50%', colors=[
        (50, 172, 45, 0.97),
        (237, 129, 40, 0.89),
        (245, 54, 54, 0.9),
    ], span=None, format='none', transparent=None,
):
    def merge_target(target):
        return {**{
            'intervalFactor': 2,
            'refId': 'A',
            'step': 600,
        }, **target}

    targets = [merge_target(t) for t in targets]

    return core.SingleStat(
        title=title, id=id, colorValue=colorValue,
        dataSource='prometheus', gauge=gauge,
        valueFontSize=valueFontSize, thresholds=thresholds,
        valueName=valueName, valueMaps=valueMaps, rangeMaps=rangeMaps,
        mappingTypes=mappingTypes, targets=targets,
        mappingType=mappingType, format=format, colors=colors, span=span,
        postfix=postfix, sparkline=sparkline, prefixFontSize=prefixFontSize,
        hideTimeOverride=None, transparent=transparent, editable=False,
    )


def Graph(
    id, title, targets, dashLength=None, dashes=False, spaceLength=None,
    xAxis=None, yAxes=None, nullPointMode='connected',
):
    def merge_target(target):
        return {**{
            'intervalFactor': 2,
            'legendFormat': '',
            'refId': 'A',
            'step': 600,
        }, **target}

    targets = [merge_target(t) for t in targets]
    assert isinstance(yAxes, YAxes)
    return core.Graph(
        id=id, title=title, dashLength=dashLength, dashes=dashes,
        spaceLength=spaceLength, targets=targets, xAxis=xAxis, yAxes=yAxes,
        dataSource='prometheus', nullPointMode=nullPointMode, editable=False,
    )


def YAxis(format='none', label='', min=0, show=True):
    return core.YAxis(
        format=format, label=label, min=min, show=show
    )
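These wrappers pin repo-wide defaults (the 'prometheus' datasource input, editable=False, and target defaults such as refId and step) so the individual *.dashboard.py files below stay declarative. A minimal sketch of how they compose, assuming _grafanalib.py sits next to the script; the 'sum(up)' query is illustrative only, not taken from this commit:

import sys
import os.path
sys.path.insert(0, os.path.dirname(__file__))
from _grafanalib import *

# Illustrative dashboard built from the wrappers above.
example = Dashboard(
    title='Example',
    version=1,
    time=Time(start='now-6h'),
    rows=[
        Row(panels=[
            Graph(
                id=1,
                title='Scrape targets up',
                targets=[{'expr': 'sum(up)'}],  # merge_target fills refId/step
                xAxis=XAxis(mode='time'),
                yAxes=YAxes(YAxis(min=None), YAxis(format='short', min=None)),
            ),
        ]),
    ],
)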
contrib/kube-prometheus/assets/grafana/deployment.dashboard.py:
@@ -1,467 +0,0 @@
import sys
import os.path
sys.path.insert(0, os.path.dirname(__file__))
from _grafanalib import *


dashboard = Dashboard(
    title='Deployment',
    version=1,
    graphTooltip=1,
    time=Time(start='now-6h'),
    templating=Templating(list=[
        {
            'allValue': '.*',
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': 'Namespace',
            'multi': False,
            'name': 'deployment_namespace',
            'options': [],
            'query': 'label_values(kube_deployment_metadata_generation, namespace)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': None,
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
        {
            'allValue': None,
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': 'Deployment',
            'multi': False,
            'name': 'deployment_name',
            'options': [],
            'query': 'label_values(kube_deployment_metadata_generation{namespace="$deployment_namespace"}, deployment)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': 'deployment',
            'type': 'query',
            'useTags': False,
        },
    ]),
    rows=[
        Row(
            panels=[
                SingleStat(
                    title='CPU',
                    id=8,
                    gauge=Gauge(show=False),
                    postfix='cores',
                    span=4,
                    valueFontSize='110%',
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    sparkline=SparkLine(
                        fillColor=(31, 118, 189, 0.18),
                        lineColor=(31, 120, 193),
                        show=True,
                    ),
                    targets=[
                        {'expr': 'sum(rate(container_cpu_usage_seconds_total{namespace="$deployment_namespace",pod_name=~"$deployment_name.*"}[3m]))'},
                    ],
                ),
                SingleStat(
                    title='Memory',
                    id=9,
                    postfix='GB',
                    prefixFontSize='80%',
                    gauge=Gauge(show=False),
                    span=4,
                    valueFontSize='110%',
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    sparkline=SparkLine(
                        fillColor=(31, 118, 189, 0.18),
                        lineColor=(31, 120, 193),
                        show=True,
                    ),
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {
                            'expr': 'sum(container_memory_usage_bytes{namespace="$deployment_namespace",pod_name=~"$deployment_name.*"}) / 1024^3',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Network',
                    format='Bps',
                    gauge=Gauge(thresholdMarkers=False),
                    id=7,
                    postfix='',
                    span=4,
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    sparkline=SparkLine(
                        fillColor=(31, 118, 189, 0.18),
                        lineColor=(31, 120, 193),
                        show=True,
                    ),
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {'expr': 'sum(rate(container_network_transmit_bytes_total{namespace="$deployment_namespace",pod_name=~"$deployment_name.*"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace="$deployment_namespace",pod_name=~"$deployment_name.*"}[3m]))'},
                    ],
                ),
            ],
            height=200,
        ),
        Row(
            height=100, panels=[
                SingleStat(
                    title='Desired Replicas',
                    id=5,
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    span=3,
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {
                            'metric': 'kube_deployment_spec_replicas',
                            'expr': 'max(kube_deployment_spec_replicas{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                        },
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    gauge=Gauge(thresholdMarkers=False, show=False),
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                ),
                SingleStat(
                    title='Available Replicas',
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    gauge=Gauge(show=False),
                    id=6,
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {'expr': 'min(kube_deployment_status_replicas_available{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)'},
                    ],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                ),
                SingleStat(
                    title='Observed Generation',
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    gauge=Gauge(),
                    id=3,
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {'expr': 'max(kube_deployment_status_observed_generation{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)'},
                    ],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                ),
                SingleStat(
                    title='Metadata Generation',
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    gauge=Gauge(show=False),
                    id=2,
                    mappingType=1,
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {'expr': 'max(kube_deployment_metadata_generation{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)'},
                    ],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                ),
            ],
        ),
        Row(
            height=350, panels=[
                Graph(
                    title='Replicas',
                    dashLength=10,
                    dashes=False,
                    id=1,
                    spaceLength=10,
                    targets=[
                        {
                            'expr': 'max(kube_deployment_status_replicas{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                            'legendFormat': 'current replicas',
                            'refId': 'A',
                            'step': 30,
                        },
                        {
                            'expr': 'min(kube_deployment_status_replicas_available{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                            'legendFormat': 'available',
                            'refId': 'B',
                            'step': 30,
                        },
                        {
                            'expr': 'max(kube_deployment_status_replicas_unavailable{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                            'legendFormat': 'unavailable',
                            'refId': 'C',
                            'step': 30,
                        },
                        {
                            'expr': 'min(kube_deployment_status_replicas_updated{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                            'legendFormat': 'updated',
                            'refId': 'D',
                            'step': 30,
                        },
                        {
                            'expr': 'max(kube_deployment_spec_replicas{deployment="$deployment_name",namespace="$deployment_namespace"}) without (instance, pod)',
                            'legendFormat': 'desired',
                            'refId': 'E',
                            'step': 30,
                        },
                    ],
                    xAxis=XAxis(mode='time'),
                    yAxes=YAxes(
                        YAxis(min=None),
                        YAxis(format='short', min=None, show=False),
                    ),
                ),
            ],
        ),
    ],
)
contrib/kube-prometheus/assets/grafana/generated (ignore patterns for generated JSON):
@@ -1,2 +0,0 @@
*-dashboard.json
*-datasource.json
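The two ignore patterns match the JSON artifacts rendered out of the *.dashboard.py sources, which is presumably why only the Python files were tracked. A sketch of the regeneration step, assuming grafanalib's documented generate-dashboard console script is on PATH; the filenames here are chosen to match the pattern and are assumptions, not part of the commit:

# Render deployment.dashboard.py to JSON via grafanalib's CLI; the script
# loads the module's `dashboard` attribute. Paths are illustrative.
import subprocess

subprocess.run(
    ['generate-dashboard', '-o', 'generated/deployment-dashboard.json',
     'deployment.dashboard.py'],
    check=True,
)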
contrib/kube-prometheus/assets/grafana/kubernetes-capacity-planning.dashboard.py:
@@ -1,465 +0,0 @@
from grafanalib.core import *


dashboard = Dashboard(
    title='Kubernetes Capacity Planning',
    version=4,
    gnetId=22,
    graphTooltip=0,
    refresh=False,
    editable=False,
    schemaVersion=14,
    time=Time(start='now-1h'),
    timezone='browser',
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus',
        }
    ],
    rows=[
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Idle CPU',
                    id=3,
                    dataSource='prometheus',
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(format='percent', label='cpu usage'),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum(rate(node_cpu{mode="idle"}[2m])) * 100',
                            'hide': False,
                            'intervalFactor': 10,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 50,
                        },
                    ],
                ),
                Graph(
                    title='System Load',
                    id=9,
                    dataSource='prometheus',
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(format='percentunit', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {'expr': 'sum(node_load1)', 'intervalFactor': 4, 'legendFormat': 'load 1m', 'refId': 'A', 'step': 20, 'target': ''},
                        {'expr': 'sum(node_load5)', 'intervalFactor': 4, 'legendFormat': 'load 5m', 'refId': 'B', 'step': 20, 'target': ''},
                        {'expr': 'sum(node_load15)', 'intervalFactor': 4, 'legendFormat': 'load 15m', 'refId': 'C', 'step': 20, 'target': ''},
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Memory Usage',
                    id=4,
                    dataSource='prometheus',
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=9,
                    stack=True,
                    seriesOverrides=[
                        {
                            'alias': 'node_memory_SwapFree{instance="172.17.0.1:9100",job="prometheus"}',
                            'yaxis': 2,
                        }
                    ],
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='bytes', min='0'),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)',
                            'intervalFactor': 2,
                            'legendFormat': 'memory usage',
                            'metric': 'memo',
                            'refId': 'A',
                            'step': 10,
                            'target': '',
                        },
                        {'expr': 'sum(node_memory_Buffers)', 'interval': '', 'intervalFactor': 2, 'legendFormat': 'memory buffers', 'metric': 'memo', 'refId': 'B', 'step': 10, 'target': ''},
                        {'expr': 'sum(node_memory_Cached)', 'interval': '', 'intervalFactor': 2, 'legendFormat': 'memory cached', 'metric': 'memo', 'refId': 'C', 'step': 10, 'target': ''},
                        {'expr': 'sum(node_memory_MemFree)', 'interval': '', 'intervalFactor': 2, 'legendFormat': 'memory free', 'metric': 'memo', 'refId': 'D', 'step': 10, 'target': ''},
                    ],
                ),
                SingleStat(
                    title='Memory Usage',
                    dataSource='prometheus',
                    id=5,
                    format='percent',
                    span=3,
                    gauge=Gauge(show=True),
                    editable=False,
                    thresholds='80, 90',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': '((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100',
                            'intervalFactor': 2,
                            'metric': '',
                            'refId': 'A',
                            'step': 60,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=246, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Disk I/O',
                    dataSource='prometheus',
                    id=6,
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=9,
                    tooltip=Tooltip(msResolution=False),
                    seriesOverrides=[
                        {'alias': 'read', 'yaxis': 1},
                        {'alias': '{instance="172.17.0.1:9100"}', 'yaxis': 2},
                        {'alias': 'io time', 'yaxis': 2},
                    ],
                    yAxes=YAxes(
                        YAxis(format='bytes', min=None),
                        YAxis(format='ms', min=None),
                    ),
                    targets=[
                        {'expr': 'sum(rate(node_disk_bytes_read[5m]))', 'hide': False, 'intervalFactor': 4, 'legendFormat': 'read', 'refId': 'A', 'step': 20, 'target': ''},
                        {'expr': 'sum(rate(node_disk_bytes_written[5m]))', 'intervalFactor': 4, 'legendFormat': 'written', 'refId': 'B', 'step': 20},
                        {'expr': 'sum(rate(node_disk_io_time_ms[5m]))', 'intervalFactor': 4, 'legendFormat': 'io time', 'refId': 'C', 'step': 20},
                    ],
                ),
                SingleStat(
                    title='Disk Space Usage',
                    dataSource='prometheus',
                    id=12,
                    span=3,
                    editable=False,
                    format='percentunit',
                    valueName='current',
                    gauge=Gauge(maxValue=1, show=True),
                    thresholds='0.75, 0.9',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': '(sum(node_filesystem_size{device!="rootfs"}) - sum(node_filesystem_free{device!="rootfs"})) / sum(node_filesystem_size{device!="rootfs"})',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 60,
                            'target': '',
                        },
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                ),
            ]
        ),
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Network Received',
                    dataSource='prometheus',
                    id=8,
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    seriesOverrides=[{'alias': 'transmitted', 'yaxis': 2}],
                    yAxes=YAxes(
                        YAxis(format='bytes', min=None),
                        YAxis(format='bytes', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum(rate(node_network_receive_bytes{device!~"lo"}[5m]))',
                            'hide': False,
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 10,
                            'target': '',
                        },
                    ],
                ),
                Graph(
                    title='Network Transmitted',
                    dataSource='prometheus',
                    id=10,
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    seriesOverrides=[{'alias': 'transmitted', 'yaxis': 2}],
                    yAxes=YAxes(
                        YAxis(format='bytes', min=None),
                        YAxis(format='bytes', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum(rate(node_network_transmit_bytes{device!~"lo"}[5m]))',
                            'hide': False,
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'B',
                            'step': 10,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=276, title='New Row', showTitle=False, editable=False,
            titleSize='h6',
            panels=[
                Graph(
                    title='Cluster Pod Utilization',
                    dataSource='prometheus',
                    id=11,
                    span=9,
                    dashes=False,
                    editable=False,
                    spaceLength=11,
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='short', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {'expr': 'sum(kube_pod_info)', 'format': 'time_series', 'intervalFactor': 2, 'legendFormat': 'Current number of Pods', 'refId': 'A', 'step': 10},
                        {'expr': 'sum(kube_node_status_capacity_pods)', 'format': 'time_series', 'intervalFactor': 2, 'legendFormat': 'Maximum capacity of pods', 'refId': 'B', 'step': 10},
                    ],
                ),
                SingleStat(
                    title='Pod Utilization',
                    dataSource='prometheus',
                    id=7,
                    editable=False,
                    span=3,
                    format='percent',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    gauge=Gauge(show=True),
                    thresholds='80, 90',
                    valueName='current',
                    targets=[
                        {
                            'expr': '100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 60,
                            'target': '',
                        },
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                ),
            ]
        ),
    ],
)
contrib/kube-prometheus/assets/grafana/kubernetes-cluster-health.dashboard.py:
@@ -1,405 +0,0 @@
from grafanalib.core import *


dashboard = Dashboard(
    title='Kubernetes Cluster Health',
    version=9,
    graphTooltip=0,
    schemaVersion=14,
    editable=False,
    time=Time(start='now-6h'),
    timezone='browser',
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus'
        },
    ],
    rows=[
        Row(
            height=254, title='Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                SingleStat(
                    title='Control Plane Components Down',
                    id=1,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    span=3,
                    thresholds='1, 3',
                    colorValue=True,
                    editable=False,
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[
                        {'op': '=', 'text': 'Everything UP and healthy', 'value': 'null'},
                        {'op': '=', 'text': '', 'value': ''},
                    ],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(up{job=~"apiserver|kube-scheduler|kube-controller-manager"} == 0)',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Alerts Firing',
                    id=2,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(ALERTS{alertstate="firing",alertname!="DeadMansSwitch"})',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Alerts Pending',
                    id=3,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='3, 5',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(ALERTS{alertstate="pending",alertname!="DeadMansSwitch"})',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Crashlooping Pods',
                    id=4,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'count(increase(kube_pod_container_status_restarts[1h]) > 5)',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                SingleStat(
                    title='Node Not Ready',
                    id=5,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(kube_node_status_condition{condition="Ready",status!="true"})',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Node Disk Pressure',
                    id=6,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(kube_node_status_condition{condition="DiskPressure",status="true"})',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Node Memory Pressure',
                    id=7,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(kube_node_status_condition{condition="MemoryPressure",status="true"})',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
                SingleStat(
                    title='Nodes Unschedulable',
                    id=8,
                    dataSource='prometheus',
                    gauge=Gauge(),
                    colorValue=True,
                    editable=False,
                    span=3,
                    valueName='current',
                    thresholds='1, 3',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(kube_node_spec_unschedulable)',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ],
                ),
            ],
        ),
    ],
)
contrib/kube-prometheus/assets/grafana/kubernetes-cluster-status.dashboard.py:
@@ -1,450 +0,0 @@
import sys
import os.path
sys.path.insert(0, os.path.dirname(__file__))
from _grafanalib import *


dashboard = Dashboard(
    title='Kubernetes Cluster Status',
    version=3,
    time=Time(start='now-6h'),
    rows=[
        Row(
            height=129, title='Cluster Health', showTitle=True,
            panels=[
                SingleStat(
                    title='Control Plane UP',
                    id=5,
                    gauge=Gauge(show=False),
                    colorValue=True,
                    mappingType=1,
                    thresholds='1, 3',
                    valueName='total',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'UP', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(up{job=~"apiserver|kube-scheduler|kube-controller-manager"} == 0)',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Alerts Firing',
                    id=6,
                    gauge=Gauge(show=False),
                    colorValue=True,
                    mappingType=1,
                    thresholds='3, 5',
                    valueName='current',
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    targets=[
                        {
                            'expr': 'sum(ALERTS{alertstate="firing",alertname!="DeadMansSwitch"})',
                            'format': 'time_series',
                        },
                    ]
                ),
            ],
        ),
        Row(
            height=168, title='Control Plane Status', showTitle=True,
            panels=[
                SingleStat(
                    title='API Servers UP',
                    id=1,
                    mappingType=1,
                    format='percent',
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    thresholds='50, 80',
                    span=3,
                    valueName='current',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': '(sum(up{job="apiserver"} == 1) / count(up{job="apiserver"})) * 100',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Controller Managers UP',
                    id=2,
                    span=3,
                    mappingType=1,
                    thresholds='50, 80',
                    format='percent',
                    valueName='current',
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    targets=[
                        {
                            'expr': '(sum(up{job="kube-controller-manager"} == 1) / count(up{job="kube-controller-manager"})) * 100',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Schedulers UP',
                    id=3,
                    span=3,
                    mappingType=1,
                    format='percent',
                    thresholds='50, 80',
                    valueName='current',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    targets=[
                        {
                            'expr': '(sum(up{job="kube-scheduler"} == 1) / count(up{job="kube-scheduler"})) * 100',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Crashlooping Control Plane Pods',
                    id=4,
                    colorValue=True,
                    gauge=Gauge(show=False),
                    span=3,
                    mappingType=1,
                    thresholds='1, 3',
                    valueName='current',
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    targets=[
                        {
                            'expr': 'count(increase(kube_pod_container_status_restarts{namespace=~"kube-system|tectonic-system"}[1h]) > 5)',
                            'format': 'time_series',
                        },
                    ]
                ),
            ],
        ),
        Row(
            height=158, title='Capacity Planning', showTitle=True,
            panels=[
                SingleStat(
                    title='CPU Utilization',
                    id=8,
                    format='percent',
                    mappingType=1,
                    span=3,
                    thresholds='80, 90',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'sum(100 - (avg by (instance) (rate(node_cpu{job="node-exporter",mode="idle"}[5m])) * 100)) / count(node_cpu{job="node-exporter",mode="idle"})',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Memory Utilization',
                    id=7,
                    format='percent',
                    span=3,
                    mappingType=1,
                    thresholds='80, 90',
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': '((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Filesystem Utilization',
                    id=9,
                    span=3,
                    format='percent',
                    mappingType=1,
                    thresholds='80, 90',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': '(sum(node_filesystem_size{device!="rootfs"}) - sum(node_filesystem_free{device!="rootfs"})) / sum(node_filesystem_size{device!="rootfs"})',
                            'format': 'time_series',
                        },
                    ]
                ),
                SingleStat(
                    title='Pod Utilization',
                    id=10,
                    gauge=Gauge(show=True),
                    span=3,
                    mappingType=1,
                    format='percent',
                    thresholds='80, 90',
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': '100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100',
                            'format': 'time_series',
                        },
                    ]
                ),
            ],
        ),
    ],
)
contrib/kube-prometheus/assets/grafana/kubernetes-control-plane-status.dashboard.py:
@@ -1,344 +0,0 @@
from grafanalib.core import *

dashboard = Dashboard(
    title='Kubernetes Control Plane Status',
    version=3,
    graphTooltip=0,
    schemaVersion=14,
    time=Time(start='now-6h'),
    timezone='browser',
    refresh=None,
    editable=False,
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus'
        },
    ],
    rows=[
        Row(
            title='Dashboard Row', showTitle=False, titleSize='h6', editable=False,
            panels=[
                SingleStat(
                    title='API Servers UP',
                    dataSource='prometheus',
                    format='percent',
                    editable=False,
                    gauge=Gauge(show=True),
                    id=1,
                    span=3,
                    thresholds='50, 80',
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {
                            'expr': '(sum(up{job="apiserver"} == 1) / sum(up{job="apiserver"})) * 100',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 600,
                        },
                    ]
                ),
                SingleStat(
                    title='Controller Managers UP',
                    dataSource='prometheus',
                    format='percent',
                    editable=False,
                    gauge=Gauge(show=True),
                    id=2,
                    span=3,
                    thresholds='50, 80',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {
                            'expr': '(sum(up{job="kube-controller-manager"} == 1) / sum(up{job="kube-controller-manager"})) * 100',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 600,
                        }
                    ]
                ),
                SingleStat(
                    title='Schedulers UP',
                    dataSource='prometheus',
                    format='percent',
                    editable=False,
                    gauge=Gauge(show=True),
                    id=3,
                    span=3,
                    thresholds='50, 80',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    colors=[(245, 54, 54, 0.9), (237, 129, 40, 0.89), (50, 172, 45, 0.97)],
                    targets=[
                        {
                            'expr': '(sum(up{job="kube-scheduler"} == 1) / sum(up{job="kube-scheduler"})) * 100',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 600,
                        }
                    ]
                ),
                SingleStat(
                    title='API Server Request Error Rate',
                    dataSource='prometheus',
                    format='percent',
                    editable=False,
                    gauge=Gauge(show=True),
                    id=4,
                    span=3,
                    thresholds='5, 10',
                    valueMaps=[{'op': '=', 'text': '0', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    mappingTypes=[
                        {'name': 'value to text', 'value': 1},
                        {'name': 'range to text', 'value': 2},
                    ],
                    targets=[
                        {
                            'expr': 'max(sum by(instance) (rate(apiserver_request_count{code=~"5.."}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 600,
                        },
                    ]
                ),
            ],
        ),
        Row(
            title='Dashboard Row', showTitle=False, titleSize='h6', editable=False,
            panels=[
                Graph(
                    title='API Server Request Latency',
                    id=7,
                    dataSource='prometheus',
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    lineWidth=1,
                    nullPointMode='null',
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    spaceLength=10,
                    yAxes=YAxes(
                        YAxis(format='short', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 30,
                        }
                    ],
                ),
            ],
        ),
        Row(
            title='Dashboard Row', showTitle=False, titleSize='h6', editable=False,
            panels=[
                Graph(
                    title='End to End Scheduling Latency',
                    id=5,
                    dataSource='prometheus',
                    isNew=False,
                    editable=False,
                    dashLength=10,
                    lineWidth=1,
                    nullPointMode='null',
                    spaceLength=10,
                    span=6,
                    dashes=False,
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='short', min=None),
                        YAxis(format='dtdurations', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'cluster:scheduler_e2e_scheduling_latency_seconds:quantile',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 60,
                        }
                    ],
                ),
                Graph(
                    title='API Server Request Rates',
                    id=6,
                    dataSource='prometheus',
                    isNew=False,
                    editable=False,
                    dashLength=10,
                    lineWidth=1,
                    nullPointMode='null',
                    spaceLength=10,
                    span=6,
                    dashes=False,
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='short', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum by(instance) (rate(apiserver_request_count{code!~"2.."}[5m]))',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': 'Error Rate',
                            'refId': 'A',
                            'step': 60,
                        },
                        {
                            'expr': 'sum by(instance) (rate(apiserver_request_count[5m]))',
                            'format': 'time_series',
                            'intervalFactor': 2,
                            'legendFormat': 'Request Rate',
                            'refId': 'B',
                            'step': 60,
                        },
                    ],
                ),
            ],
        ),
    ],
)
contrib/kube-prometheus/assets/grafana/kubernetes-resource-requests.dashboard.py:
@@ -1,205 +0,0 @@
from grafanalib.core import *


dashboard = Dashboard(
    title='Kubernetes Resource Requests',
    version=2,
    graphTooltip=0,
    refresh=False,
    editable=False,
    schemaVersion=14,
    time=Time(start='now-3h'),
    timezone='browser',
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus'
        },
    ],
    rows=[
        Row(
            height=300, title='CPU Cores', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='CPU Cores',
                    description='This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.',
                    id=1,
                    dataSource='prometheus',
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    lineWidth=1,
                    spaceLength=10,
                    nullPointMode='null',
                    span=9,
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='short', label='CPU Cores', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {'expr': 'min(sum(kube_node_status_allocatable_cpu_cores) by (instance))', 'hide': False, 'intervalFactor': 2, 'legendFormat': 'Allocatable CPU Cores', 'refId': 'A', 'step': 20},
                        {'expr': 'max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))', 'hide': False, 'intervalFactor': 2, 'legendFormat': 'Requested CPU Cores', 'refId': 'B', 'step': 20},
                    ],
                ),
                SingleStat(
                    title='CPU Cores',
                    dataSource='prometheus',
                    id=2,
                    format='percent',
                    editable=False,
                    span=3,
                    gauge=Gauge(show=True),
                    sparkline=SparkLine(show=True),
                    valueFontSize='110%',
                    thresholds='80, 90',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': 'max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 240,
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=300, title='Memory', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Memory',
                    id=3,
                    dataSource='prometheus',
                    description='This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.',
                    dashLength=10,
                    dashes=False,
                    lineWidth=1,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=9,
                    nullPointMode='null',
                    tooltip=Tooltip(msResolution=False, valueType='individual'),
                    yAxes=YAxes(
                        YAxis(format='bytes', label='Memory', min=None),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {'expr': 'min(sum(kube_node_status_allocatable_memory_bytes) by (instance))', 'hide': False, 'intervalFactor': 2, 'legendFormat': 'Allocatable Memory', 'refId': 'A', 'step': 20},
                        {'expr': 'max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))', 'hide': False, 'intervalFactor': 2, 'legendFormat': 'Requested Memory', 'refId': 'B', 'step': 20},
                    ],
                ),
                SingleStat(
                    title='Memory',
                    dataSource='prometheus',
                    id=4,
                    format='percent',
                    span=3,
                    gauge=Gauge(show=True),
                    sparkline=SparkLine(show=True),
                    editable=False,
                    valueFontSize='110%',
                    thresholds='80, 90',
                    valueMaps=[{'op': '=', 'text': 'N/A', 'value': 'null'}],
                    rangeMaps=[{'from': 'null', 'text': 'N/A', 'to': 'null'}],
                    targets=[
                        {
                            'expr': 'max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100',
                            'intervalFactor': 2,
                            'legendFormat': '',
                            'refId': 'A',
                            'step': 240,
                        },
                    ],
                ),
            ],
        ),
    ],
)
@@ -1,423 +0,0 @@
from grafanalib.core import *


dashboard = Dashboard(
    title='Nodes',
    version=2,
    description='Dashboard to get an overview of one server',
    gnetId=22,
    graphTooltip=0,
    refresh=False,
    editable=False,
    schemaVersion=14,
    time=Time(start='now-1h'),
    timezone='browser',
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus'
        },
    ],
    templating=Templating(list=[
        {
            'allValue': None,
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': None,
            'multi': False,
            'name': 'server',
            'options': [],
            'query': 'label_values(node_boot_time, instance)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
    ]),
    rows=[
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Idle CPU',
                    dataSource='prometheus',
                    id=3,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    dashLength=10,
                    dashes=False,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(
                            format='percent',
                            label='cpu usage',
                            max=100,
                        ),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': '100 - (avg by (cpu) (irate(node_cpu'
                                    '{mode=\"idle\", instance=\"$server\"}[5m])) '
                                    '* 100)',
                            'hide': False,
                            'intervalFactor': 10,
                            'legendFormat': '{{cpu}}',
                            'refId': 'A',
                            'step': 50,
                        }
                    ],
                ),
                Graph(
                    title='System Load',
                    dataSource='prometheus',
                    id=9,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    dashLength=10,
                    dashes=False,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(format='percentunit', min=None,),
                        YAxis(format='short', min=None,),
                    ),
                    targets=[
                        {
                            'expr': 'node_load1{instance=\"$server\"}',
                            'intervalFactor': 4,
                            'legendFormat': 'load 1m',
                            'refId': 'A',
                            'step': 20,
                            'target': '',
                        },
                        {
                            'expr': 'node_load5{instance=\"$server\"}',
                            'intervalFactor': 4,
                            'legendFormat': 'load 5m',
                            'refId': 'B',
                            'step': 20,
                            'target': '',
                        },
                        {
                            'expr': 'node_load15{instance=\"$server\"}',
                            'intervalFactor': 4,
                            'legendFormat': 'load 15m',
                            'refId': 'C',
                            'step': 20,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Memory Usage',
                    dataSource='prometheus',
                    id=4,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=9,
                    stack=True,
                    dashLength=10,
                    dashes=False,
                    tooltip=Tooltip(
                        msResolution=False, valueType='individual',
                    ),
                    seriesOverrides=[
                        {
                            'alias': 'node_memory_SwapFree{instance='
                                     '\"172.17.0.1:9100\",job=\"prometheus\"}',
                            'yaxis': 2,
                        },
                    ],
                    yAxes=YAxes(
                        YAxis(format='bytes', min='0',),
                        YAxis(format='short', min=None,),
                    ),
                    targets=[
                        {
                            'expr': 'node_memory_MemTotal{instance='
                                    '\"$server\"} - node_memory_MemFree{instance='
                                    '\"$server\"} - node_memory_Buffers{instance='
                                    '\"$server\"} - node_memory_Cached{instance='
                                    '\"$server\"}',
                            'hide': False,
                            'interval': '',
                            'intervalFactor': 2,
                            'legendFormat': 'memory used',
                            'metric': '',
                            'refId': 'C',
                            'step': 10,
                        },
                        {
                            'expr': 'node_memory_Buffers{instance='
                                    '\"$server\"}',
                            'interval': '',
                            'intervalFactor': 2,
                            'legendFormat': 'memory buffers',
                            'metric': '',
                            'refId': 'E',
                            'step': 10,
                        },
                        {
                            'expr': 'node_memory_Cached{instance=\"$server\"}',
                            'intervalFactor': 2,
                            'legendFormat': 'memory cached',
                            'metric': '',
                            'refId': 'F',
                            'step': 10,
                        },
                        {
                            'expr': 'node_memory_MemFree{instance='
                                    '\"$server\"}',
                            'intervalFactor': 2,
                            'legendFormat': 'memory free',
                            'metric': '',
                            'refId': 'D',
                            'step': 10,
                        },
                    ],
                ),
                SingleStat(
                    title='Memory Usage',
                    dataSource='prometheus',
                    id=5,
                    format='percent',
                    gauge=Gauge(show=True),
                    editable=False,
                    span=3,
                    rangeMaps=[
                        {
                            'from': 'null',
                            'text': 'N/A',
                            'to': 'null',
                        }
                    ],
                    thresholds='80, 90',
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        },
                    ],
                    targets=[
                        {
                            'expr': '((node_memory_MemTotal{instance='
                                    '\"$server\"} - node_memory_MemFree{instance='
                                    '\"$server\"} - node_memory_Buffers{instance='
                                    '\"$server\"} - node_memory_Cached{instance='
                                    '\"$server\"}) / node_memory_MemTotal{instance='
                                    '\"$server\"}) * 100',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 60,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, titleSize='h6', title='New Row', editable=False,
            showTitle=False, panels=[
                Graph(
                    title='Disk I/O',
                    dataSource='prometheus',
                    id=6,
                    dashLength=10,
                    dashes=False,
                    editable=False,
                    spaceLength=10,
                    span=9,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(
                            format='bytes',
                            min=None,
                        ),
                        YAxis(
                            format='ms',
                            min=None,
                        ),
                    ),
                    seriesOverrides=[
                        {
                            'alias': 'read',
                            'yaxis': 1,
                        },
                        {
                            'alias': '{instance=\"172.17.0.1:9100\"}',
                            'yaxis': 2,
                        },
                        {
                            'alias': 'io time',
                            'yaxis': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'sum by (instance) (rate(node_disk_'
                                    'bytes_read{instance=\"$server\"}[2m]))',
                            'hide': False,
                            'intervalFactor': 4,
                            'legendFormat': 'read',
                            'refId': 'A',
                            'step': 20,
                            'target': '',
                        },
                        {
                            'expr': 'sum by (instance) (rate(node_disk_'
                                    'bytes_written{instance=\"$server\"}[2m]))',
                            'intervalFactor': 4,
                            'legendFormat': 'written',
                            'refId': 'B',
                            'step': 20
                        },
                        {
                            'expr': 'sum by (instance) (rate(node_disk_io_'
                                    'time_ms{instance=\"$server\"}[2m]))',
                            'intervalFactor': 4,
                            'legendFormat': 'io time',
                            'refId': 'C',
                            'step': 20,
                        },
                    ],
                ),
                SingleStat(
                    title='Disk Space Usage',
                    dataSource='prometheus',
                    id=7,
                    thresholds='0.75, 0.9',
                    editable=False,
                    valueName='current',
                    format='percentunit',
                    span=3,
                    gauge=Gauge(
                        maxValue=1,
                        show=True,
                    ),
                    rangeMaps=[
                        {
                            'from': 'null',
                            'text': 'N/A',
                            'to': 'null',
                        },
                    ],
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        }
                    ],
                    targets=[
                        {
                            'expr': '(sum(node_filesystem_size{device!='
                                    '\"rootfs\",instance=\"$server\"}) - '
                                    'sum(node_filesystem_free{device!=\"rootfs\",'
                                    'instance=\"$server\"})) / sum(node_filesystem_'
                                    'size{device!=\"rootfs\",instance=\"$server\"})',
                            'intervalFactor': 2,
                            'refId': 'A',
                            'step': 60,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='New Row', titleSize='h6',
            showTitle=False, editable=False,
            panels=[
                Graph(
                    title='Network Received',
                    dataSource='prometheus',
                    id=8,
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(format='bytes', min=None),
                        YAxis(format='bytes', min=None),
                    ),
                    seriesOverrides=[
                        {
                            'alias': 'transmitted',
                            'yaxis': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'rate(node_network_receive_bytes{'
                                    'instance=\"$server\",device!~\"lo\"}[5m])',
                            'hide': False,
                            'intervalFactor': 2,
                            'legendFormat': '{{device}}',
                            'refId': 'A',
                            'step': 10,
                            'target': ''
                        }
                    ],
                ),
                Graph(
                    title='Network Transmitted',
                    dataSource='prometheus',
                    id=10,
                    dashLength=10,
                    dashes=False,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=6,
                    tooltip=Tooltip(msResolution=False),
                    yAxes=YAxes(
                        YAxis(format='bytes', min=None),
                        YAxis(format='bytes', min=None),
                    ),
                    seriesOverrides=[
                        {
                            'alias': 'transmitted',
                            'yaxis': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'rate(node_network_transmit_bytes'
                                    '{instance=\"$server\",device!~\"lo\"}[5m])',
                            'hide': False,
                            'intervalFactor': 2,
                            'legendFormat': '{{device}}',
                            'refId': 'B',
                            'step': 10,
                            'target': '',
                        },
                    ],
                ),
            ],
        ),
    ],
)
@@ -1,255 +0,0 @@
from grafanalib.core import *


dashboard = Dashboard(
    title='Pods',
    version=1,
    graphTooltip=1,
    refresh=False,
    editable=False,
    schemaVersion=14,
    time=Time(start='now-6h'),
    timezone='browser',
    inputs=[
        {
            'name': 'prometheus',
            'label': 'prometheus',
            'description': '',
            'type': 'datasource',
            'pluginId': 'prometheus',
            'pluginName': 'Prometheus'
        },
    ],
    templating=Templating(list=[
        {
            'allValue': '.*',
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': True,
            'label': 'Namespace',
            'multi': False,
            'name': 'namespace',
            'options': [],
            'query': 'label_values(kube_pod_info, namespace)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
        {
            'allValue': None,
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': 'Pod',
            'multi': False,
            'name': 'pod',
            'options': [],
            'query': 'label_values(kube_pod_info{namespace=~"$namespace"}, '
                     'pod)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
        {
            'allValue': '.*',
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': True,
            'label': 'Container',
            'multi': False,
            'name': 'container',
            'options': [],
            'query': 'label_values(kube_pod_container_info{namespace='
                     '"$namespace", pod="$pod"}, container)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
    ]),
    rows=[
        Row(
            height=250, title='Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Memory Usage',
                    dataSource='prometheus',
                    id=1,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=12,
                    dashLength=10,
                    dashes=False,
                    tooltip=Tooltip(msResolution=True, valueType='cumulative'),
                    legend=Legend(
                        alignAsTable=True, avg=True, current=True,
                        rightSide=True, total=False, values=True,
                    ),
                    yAxes=YAxes(
                        YAxis(
                            format='bytes', min=None,
                        ),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum by(container_name) (container_'
                                    'memory_usage_bytes{pod_name="$pod", '
                                    'container_name=~"$container", '
                                    'container_name!="POD"})',
                            'interval': '10s',
                            'intervalFactor': 1,
                            'legendFormat': 'Current: {{ container_name }}',
                            'metric': 'container_memory_usage_bytes',
                            'refId': 'A',
                            'step': 15,
                        },
                        {
                            'expr': 'kube_pod_container_resource_requests_'
                                    'memory_bytes{pod="$pod", container=~'
                                    '"$container"}',
                            'interval': '10s',
                            'intervalFactor': 2,
                            'legendFormat': 'Requested: {{ container }}',
                            'metric': 'kube_pod_container_resource_'
                                      'requests_memory_bytes',
                            'refId': 'B',
                            'step': 20,
                        },
                        {
                            'expr': 'kube_pod_container_resource_limits_'
                                    'memory_bytes{pod="$pod", container=~'
                                    '"$container"}',
                            'interval': '10s',
                            'intervalFactor': 2,
                            'legendFormat': 'Limit: {{ container }}',
                            'metric': 'kube_pod_container_resource_'
                                      'limits_memory_bytes',
                            'refId': 'C',
                            'step': 20,
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='CPU Usage',
                    dataSource='prometheus',
                    id=2,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=12,
                    dashLength=10,
                    dashes=False,
                    legend=Legend(
                        alignAsTable=True, avg=True, current=True,
                        rightSide=True, total=False, values=True,
                    ),
                    tooltip=Tooltip(msResolution=True, valueType='cumulative'),
                    yAxes=YAxes(
                        YAxis(
                            format='short', min=None,
                        ),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sum by (container_name)('
                                    'rate(container_cpu_usage_seconds_total'
                                    '{image!="",container_name!="POD",pod_name='
                                    '"$pod"}[1m]))',
                            'intervalFactor': 2,
                            'legendFormat': '{{ container_name }}',
                            'refId': 'A',
                            'step': 30
                        },
                        {
                            'expr': 'kube_pod_container_resource_requests_'
                                    'cpu_cores{pod="$pod", container=~'
                                    '"$container"}',
                            'interval': '10s',
                            'intervalFactor': 2,
                            'legendFormat': 'Requested: {{ container }}',
                            'metric': 'kube_pod_container_resource_'
                                      'requests_cpu_cores',
                            'refId': 'B',
                            'step': 20,
                        },
                        {
                            'expr': 'kube_pod_container_resource_limits_'
                                    'cpu_cores{pod="$pod", container=~'
                                    '"$container"}',
                            'interval': '10s',
                            'intervalFactor': 2,
                            'legendFormat': 'Limit: {{ container }}',
                            'metric': 'kube_pod_container_resource_'
                                      'limits_memory_bytes',
                            'refId': 'C',
                            'step': 20,
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=250, title='New Row', showTitle=False, editable=False,
            titleSize='h6', panels=[
                Graph(
                    title='Network I/O',
                    dataSource='prometheus',
                    id=3,
                    isNew=False,
                    editable=False,
                    spaceLength=10,
                    span=12,
                    dashLength=10,
                    dashes=False,
                    legend=Legend(
                        alignAsTable=True, avg=True, current=True,
                        rightSide=True, total=False, values=True,
                    ),
                    tooltip=Tooltip(msResolution=True, valueType='cumulative'),
                    yAxes=YAxes(
                        YAxis(
                            format='bytes', min=None,
                        ),
                        YAxis(format='short', min=None),
                    ),
                    targets=[
                        {
                            'expr': 'sort_desc(sum by (pod_name) (rate'
                                    '(container_network_receive_bytes_total{'
                                    'pod_name="$pod"}[1m])))',
                            'intervalFactor': 2,
                            'legendFormat': '{{ pod_name }}',
                            'refId': 'A',
                            'step': 30
                        },
                    ],
                ),
            ],
        ),
    ],
)
@@ -1,7 +0,0 @@
{
  "access": "proxy",
  "basicAuth": false,
  "name": "prometheus",
  "type": "prometheus",
  "url": "http://prometheus-k8s.monitoring.svc:9090"
}
File diff suppressed because it is too large
@@ -1,440 +0,0 @@
import sys
import os.path
sys.path.insert(0, os.path.dirname(__file__))
from _grafanalib import *


dashboard = Dashboard(
    title='StatefulSet',
    version=1,
    graphTooltip=1,
    time=Time(start='now-6h'),
    templating=Templating(list=[
        {
            'allValue': '.*',
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': 'Namespace',
            'multi': False,
            'name': 'statefulset_namespace',
            'options': [],
            'query': 'label_values(kube_statefulset_metadata_generation, '
                     'namespace)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': None,
            'tags': [],
            'tagsQuery': '',
            'type': 'query',
            'useTags': False,
        },
        {
            'allValue': None,
            'current': {},
            'datasource': 'prometheus',
            'hide': 0,
            'includeAll': False,
            'label': 'StatefulSet',
            'multi': False,
            'name': 'statefulset_name',
            'options': [],
            'query': 'label_values(kube_statefulset_metadata_generation'
                     '{namespace="$statefulset_namespace"}, statefulset)',
            'refresh': 1,
            'regex': '',
            'sort': 0,
            'tagValuesQuery': '',
            'tags': [],
            'tagsQuery': 'statefulset',
            'type': 'query',
            'useTags': False,
        },
    ]),
    rows=[
        Row(panels=[
            SingleStat(
                title='CPU',
                id=8,
                gauge=Gauge(show=False),
                postfix='cores',
                span=4,
                valueFontSize='110%',
                mappingType=1,
                mappingTypes=[
                    {
                        'name': 'value to text',
                        'value': 1,
                    },
                    {
                        'name': 'range to text',
                        'value': 2,
                    },
                ],
                valueMaps=[
                    {
                        'op': '=',
                        'text': 'N/A',
                        'value': 'null',
                    },
                ],
                rangeMaps=[
                    {
                        'from': 'null',
                        'text': 'N/A',
                        'to': 'null',
                    },
                ],
                colors=[
                    (245, 54, 54, 0.9),
                    (237, 129, 40, 0.89),
                    (50, 172, 45, 0.97),
                ],
                sparkline=SparkLine(
                    fillColor=(31, 118, 189, 0.18),
                    lineColor=(31, 120, 193),
                    show=True,
                ),
                targets=[
                    {
                        'expr': 'sum(rate(container_cpu_usage_seconds_total'
                                '{namespace=\"$statefulset_namespace\",pod_name=~\"'
                                '$statefulset_name.*\"}[3m]))',
                    },
                ],
            ),
            SingleStat(
                title='Memory',
                id=9,
                postfix='GB',
                prefixFontSize='80%',
                gauge=Gauge(show=False),
                span=4,
                valueFontSize='110%',
                mappingType=1,
                mappingTypes=[
                    {
                        'name': 'value to text',
                        'value': 1,
                    },
                    {
                        'name': 'range to text',
                        'value': 2,
                    },
                ],
                sparkline=SparkLine(
                    fillColor=(31, 118, 189, 0.18),
                    lineColor=(31, 120, 193),
                    show=True,
                ),
                valueMaps=[
                    {
                        'op': '=',
                        'text': 'N/A',
                        'value': 'null',
                    },
                ],
                rangeMaps=[
                    {
                        'from': 'null',
                        'text': 'N/A',
                        'to': 'null',
                    },
                ],
                colors=[
                    (245, 54, 54, 0.9),
                    (237, 129, 40, 0.89),
                    (50, 172, 45, 0.97),
                ],
                targets=[
                    {
                        'expr': 'sum(container_memory_usage_bytes{namespace='
                                '\"$statefulset_namespace\",pod_name=~\"$'
                                'statefulset_name.*\"}) / 1024^3',
                        'intervalFactor': 2,
                        'refId': 'A',
                        'step': 600,
                    },
                ],
            ),
            SingleStat(
                title='Network',
                format='Bps',
                gauge=Gauge(thresholdMarkers=False),
                id=7,
                postfix='',
                span=4,
                mappingType=1,
                mappingTypes=[
                    {
                        'name': 'value to text',
                        'value': 1,
                    },
                    {
                        'name': 'range to text',
                        'value': 2,
                    },
                ],
                sparkline=SparkLine(
                    fillColor=(31, 118, 189, 0.18),
                    lineColor=(31, 120, 193),
                    show=True,
                ),
                valueMaps=[
                    {
                        'op': '=',
                        'text': 'N/A',
                        'value': 'null',
                    },
                ],
                rangeMaps=[
                    {
                        'from': 'null',
                        'text': 'N/A',
                        'to': 'null',
                    },
                ],
                colors=[
                    (245, 54, 54, 0.9),
                    (237, 129, 40, 0.89),
                    (50, 172, 45, 0.97),
                ],
                targets=[
                    {
                        'expr': 'sum(rate(container_network_transmit_'
                                'bytes_total'
                                '{namespace=\"$statefulset_namespace\",pod_name=~\"'
                                '$statefulset_name.*\"}[3m])) + '
                                'sum(rate(container_network_receive_bytes_total'
                                '{namespace=\"$statefulset_namespace\",pod_name=~'
                                '\"$statefulset_name.*\"}[3m]))',
                    },
                ],
            ),
        ],
            height=200,
        ),
        Row(
            height=100, panels=[
                SingleStat(
                    title='Desired Replicas',
                    id=5,
                    mappingType=1,
                    mappingTypes=[
                        {
                            'name': 'value to text',
                            'value': 1,
                        },
                        {
                            'name': 'range to text',
                            'value': 2,
                        },
                    ],
                    span=3,
                    colors=[
                        (245, 54, 54, 0.9),
                        (237, 129, 40, 0.89),
                        (50, 172, 45, 0.97),
                    ],
                    targets=[
                        {
                            'metric': 'kube_statefulset_replicas',
                            'expr': 'max(kube_statefulset_replicas'
                                    '{statefulset="$statefulset_name",namespace='
                                    '"$statefulset_namespace"}) without '
                                    '(instance, pod)',
                        },
                    ],
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        },
                    ],
                    gauge=Gauge(thresholdMarkers=False, show=False),
                    rangeMaps=[
                        {
                            'from': 'null',
                            'text': 'N/A',
                            'to': 'null',
                        },
                    ],
                ),
                SingleStat(
                    title='Available Replicas',
                    colors=[
                        (245, 54, 54, 0.9),
                        (237, 129, 40, 0.89),
                        (50, 172, 45, 0.97),
                    ],
                    gauge=Gauge(show=False),
                    id=6,
                    mappingType=1,
                    mappingTypes=[
                        {
                            'name': 'value to text',
                            'value': 1,
                        },
                        {
                            'name': 'range to text',
                            'value': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'min(kube_statefulset_status_replicas'
                                    '{statefulset=\"$statefulset_name\",'
                                    'namespace=\"$statefulset_namespace\"}) without '
                                    '(instance, pod)',
                        },
                    ],
                    rangeMaps=[
                        {
                            'from': 'null',
                            'text': 'N/A',
                            'to': 'null',
                        },
                    ],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        }
                    ],
                ),
                SingleStat(
                    title='Observed Generation',
                    colors=[
                        (245, 54, 54, 0.9),
                        (237, 129, 40, 0.89),
                        (50, 172, 45, 0.97),
                    ],
                    gauge=Gauge(),
                    id=3,
                    mappingType=1,
                    mappingTypes=[
                        {
                            'name': 'value to text',
                            'value': 1,
                        },
                        {
                            'name': 'range to text',
                            'value': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'max(kube_statefulset_status_observed_'
                                    'generation{statefulset=\"$statefulset_name\",'
                                    'namespace=\"$statefulset_namespace\"}) without '
                                    '(instance, pod)',
                        },
                    ],
                    rangeMaps=[
                        {
                            'from': "null",
                            'text': 'N/A',
                            'to': 'null',
                        },
                    ],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        }
                    ],
                ),
                SingleStat(
                    title='Metadata Generation',
                    colors=[
                        (245, 54, 54, 0.9),
                        (237, 129, 40, 0.89),
                        (50, 172, 45, 0.97),
                    ],
                    gauge=Gauge(show=False),
                    id=2,
                    mappingType=1,
                    mappingTypes=[
                        {
                            'name': 'value to text',
                            'value': 1,
                        },
                        {
                            'name': 'range to text',
                            'value': 2,
                        },
                    ],
                    targets=[
                        {
                            'expr': 'max(kube_statefulset_metadata_generation'
                                    '{statefulset=\"$statefulset_name\",namespace=\"'
                                    '$statefulset_namespace\"}) without (instance, '
                                    'pod)',
                        },
                    ],
                    rangeMaps=[
                        {
                            'from': 'null',
                            'text': 'N/A',
                            'to': 'null',
                        },
                    ],
                    span=3,
                    sparkline=SparkLine(),
                    valueMaps=[
                        {
                            'op': '=',
                            'text': 'N/A',
                            'value': 'null',
                        },
                    ],
                ),
            ],
        ),
        Row(
            height=350, panels=[
                Graph(
                    title='Replicas',
                    dashLength=10,
                    dashes=False,
                    id=1,
                    spaceLength=10,
                    targets=[
                        {
                            'expr': 'min(kube_statefulset_status_replicas'
                                    '{statefulset=\"$statefulset_name\",'
                                    'namespace=\"$statefulset_namespace\"}) without '
                                    '(instance, pod)',
                            'legendFormat': 'available',
                            'refId': 'B',
                            'step': 30,
                        },
                        {
                            'expr': 'max(kube_statefulset_replicas'
                                    '{statefulset=\"$statefulset_name\",namespace=\"'
                                    '$statefulset_namespace\"}) without '
                                    '(instance, pod)',
                            'legendFormat': 'desired',
                            'refId': 'E',
                            'step': 30,
                        }
                    ],
                    xAxis=XAxis(mode='time'),
                    yAxes=YAxes(
                        YAxis(min=None),
                        YAxis(format='short', min=None, show=False),
                    ),
                ),
            ]
        ),
    ],
)
@@ -1,33 +0,0 @@
groups:
- name: alertmanager.rules
  rules:
  - alert: AlertmanagerConfigInconsistent
    expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
      GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
      "alertmanager-$1", "alertmanager", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: The configuration of the instances of the Alertmanager cluster
        `{{$labels.service}}` are out of sync.
      summary: Configuration out of sync
  - alert: AlertmanagerDownOrMissing
    expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
      "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: An unexpected number of Alertmanagers are scraped or Alertmanagers
        disappeared from discovery.
      summary: Alertmanager down or missing
  - alert: AlertmanagerFailedReload
    expr: alertmanager_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
        }}/{{ $labels.pod}}.
      summary: Alertmanager's configuration reload failed
@@ -1,123 +0,0 @@
groups:
- name: ./etcd3.rules
  rules:
  - alert: InsufficientMembers
    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
    for: 3m
    labels:
      severity: critical
    annotations:
      description: If one more etcd member goes down the cluster will be unavailable
      summary: etcd cluster insufficient members
  - alert: NoLeader
    expr: etcd_server_has_leader{job="etcd"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: etcd member {{ $labels.instance }} has no leader
      summary: etcd member has no leader
  - alert: HighNumberOfLeaderChanges
    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
        changes within the last hour
      summary: a high number of leader changes within the etcd cluster are happening
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: HighNumberOfFailedGRPCRequests
    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}'
      summary: a high number of gRPC requests are failing
  - alert: GRPCRequestsSlow
    expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
        }} are slow
      summary: slow gRPC requests
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HighNumberOfFailedHTTPRequests
    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (method) > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}'
      summary: a high number of HTTP requests are failing
  - alert: HTTPRequestsSlow
    expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
        }} are slow
      summary: slow HTTP requests
  - alert: EtcdMemberCommunicationSlow
    expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
      summary: etcd member communication is slow
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a high number of proposals within the etcd cluster are failing
  - alert: HighFsyncDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} fync durations are high
      summary: high fsync durations
  - alert: HighCommitDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      description: etcd instance {{ $labels.instance }} commit durations are high
      summary: high commit durations
@@ -1,39 +0,0 @@
groups:
- name: general.rules
  rules:
  - alert: TargetDown
    expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $value }}% of {{ $labels.job }} targets are down.'
      summary: Targets are down
  - alert: DeadMansSwitch
    expr: vector(1)
    labels:
      severity: none
    annotations:
      description: This is a DeadMansSwitch meant to ensure that the entire Alerting
        pipeline is functional.
      summary: Alerting DeadMansSwitch
  - record: fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
        will exhaust in file/socket descriptors within the next 4 hours'
      summary: file descriptors soon exhausted
  - alert: FdExhaustionClose
    expr: predict_linear(fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
        will exhaust in file/socket descriptors within the next hour'
      summary: file descriptors soon exhausted
@@ -1,47 +0,0 @@
groups:
- name: node.rules
  rules:
  - record: instance:node_cpu:rate:sum
    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m]))
      BY (instance)
  - record: instance:node_filesystem_usage:sum
    expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
      BY (instance)
  - record: instance:node_network_receive_bytes:rate:sum
    expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
  - record: instance:node_network_transmit_bytes:rate:sum
    expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
  - record: instance:node_cpu:ratio
    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance)
      GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
  - record: cluster:node_cpu:sum_rate5m
    expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
  - record: cluster:node_cpu:ratio
    expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
  - alert: NodeExporterDown
    expr: absent(up{job="node-exporter"} == 1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus could not scrape a node-exporter for more than 10m,
        or node-exporters have disappeared from discovery
      summary: Prometheus could not scrape a node-exporter
  - alert: NodeDiskRunningFull
    expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
    for: 30m
    labels:
      severity: warning
    annotations:
      description: device {{$labels.device}} on node {{$labels.instance}} is running
        full within the next 24 hours (mounted at {{$labels.mountpoint}})
      summary: Node disk is running full within 24 hours
  - alert: NodeDiskRunningFull
    expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
    for: 10m
    labels:
      severity: critical
    annotations:
      description: device {{$labels.device}} on node {{$labels.instance}} is running
        full within the next 2 hours (mounted at {{$labels.mountpoint}})
      summary: Node disk is running full within 2 hours
@@ -1,101 +0,0 @@
groups:
- name: prometheus.rules
  rules:
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      summary: Reloading Promehteus' configuration failed

  - alert: PrometheusNotificationQueueRunningFull
    expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
        $labels.pod}}
      summary: Prometheus' alert notification queue is running full

  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      summary: Errors while sending alert from Prometheus

  - alert: PrometheusErrorSendingAlerts
    expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
      > 0.03
    for: 10m
    labels:
      severity: critical
    annotations:
      description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
        $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      summary: Errors while sending alerts from Prometheus

  - alert: PrometheusNotConnectedToAlertmanagers
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
        to any Alertmanagers
      summary: Prometheus is not connected to any Alertmanagers

  - alert: PrometheusTSDBReloadsFailing
    expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        reload failures over the last four hours.'
      summary: Prometheus has issues reloading data blocks from disk

  - alert: PrometheusTSDBCompactionsFailing
    expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
    for: 12h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
        compaction failures over the last four hours.'
      summary: Prometheus has issues compacting sample blocks

  - alert: PrometheusTSDBWALCorruptions
    expr: tsdb_wal_corruptions_total > 0
    for: 4h
    labels:
      severity: warning
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
        log (WAL).'
      summary: Prometheus write-ahead log is corrupted

  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
      summary: "Prometheus isn't ingesting samples"

  - alert: PrometheusTargetScapesDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values"
      summary: Prometheus has many samples rejected
@@ -0,0 +1,53 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerConfigInconsistent',
            annotations: {
              description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.',
              summary: 'Configuration out of sync',
            },
            expr: |||
              count_values("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'AlertmanagerDownOrMissing',
            annotations: {
              description: 'An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.',
              summary: 'Alertmanager down or missing',
            },
            expr: |||
              label_replace(prometheus_operator_alertmanager_spec_replicas{%(prometheusOperatorSelector)s}, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up{%(alertmanagerSelector)s}) BY (job) != 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'AlertmanagerFailedReload',
            annotations: {
              description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.",
              summary: "Alertmanager's configuration reload failed",
            },
            expr: |||
              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
@@ -0,0 +1,4 @@
(import 'alertmanager.libsonnet') +
(import 'general.libsonnet') +
(import 'node.libsonnet') +
(import 'prometheus.libsonnet')
@@ -0,0 +1,34 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'general.rules',
        rules: [
          {
            alert: 'TargetDown',
            annotations: {
              description: '{{ $value }}% of {{ $labels.job }} targets are down.',
              summary: 'Targets are down',
            },
            expr: '100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10',
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'DeadMansSwitch',
            annotations: {
              description: 'This is a DeadMansSwitch meant to ensure that the entire Alerting pipeline is functional.',
              summary: 'Alerting DeadMansSwitch',
            },
            expr: 'vector(1)',
            labels: {
              severity: 'none',
            },
          },
        ],
      },
    ],
  },
}
@@ -0,0 +1,39 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'kube-prometheus-node-alerting.rules',
        rules: [
          {
            alert: 'NodeDiskRunningFull',
            annotations: {
              description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}})',
              summary: 'Node disk is running full within 24 hours',
            },
            expr: |||
              predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[6h], 3600 * 24) < 0
            ||| % $._config,
            'for': '30m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'NodeDiskRunningFull',
            annotations: {
              description: 'device {{$labels.device}} on node {{$labels.instance}} is running full within the next 2 hours (mounted at {{$labels.mountpoint}})',
              summary: 'Node disk is running full within 2 hours',
            },
            expr: |||
              predict_linear(node_filesystem_free{%(nodeExporterSelector)s}[30m], 3600 * 2) < 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
        ],
      },
    ],
  },
}
@@ -0,0 +1,151 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus.rules',
        rules: [
          {
            alert: 'PrometheusConfigReloadFailed',
            annotations: {
              description: "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
              summary: "Reloading Promehteus' configuration failed",
            },
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            annotations: {
              description: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
              summary: "Prometheus' alert notification queue is running full",
            },
            expr: |||
              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > prometheus_notifications_queue_capacity{%(prometheusSelector)s}
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}',
              summary: 'Errors while sending alert from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.01
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusErrorSendingAlerts',
            annotations: {
              description: 'Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}',
              summary: 'Errors while sending alerts from Prometheus',
            },
            expr: |||
              rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 0.03
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            annotations: {
              description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
              summary: 'Prometheus is not connected to any Alertmanagers',
            },
            expr: |||
              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBReloadsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.',
              summary: 'Prometheus has issues reloading data blocks from disk',
            },
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.',
              summary: 'Prometheus has issues compacting sample blocks',
            },
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTSDBWALCorruptions',
            annotations: {
              description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
              summary: 'Prometheus write-ahead log is corrupted',
            },
            expr: |||
              tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusNotIngestingSamples',
            annotations: {
              description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
              summary: "Prometheus isn't ingesting samples",
            },
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'PrometheusTargetScapesDuplicate',
            annotations: {
              description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
              summary: 'Prometheus has many samples rejected',
            },
            expr: |||
              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
@@ -6,7 +6,9 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
 (import 'alertmanager/alertmanager.libsonnet') +
 (import 'prometheus-operator/prometheus-operator.libsonnet') +
 (import 'prometheus/prometheus.libsonnet') +
-(import 'kubernetes-mixin/mixin.libsonnet') + {
+(import 'kubernetes-mixin/mixin.libsonnet') +
+(import 'alerts/alerts.libsonnet') +
+(import 'rules/rules.libsonnet') + {
   kubePrometheus+:: {
     namespace: k.core.v1.namespace.new($._config.namespace),
   },

@@ -14,11 +16,31 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
   _config+:: {
     namespace: 'default',

-    kubeStateMetricsSelector: 'job="kube-state-metrics"',
     cadvisorSelector: 'job="kubelet"',
-    nodeExporterSelector: 'job="node-exporter"',
     kubeletSelector: 'job="kubelet"',
+    kubeStateMetricsSelector: 'job="kube-state-metrics"',
+    nodeExporterSelector: 'job="node-exporter"',
+    notKubeDnsSelector: 'job!="kube-dns"',
+    kubeSchedulerSelector: 'job="kube-scheduler"',
+    kubeControllerManagerSelector: 'job="kube-controller-manager"',
+    kubeApiserverSelector: 'job="apiserver"',
+    podLabel: 'pod',
+
+    alertmanagerSelector: 'job="alertmanager-main"',
+    prometheusSelector: 'job="prometheus-k8s"',
+    prometheusOperatorSelector: 'job="prometheus-operator"',
+
+    jobs: {
+      Kubelet: $._config.kubeletSelector,
+      KubeScheduler: $._config.kubeSchedulerSelector,
+      KubeControllerManager: $._config.kubeControllerManagerSelector,
+      KubeAPI: $._config.kubeApiserverSelector,
+      KubeStateMetrics: $._config.kubeStateMetricsSelector,
+      NodeExporter: $._config.nodeExporterSelector,
+      Alertmanager: $._config.alertmanagerSelector,
+      Prometheus: $._config.prometheusSelector,
+      PrometheusOperator: $._config.prometheusOperatorSelector,
+    },
+
     prometheus+:: {
       rules: $.prometheusRules + $.prometheusAlerts,
@@ -0,0 +1,39 @@
{
  prometheusRules+:: {
    groups+: [
      {
        name: 'kube-prometheus-node-recording.rules',
        rules: [
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
            record: 'instance:node_cpu:rate:sum',
          },
          {
            expr: 'sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)',
            record: 'instance:node_filesystem_usage:sum',
          },
          {
            expr: 'sum(rate(node_network_receive_bytes[3m])) BY (instance)',
            record: 'instance:node_network_receive_bytes:rate:sum',
          },
          {
            expr: 'sum(rate(node_network_transmit_bytes[3m])) BY (instance)',
            record: 'instance:node_network_transmit_bytes:rate:sum',
          },
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)',
            record: 'instance:node_cpu:ratio',
          },
          {
            expr: 'sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))',
            record: 'cluster:node_cpu:sum_rate5m',
          },
          {
            expr: 'cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))',
            record: 'cluster:node_cpu:ratio',
          },
        ],
      },
    ],
  },
}
@@ -3868,7 +3868,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\",pod_name=\"$pod\"}[1m])) by (container_name)",
+              "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[1m])) by (container_name)",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{container_name}}",

@@ -4097,7 +4097,7 @@ data:
           ],
           "targets": [
             {
-              "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)",
+              "expr": "sum(label_replace(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}[5m]), \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)",
               "format": "table",
               "instant": true,
               "intervalFactor": 2,

@@ -4228,7 +4228,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}) by (container_name)",
+              "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{container_name}}",

@@ -4457,7 +4457,7 @@ data:
           ],
           "targets": [
             {
-              "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)",
+              "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)",
               "format": "table",
               "instant": true,
               "intervalFactor": 2,

@@ -5003,7 +5003,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "rgba(245, 54, 54, 0.9)"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "percent",
           "gauge": {
             "maxValue": 100,

@@ -5206,7 +5206,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "rgba(245, 54, 54, 0.9)"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "percent",
           "gauge": {
             "maxValue": 100,

@@ -6066,7 +6066,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6145,7 +6145,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6224,7 +6224,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6317,7 +6317,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6397,7 +6397,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6477,7 +6477,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,

@@ -6557,7 +6557,7 @@ data:
             "rgba(237, 129, 40, 0.89)",
             "#d44a3a"
           ],
-          "datasource": "prometheus",
+          "datasource": "$datasource",
           "format": "none",
           "gauge": {
             "maxValue": 100,
@@ -49,13 +49,13 @@ data:
 without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n
 \ \"record\": \"cluster_quantile:scheduler_binding_latency:histogram_quantile\"\n-
 \"name\": \"kube-apiserver.rules\"\n \"rules\": \n - \"expr\": |\n histogram_quantile(0.99,
-sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m])) without(instance,
+sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m])) without(instance,
 pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.99\"\n \"record\":
 \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n - \"expr\":
-|\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m]))
+|\n histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m]))
 without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.9\"\n
 \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n
-\ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"kube-apiserver\"}[5m]))
+\ - \"expr\": |\n histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job=\"apiserver\"}[5m]))
 without(instance, pod)) / 1e+06\n \"labels\": \n \"quantile\": \"0.5\"\n
 \ \"record\": \"cluster_quantile:apiserver_request_latencies:histogram_quantile\"\n-
 \"name\": \"node.rules\"\n \"rules\": \n - \"expr\": \"sum(min(kube_pod_info)
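De-escaped from the block-scalar YAML above, the rules touched by this hunk are the apiserver request-latency quantile recordings; only the job selector changes, from job="kube-apiserver" to job="apiserver". The 0.99-quantile rule, reconstructed for readability:

    # apiserver_request_latencies_bucket is exported in microseconds,
    # so dividing by 1e+06 records the quantile in seconds.
    - record: cluster_quantile:apiserver_request_latencies:histogram_quantile
      labels:
        quantile: "0.99"
      expr: |
        histogram_quantile(0.99,
          sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)
        ) / 1e+06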
@@ -122,20 +122,49 @@ data:
by (node) (\n (irate(node_network_receive_drop{job=\"node-exporter\",device=\"eth0\"}[1m])
+\n irate(node_network_transmit_drop{job=\"node-exporter\",device=\"eth0\"}[1m]))\n
\ * on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n
\ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kubernetes-absent\"\n
\ \"rules\": \n - \"alert\": \"KubeAPIDown\"\n \"annotations\": \n \"message\":
\"KubeAPI has disappeared from Prometheus target discovery.\"\n \"expr\": |\n
\ absent(up{job=\"kube-apiserver\"} == 1)\n \"for\": \"15m\"\n \"labels\":
\n \"severity\": \"critical\"\n - \"alert\": \"KubeControllerManagerDown\"\n
\ \"annotations\": \n \"message\": \"KubeControllerManager has disappeared
from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-controller-manager\"}
\ )\n \"record\": \"node:node_net_saturation:sum_irate\"\n- \"name\": \"kube-prometheus-node-recording.rules\"\n
\ \"rules\": \n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[3m]))
BY (instance)\"\n \"record\": \"instance:node_cpu:rate:sum\"\n - \"expr\":
\"sum((node_filesystem_size{mountpoint=\\\"/\\\"} - node_filesystem_free{mountpoint=\\\"/\\\"}))
BY (instance)\"\n \"record\": \"instance:node_filesystem_usage:sum\"\n - \"expr\":
\"sum(rate(node_network_receive_bytes[3m])) BY (instance)\"\n \"record\": \"instance:node_network_receive_bytes:rate:sum\"\n
\ - \"expr\": \"sum(rate(node_network_transmit_bytes[3m])) BY (instance)\"\n \"record\":
\"instance:node_network_transmit_bytes:rate:sum\"\n - \"expr\": \"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance,
cpu)) BY (instance)\"\n \"record\": \"instance:node_cpu:ratio\"\n - \"expr\":
\"sum(rate(node_cpu{mode!=\\\"idle\\\",mode!=\\\"iowait\\\"}[5m]))\"\n \"record\":
\"cluster:node_cpu:sum_rate5m\"\n - \"expr\": \"cluster:node_cpu:rate5m / count(sum(node_cpu)
BY (instance, cpu))\"\n \"record\": \"cluster:node_cpu:ratio\"\n- \"name\":
\"kubernetes-absent\"\n \"rules\": \n - \"alert\": \"AlertmanagerDown\"\n \"annotations\":
\n \"message\": \"Alertmanager has disappeared from Prometheus target discovery.\"\n
\ \"expr\": |\n absent(up{job=\"alertmanager-main\"} == 1)\n \"for\":
\"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeAPIDown\"\n
\ \"annotations\": \n \"message\": \"KubeAPI has disappeared from Prometheus
target discovery.\"\n \"expr\": |\n absent(up{job=\"apiserver\"} == 1)\n
\ \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n -
\"alert\": \"KubeControllerManagerDown\"\n \"annotations\": \n \"message\":
\"KubeControllerManager has disappeared from Prometheus target discovery.\"\n
\ \"expr\": |\n absent(up{job=\"kube-controller-manager\"} == 1)\n \"for\":
\"15m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"KubeSchedulerDown\"\n
\ \"annotations\": \n \"message\": \"KubeScheduler has disappeared from
Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"kube-scheduler\"}
== 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n
\ - \"alert\": \"KubeSchedulerDown\"\n \"annotations\": \n \"message\":
\"KubeScheduler has disappeared from Prometheus target discovery.\"\n \"expr\":
|\n absent(up{job=\"kube-scheduler\"} == 1)\n \"for\": \"15m\"\n \"labels\":
\ - \"alert\": \"KubeStateMetricsDown\"\n \"annotations\": \n \"message\":
\"KubeStateMetrics has disappeared from Prometheus target discovery.\"\n \"expr\":
|\n absent(up{job=\"kube-state-metrics\"} == 1)\n \"for\": \"15m\"\n \"labels\":
\n \"severity\": \"critical\"\n - \"alert\": \"KubeletDown\"\n \"annotations\":
\n \"message\": \"Kubelet has disappeared from Prometheus target discovery.\"\n
\ \"expr\": |\n absent(up{job=\"kubelet\"} == 1)\n \"for\": \"15m\"\n
\ \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"NodeExporterDown\"\n
\ \"annotations\": \n \"message\": \"NodeExporter has disappeared from
Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"node-exporter\"}
== 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n
\ - \"alert\": \"PrometheusDown\"\n \"annotations\": \n \"message\": \"Prometheus
has disappeared from Prometheus target discovery.\"\n \"expr\": |\n absent(up{job=\"prometheus-k8s\"}
== 1)\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"critical\"\n
\ - \"alert\": \"PrometheusOperatorDown\"\n \"annotations\": \n \"message\":
\"PrometheusOperator has disappeared from Prometheus target discovery.\"\n \"expr\":
|\n absent(up{job=\"prometheus-operator\"} == 1)\n \"for\": \"15m\"\n
\ \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"kubernetes-apps\"\n
\ \"rules\": \n - \"alert\": \"KubePodCrashLooping\"\n \"annotations\": \n
\ \"message\": \"{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
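Every alert in the kubernetes-absent group rewritten above follows one pattern per scraped component: fire when absent(up{job="..."} == 1) holds for 15 minutes, i.e. when an entire job vanishes from Prometheus target discovery. De-escaped, the newly added AlertmanagerDown rule from the block above reads:

    - alert: AlertmanagerDown
      annotations:
        message: Alertmanager has disappeared from Prometheus target discovery.
      expr: |
        absent(up{job="alertmanager-main"} == 1)
      for: 15m
      labels:
        severity: critical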
@@ -239,28 +268,116 @@ data:
 100\n \"for\": \"15m\"\n \"labels\": \n \"severity\": \"warning\"\n
 \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\":
 \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}}
-{{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"}
+{{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"}
 > 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n
 \ - \"alert\": \"KubeAPILatencyHigh\"\n \"annotations\": \n \"message\":
 \"The API server has a 99th percentile latency of {{ $value }} seconds for {{$labels.verb}}
-{{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"kube-apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"}
+{{$labels.resource}}.\"\n \"expr\": |\n cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$\"}
 > 4\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n
 \ - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\":
-\"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m]))
-without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m]))
+\"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m]))
+without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m]))
 without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\":
 \"critical\"\n - \"alert\": \"KubeAPIErrorsHigh\"\n \"annotations\": \n \"message\":
-\"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"kube-apiserver\",code=~\"^(?:5..)$\"}[5m]))
-without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"kube-apiserver\"}[5m]))
+\"API server is erroring for {{ $value }}% of requests.\"\n \"expr\": |\n sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m]))
+without(instance, pod)\n /\n sum(rate(apiserver_request_count{job=\"apiserver\"}[5m]))
 without(instance, pod) * 100 > 5\n \"for\": \"10m\"\n \"labels\": \n \"severity\":
 \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n \"annotations\":
 \n \"message\": \"Kubernetes API certificate is expiring in less than 7 days.\"\n
-\ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m])))
+\ \"expr\": |\n histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m])))
 < 604800\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"KubeClientCertificateExpiration\"\n
 \ \"annotations\": \n \"message\": \"Kubernetes API certificate is expiring
 in less than 1 day.\"\n \"expr\": |\n histogram_quantile(0.01, sum by
-(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m])))
-< 86400\n \"labels\": \n \"severity\": \"critical\""
+(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m])))
+< 86400\n \"labels\": \n \"severity\": \"critical\"\n- \"name\": \"alertmanager.rules\"\n
+\ \"rules\": \n - \"alert\": \"AlertmanagerConfigInconsistent\"\n \"annotations\":
+\n \"description\": \"The configuration of the instances of the Alertmanager
+cluster `{{$labels.service}}` are out of sync.\"\n \"summary\": \"Configuration
+out of sync\"\n \"expr\": |\n count_values(\"config_hash\", alertmanager_config_hash{job=\"alertmanager-main\"})
+BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"},
+\"service\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") != 1\n \"for\":
+\"5m\"\n \"labels\": \n \"severity\": \"critical\"\n - \"alert\": \"AlertmanagerDownOrMissing\"\n
+\ \"annotations\": \n \"description\": \"An unexpected number of Alertmanagers
+are scraped or Alertmanagers disappeared from discovery.\"\n \"summary\":
+\"Alertmanager down or missing\"\n \"expr\": |\n label_replace(prometheus_operator_alertmanager_spec_replicas{job=\"prometheus-operator\"},
+\"job\", \"alertmanager-$1\", \"alertmanager\", \"(.*)\") / ON(job) GROUP_RIGHT()
+sum(up{job=\"alertmanager-main\"}) BY (job) != 1\n \"for\": \"5m\"\n \"labels\":
+\n \"severity\": \"warning\"\n - \"alert\": \"AlertmanagerFailedReload\"\n
+\ \"annotations\": \n \"description\": \"Reloading Alertmanager's configuration
+has failed for {{ $labels.namespace }}/{{ $labels.pod}}.\"\n \"summary\":
+\"Alertmanager's configuration reload failed\"\n \"expr\": |\n alertmanager_config_last_reload_successful{job=\"alertmanager-main\"}
+== 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n-
+\"name\": \"general.rules\"\n \"rules\": \n - \"alert\": \"TargetDown\"\n \"annotations\":
+\n \"description\": \"{{ $value }}% of {{ $labels.job }} targets are down.\"\n
+\ \"summary\": \"Targets are down\"\n \"expr\": \"100 * (count(up == 0)
+BY (job) / count(up) BY (job)) > 10\"\n \"for\": \"10m\"\n \"labels\": \n
+\ \"severity\": \"warning\"\n - \"alert\": \"DeadMansSwitch\"\n \"annotations\":
+\n \"description\": \"This is a DeadMansSwitch meant to ensure that the entire
+Alerting pipeline is functional.\"\n \"summary\": \"Alerting DeadMansSwitch\"\n
+\ \"expr\": \"vector(1)\"\n \"labels\": \n \"severity\": \"none\"\n-
+\"name\": \"kube-prometheus-node-alerting.rules\"\n \"rules\": \n - \"alert\":
+\"NodeDiskRunningFull\"\n \"annotations\": \n \"description\": \"device
+{{$labels.device}} on node {{$labels.instance}} is running full within the next
+24 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node disk
+is running full within 24 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[6h],
+3600 * 24) < 0\n \"for\": \"30m\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"NodeDiskRunningFull\"\n \"annotations\": \n \"description\":
+\"device {{$labels.device}} on node {{$labels.instance}} is running full within
+the next 2 hours (mounted at {{$labels.mountpoint}})\"\n \"summary\": \"Node
+disk is running full within 2 hours\"\n \"expr\": |\n predict_linear(node_filesystem_free{job=\"node-exporter\"}[30m],
+3600 * 2) < 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n-
+\"name\": \"prometheus.rules\"\n \"rules\": \n - \"alert\": \"PrometheusConfigReloadFailed\"\n
+\ \"annotations\": \n \"description\": \"Reloading Prometheus' configuration
+has failed for {{$labels.namespace}}/{{$labels.pod}}\"\n \"summary\": \"Reloading
+Promehteus' configuration failed\"\n \"expr\": |\n prometheus_config_last_reload_successful{job=\"prometheus-k8s\"}
+== 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"PrometheusNotificationQueueRunningFull\"\n \"annotations\":
+\n \"description\": \"Prometheus' alert notification queue is running full
+for {{$labels.namespace}}/{{ $labels.pod}}\"\n \"summary\": \"Prometheus'
+alert notification queue is running full\"\n \"expr\": |\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\"}[5m],
+60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus-k8s\"}\n \"for\":
+\"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusErrorSendingAlerts\"\n
+\ \"annotations\": \n \"description\": \"Errors while sending alerts from
+Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}\"\n
+\ \"summary\": \"Errors while sending alert from Prometheus\"\n \"expr\":
+|\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m])
+/ rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.01\n
+\ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\":
+\"PrometheusErrorSendingAlerts\"\n \"annotations\": \n \"description\":
+\"Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}}
+to Alertmanager {{$labels.Alertmanager}}\"\n \"summary\": \"Errors while
+sending alerts from Prometheus\"\n \"expr\": |\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\"}[5m])
+/ rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\"}[5m]) > 0.03\n
+\ \"for\": \"10m\"\n \"labels\": \n \"severity\": \"critical\"\n -
+\"alert\": \"PrometheusNotConnectedToAlertmanagers\"\n \"annotations\": \n
+\ \"description\": \"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is
+not connected to any Alertmanagers\"\n \"summary\": \"Prometheus is not connected
+to any Alertmanagers\"\n \"expr\": |\n prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\"}
+< 1\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"PrometheusTSDBReloadsFailing\"\n \"annotations\": \n \"description\":
+\"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures
+over the last four hours.\"\n \"summary\": \"Prometheus has issues reloading
+data blocks from disk\"\n \"expr\": |\n increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\"}[2h])
+> 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"PrometheusTSDBCompactionsFailing\"\n \"annotations\": \n \"description\":
+\"{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction
+failures over the last four hours.\"\n \"summary\": \"Prometheus has issues
+compacting sample blocks\"\n \"expr\": |\n increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\"}[2h])
+> 0\n \"for\": \"12h\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"PrometheusTSDBWALCorruptions\"\n \"annotations\": \n \"description\":
+\"{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).\"\n
+\ \"summary\": \"Prometheus write-ahead log is corrupted\"\n \"expr\":
+|\n tsdb_wal_corruptions_total{job=\"prometheus-k8s\"} > 0\n \"for\":
+\"4h\"\n \"labels\": \n \"severity\": \"warning\"\n - \"alert\": \"PrometheusNotIngestingSamples\"\n
+\ \"annotations\": \n \"description\": \"Prometheus {{ $labels.namespace
+}}/{{ $labels.pod}} isn't ingesting samples.\"\n \"summary\": \"Prometheus
+isn't ingesting samples\"\n \"expr\": |\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\"}[5m])
+<= 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\"\n
+\ - \"alert\": \"PrometheusTargetScapesDuplicate\"\n \"annotations\": \n \"description\":
+\"{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate
+timestamps but different values\"\n \"summary\": \"Prometheus has many samples
+rejected\"\n \"expr\": |\n increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\"}[5m])
+> 0\n \"for\": \"10m\"\n \"labels\": \n \"severity\": \"warning\""
 kind: ConfigMap
 metadata:
 labels:
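Among the rule groups folded into this file above, general.rules carries the DeadMansSwitch: vector(1) always evaluates to 1, so the alert fires permanently, and a downstream receiver that stops seeing it knows the alerting pipeline is broken end to end. De-escaped from the block above:

    - alert: DeadMansSwitch
      annotations:
        description: This is a DeadMansSwitch meant to ensure that the entire
          Alerting pipeline is functional.
        summary: Alerting DeadMansSwitch
      expr: vector(1)
      labels:
        severity: none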