mirror of
https://github.com/monitoring-mixins/website.git
synced 2024-12-14 11:37:31 +00:00
move site from https://github.com/cloudalchemy/mixins to final destination
This commit is contained in:
commit
7ec62a7e5a
85 changed files with 60052 additions and 0 deletions
14
.drone.yml
Normal file
14
.drone.yml
Normal file
|
@ -0,0 +1,14 @@
|
|||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
|
||||
steps:
|
||||
- name: regenerate
|
||||
image: golang:1.14
|
||||
commands:
|
||||
- apt update && apt install jq -y
|
||||
- GO111MODULE=off go get -u github.com/myitcv/gobin
|
||||
- gobin github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@efe0c9e864431e93d5c3376bd5931d0fb9b2a296
|
||||
- gobin github.com/brancz/gojsontoyaml
|
||||
- gobin github.com/google/go-jsonnet/cmd/jsonnet
|
||||
- ./generate.sh && git diff --exit-code
|
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
tmp/
|
||||
site/public
|
||||
site/resources
|
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
[submodule "site/themes/ace-documentation"]
|
||||
path = site/themes/ace-documentation
|
||||
url = https://github.com/vantagedesign/ace-documentation.git
|
13
.travis.yml
Normal file
13
.travis.yml
Normal file
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
os: linux
|
||||
dist: xenial
|
||||
language: golang
|
||||
go:
|
||||
- 1.13
|
||||
install:
|
||||
- GO111MODULE=off go get -u github.com/myitcv/gobin
|
||||
- gobin github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@efe0c9e864431e93d5c3376bd5931d0fb9b2a296
|
||||
- gobin github.com/brancz/gojsontoyaml
|
||||
- gobin github.com/google/go-jsonnet/cmd/jsonnet
|
||||
script:
|
||||
- ./generate.sh && git diff --exit-code
|
201
LICENSE
Normal file
201
LICENSE
Normal file
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2020 mixtool authors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
15
README.md
Normal file
15
README.md
Normal file
|
@ -0,0 +1,15 @@
|
|||
# Monitoring Mixins site
|
||||
|
||||
## Adding new mixin
|
||||
|
||||
0. Install [required software](#requirements)
|
||||
1. Add new mixin to [mixins.yaml](mixins.yaml) file
|
||||
2. Run `generate.sh`
|
||||
|
||||
## Requirements
|
||||
|
||||
- jq
|
||||
- [jsonnet](https://github.com/google/go-jsonnet)
|
||||
- [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler)@efe0c9e864431e93d5c3376bd5931d0fb9b2a296
|
||||
- [gojsontoyaml](https://github.com/brancz/gojsontoyaml)
|
||||
|
213
assets/ceph/alerts.yaml
Normal file
213
assets/ceph/alerts.yaml
Normal file
|
@ -0,0 +1,213 @@
|
|||
groups:
|
||||
- name: ceph-mgr-status
|
||||
rules:
|
||||
- alert: CephMgrIsAbsent
|
||||
annotations:
|
||||
description: Ceph Manager has disappeared from Prometheus target discovery.
|
||||
message: Storage metrics collector service not available anymore.
|
||||
severity_level: critical
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
absent(up{job="rook-ceph-mgr"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephMgrIsMissingReplicas
|
||||
annotations:
|
||||
description: Ceph Manager is missing replicas.
|
||||
message: Storage metrics collector service doesn't have required no of replicas.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(up{job="rook-ceph-mgr"}) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: ceph-mds-status
|
||||
rules:
|
||||
- alert: CephMdsMissingReplicas
|
||||
annotations:
|
||||
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
|
||||
message: Insufficient replicas for storage metadata service.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: quorum-alert.rules
|
||||
rules:
|
||||
- alert: CephMonQuorumAtRisk
|
||||
annotations:
|
||||
description: Storage cluster quorum is low. Contact Support.
|
||||
message: Storage quorum at risk
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= ((count(ceph_mon_metadata{job="rook-ceph-mgr"}) % 2) + 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephMonHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
||||
message: Storage Cluster has seen many leader changes recently.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: ceph-node-alert.rules
|
||||
rules:
|
||||
- alert: CephNodeDown
|
||||
annotations:
|
||||
description: Storage node {{ $labels.node }} went down. Please check the node immediately.
|
||||
message: Storage node {{ $labels.node }} went down
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
cluster:ceph_node_down:join_kube == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
- name: osd-alert.rules
|
||||
rules:
|
||||
- alert: CephOSDCriticallyFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
|
||||
message: Back-end storage device is critically full.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.85
|
||||
for: 40s
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephOSDNearFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
|
||||
message: Back-end storage device is nearing full.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
|
||||
for: 40s
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CephOSDDiskNotResponding
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
|
||||
message: Disk not responding
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephOSDDiskUnavailable
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
|
||||
message: Disk not accessible
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephDataRecoveryTakingTooLong
|
||||
annotations:
|
||||
description: Data recovery has been active for too long. Contact Support.
|
||||
message: Data recovery is slow
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_pg_undersized > 0
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CephPGRepairTakingTooLong
|
||||
annotations:
|
||||
description: Self heal operations taking too long. Contact Support.
|
||||
message: Self heal problems detected
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_pg_inconsistent > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- name: cluster-state-alert.rules
|
||||
rules:
|
||||
- alert: CephClusterErrorState
|
||||
annotations:
|
||||
description: Storage cluster is in error state for more than 10m.
|
||||
message: Storage cluster is in error state
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_health_status{job="rook-ceph-mgr"} > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: CephClusterWarningState
|
||||
annotations:
|
||||
description: Storage cluster is in warning state for more than 10m.
|
||||
message: Storage cluster is in degraded state
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_health_status{job="rook-ceph-mgr"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CephOSDVersionMismatch
|
||||
annotations:
|
||||
description: There are {{ $value }} different versions of Ceph OSD components running.
|
||||
message: There are multiple versions of storage services running.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CephMonVersionMismatch
|
||||
annotations:
|
||||
description: There are {{ $value }} different versions of Ceph Mon components running.
|
||||
message: There are multiple versions of storage services running.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: cluster-utilization-alert.rules
|
||||
rules:
|
||||
- alert: CephClusterNearFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 75%. Free up some space or expand the storage cluster.
|
||||
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.75
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CephClusterCriticallyFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 85%. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
26
assets/ceph/rules.yaml
Normal file
26
assets/ceph/rules.yaml
Normal file
|
@ -0,0 +1,26 @@
|
|||
groups:
|
||||
- name: ceph.rules
|
||||
rules:
|
||||
- expr: |
|
||||
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
|
||||
record: cluster:ceph_node_down:join_kube
|
||||
- expr: |
|
||||
avg(max by(instance) (label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)") * on(instance, device) group_right() (irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
|
||||
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
|
||||
- name: telemeter.rules
|
||||
rules:
|
||||
- expr: |
|
||||
count(ceph_osd_metadata{job="rook-ceph-mgr"})
|
||||
record: job:ceph_osd_metadata:count
|
||||
- expr: |
|
||||
count(kube_persistentvolume_info)
|
||||
record: job:kube_pv:count
|
||||
- expr: |
|
||||
sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"})
|
||||
record: job:ceph_pools_iops:total
|
||||
- expr: |
|
||||
sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"})
|
||||
record: job:ceph_pools_iops_bytes:total
|
||||
- expr: |
|
||||
count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))
|
||||
record: job:ceph_versions_running:count
|
27
assets/consul/alerts.yaml
Normal file
27
assets/consul/alerts.yaml
Normal file
|
@ -0,0 +1,27 @@
|
|||
groups:
|
||||
- name: consul
|
||||
rules:
|
||||
- alert: ConsulUp
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' is not up.
|
||||
expr: |
|
||||
consul_up != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ConsulMaster
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' has no master.
|
||||
expr: |
|
||||
consul_raft_leader != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ConsulPeers
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' does not have 3 peers.
|
||||
expr: |
|
||||
consul_raft_peers != 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
710
assets/consul/dashboards/consul-overview.json
Normal file
710
assets/consul/dashboards/consul-overview.json
Normal file
|
@ -0,0 +1,710 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "100px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"colorBackground": true,
|
||||
"colors": [
|
||||
"#d44a3a",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#299c46"
|
||||
],
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"format": "none",
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "instance",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "consul_up{job=\"$job\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "0.5,0.5",
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$instance",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "DOWN",
|
||||
"value": "0"
|
||||
},
|
||||
{
|
||||
"op": "=",
|
||||
"text": "UP",
|
||||
"value": "1"
|
||||
}
|
||||
],
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Up",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "100px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"colorBackground": true,
|
||||
"colors": [
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#299c46"
|
||||
],
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"format": "none",
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "instance",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(rate(consul_raft_leader_lastcontact_count{job=\"$job\",instance=~\"$instance\"}[1m]) > bool 0)\n or\n(consul_up{job=\"$job\",instance=~\"$instance\"} == bool 0)\n",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "0.5,0.5",
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$instance",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "FOLLOWER",
|
||||
"value": "0"
|
||||
},
|
||||
{
|
||||
"op": "=",
|
||||
"text": "LEADER",
|
||||
"value": "1"
|
||||
}
|
||||
],
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Leader",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "100px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"colorBackground": true,
|
||||
"colors": [
|
||||
"#d44a3a",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#299c46"
|
||||
],
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"format": "none",
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "instance",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "consul_raft_leader{job=\"$job\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "0.5,0.5",
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$instance",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "NO LEADER",
|
||||
"value": "0"
|
||||
},
|
||||
{
|
||||
"op": "=",
|
||||
"text": "HAS LEADER",
|
||||
"value": "1"
|
||||
}
|
||||
],
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Has Leader",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "100px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"colorBackground": true,
|
||||
"colors": [
|
||||
"#d44a3a",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#299c46"
|
||||
],
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"format": "none",
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "instance",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "consul_raft_peers{job=\"$job\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "1,2",
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$instance",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "singlestat",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "# Peers",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(consul_http_request_count{job=\"$job\"}[1m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "QPS",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(consul_http_request{job=\"$job\", quantile=\"0.99\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "99th Percentile",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "max(consul_http_request{job=\"$job\", quantile=\"0.5\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "50th Percentile",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(consul_http_request{job=\"$job\"}[5m])) / sum(rate(consul_http_request{job=\"$job\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Average",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Latency",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ms",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Consul Server",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "prod",
|
||||
"value": "prod"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [ ],
|
||||
"query": "label_values(consul_up, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 2,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "instance",
|
||||
"multi": true,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": "label_values(consul_up{job=\"$job\"}, instance)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 2,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "Consul Overview",
|
||||
"uid": "",
|
||||
"version": 0
|
||||
}
|
1
assets/consul/rules.yaml
Normal file
1
assets/consul/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
135
assets/etcd/alerts.yaml
Normal file
135
assets/etcd/alerts.yaml
Normal file
|
@ -0,0 +1,135 @@
|
|||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
|
||||
expr: |
|
||||
max by (job) (
|
||||
sum by (job) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count by (job,endpoint) (
|
||||
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
|
||||
expr: |
|
||||
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
expr: |
|
||||
increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
1093
assets/etcd/dashboards/etcd.json
Normal file
1093
assets/etcd/dashboards/etcd.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/etcd/rules.yaml
Normal file
1
assets/etcd/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
101
assets/gluster/alerts.yaml
Normal file
101
assets/gluster/alerts.yaml
Normal file
|
@ -0,0 +1,101 @@
|
|||
groups:
|
||||
- name: exporter-absent
|
||||
rules:
|
||||
- alert: GlusterExporterDown
|
||||
annotations:
|
||||
message: GlusterExporter has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job="glusterd2-client"}==1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: status-alert.rules
|
||||
rules:
|
||||
- alert: GlusterBrickStatus
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.hostname}}:{{$labels.brick_path}} is down.
|
||||
expr: |
|
||||
gluster_brick_up{job="glusterd2-client"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: GlusterVolumeStatus
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} is down.
|
||||
expr: |
|
||||
gluster_volume_up{job="glusterd2-client"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: gluster-utilization
|
||||
rules:
|
||||
- alert: GlusterVolumeUtilization
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} Utilization more than 80%
|
||||
expr: |
|
||||
100 * gluster:volume_capacity_used_bytes_total:sum
|
||||
/ gluster:volume_capacity_total_bytes:sum > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: GlusterVolumeUtilization
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} Utilization more than 90%
|
||||
expr: |
|
||||
100 * gluster:volume_capacity_used_bytes_total:sum
|
||||
/ gluster:volume_capacity_total_bytes:sum > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: thinpool-utilization
|
||||
rules:
|
||||
- alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
2296
assets/gluster/dashboards/k8s-storage-resources-glusterfs-pv.json
Normal file
2296
assets/gluster/dashboards/k8s-storage-resources-glusterfs-pv.json
Normal file
File diff suppressed because it is too large
Load diff
9
assets/gluster/rules.yaml
Normal file
9
assets/gluster/rules.yaml
Normal file
|
@ -0,0 +1,9 @@
|
|||
groups:
|
||||
- name: gluster-volume.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(max(gluster_subvol_capacity_used_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
|
||||
record: gluster:volume_capacity_used_bytes_total:sum
|
||||
- expr: |
|
||||
sum(max(gluster_subvol_capacity_total_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
|
||||
record: gluster:volume_capacity_total_bytes:sum
|
99
assets/jaeger/alerts.yaml
Normal file
99
assets/jaeger/alerts.yaml
Normal file
|
@ -0,0 +1,99 @@
|
|||
groups:
|
||||
- name: jaeger_alerts
|
||||
rules:
|
||||
- alert: JaegerAgentUDPPacketsBeingDropped
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }} UDP packets per second.
|
||||
expr: rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerAgentHTTPServerErrs
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerClientSpansDropped
|
||||
annotations:
|
||||
message: |
|
||||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerAgentSpansDropped
|
||||
annotations:
|
||||
message: |
|
||||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerCollectorQueueNotDraining
|
||||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is not able to drain the queue.
|
||||
expr: avg_over_time(jaeger_collector_queue_length[10m]) > 1000
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerCollectorDroppingSpans
|
||||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerSamplingUpdateFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerCollectorPersistenceSlow
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerThrottlingUpdateFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerQueryReqsFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerCassandraWritesFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: JaegerCassandraReadsFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
1
assets/jaeger/rules.yaml
Normal file
1
assets/jaeger/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
25
assets/kube-state-metrics/alerts.yaml
Normal file
25
assets/kube-state-metrics/alerts.yaml
Normal file
|
@ -0,0 +1,25 @@
|
|||
groups:
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
1
assets/kube-state-metrics/rules.yaml
Normal file
1
assets/kube-state-metrics/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
550
assets/kubernetes/alerts.yaml
Normal file
550
assets/kubernetes/alerts.yaml
Normal file
|
@ -0,0 +1,550 @@
|
|||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||
expr: |
|
||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (
|
||||
max by(namespace, pod) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
|
||||
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
|
||||
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
||||
expr: |
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||
expr: |
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
||||
expr: |
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
||||
expr: |
|
||||
max without (revision) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||
expr: |
|
||||
kube_daemonset_status_number_ready{job="kube-state-metrics"}
|
||||
/
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
||||
expr: |
|
||||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||
expr: |
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCronJobRunning
|
||||
annotations:
|
||||
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
|
||||
expr: |
|
||||
time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobCompletion
|
||||
annotations:
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
||||
expr: |
|
||||
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
||||
expr: |
|
||||
kube_job_failed{job="kube-state-metrics"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
|
||||
expr: |
|
||||
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_hpa_status_current_replicas{job="kube-state-metrics"})
|
||||
and
|
||||
changes(kube_hpa_status_current_replicas[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
|
||||
expr: |
|
||||
kube_hpa_status_current_replicas{job="kube-state-metrics"}
|
||||
==
|
||||
kube_hpa_spec_max_replicas{job="kube-state-metrics"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_cpu_cores)
|
||||
>
|
||||
(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_memory_bytes)
|
||||
>
|
||||
(count(kube_node_status_allocatable_memory_bytes)-1)
|
||||
/
|
||||
count(kube_node_status_allocatable_memory_bytes)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
|
||||
expr: |
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||
/
|
||||
sum(kube_node_status_allocatable_cpu_cores)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
message: Cluster has overcommitted memory resource requests for Namespaces.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
|
||||
expr: |
|
||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||
/
|
||||
sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
|
||||
expr: |
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||
expr: |
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
||||
< 0.03
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||
expr: |
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
||||
) < 0.15
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
||||
expr: |
|
||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: kubernetes-system
|
||||
rules:
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
message: There are {{ $value }} different semantic versions of Kubernetes components running.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||
expr: |
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
||||
expr: |
|
||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
||||
/
|
||||
sum(rate(rest_client_requests_total[5m])) by (instance, job))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kube-apiserver-slos
|
||||
rules:
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||
for: 2m
|
||||
labels:
|
||||
long: 1h
|
||||
severity: critical
|
||||
short: 5m
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||
for: 15m
|
||||
labels:
|
||||
long: 6h
|
||||
severity: critical
|
||||
short: 30m
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||
for: 1h
|
||||
labels:
|
||||
long: 1d
|
||||
severity: warning
|
||||
short: 2h
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
message: The API server is burning too much error budget
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||
expr: |
|
||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||
for: 3h
|
||||
labels:
|
||||
long: 3d
|
||||
severity: warning
|
||||
short: 6h
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
- alert: KubeAPILatencyHigh
|
||||
annotations:
|
||||
message: The API server has an abnormal latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||
expr: |
|
||||
(
|
||||
cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"}
|
||||
>
|
||||
on (verb) group_left()
|
||||
(
|
||||
avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0)
|
||||
+
|
||||
2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0)
|
||||
)
|
||||
) > on (verb) group_left()
|
||||
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="kube-apiserver"} >= 0)
|
||||
and on (verb,resource)
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="kube-apiserver",quantile="0.99"}
|
||||
>
|
||||
1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver"}[5m])) by (resource,subresource,verb) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||
expr: |
|
||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. It has not been available at least for the past five minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||
expr: |
|
||||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
|
||||
expr: |
|
||||
absent(up{job="kube-apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: kubernetes-system-kubelet
|
||||
rules:
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
message: '{{ $labels.node }} has been unready for more than 15 minutes.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
||||
expr: |
|
||||
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
|
||||
expr: |
|
||||
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||
expr: |
|
||||
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||
expr: |
|
||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||
expr: |
|
||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
message: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
|
||||
expr: |
|
||||
absent(up{job="kubelet"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: kubernetes-system-scheduler
|
||||
rules:
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
message: KubeScheduler has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
|
||||
expr: |
|
||||
absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: kubernetes-system-controller-manager
|
||||
rules:
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
message: KubeControllerManager has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
|
||||
expr: |
|
||||
absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
1777
assets/kubernetes/dashboards/apiserver.json
Normal file
1777
assets/kubernetes/dashboards/apiserver.json
Normal file
File diff suppressed because it is too large
Load diff
1652
assets/kubernetes/dashboards/cluster-total.json
Normal file
1652
assets/kubernetes/dashboards/cluster-total.json
Normal file
File diff suppressed because it is too large
Load diff
991
assets/kubernetes/dashboards/controller-manager.json
Normal file
991
assets/kubernetes/dashboards/controller-manager.json
Normal file
|
@ -0,0 +1,991 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 2,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 2,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"kube-controller-manager\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Up",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "min"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(workqueue_adds_total{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Work Queue Add Rate",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(workqueue_depth{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Work Queue Depth",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Work Queue Latency",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "2xx",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "3xx",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "4xx",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-controller-manager\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "5xx",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Kube API Request Rate",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 8,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{verb}} {{url}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Post Request Latency 99th Quantile",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{verb}} {{url}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Get Request Latency 99th Quantile",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{job=\"kube-controller-manager\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 10,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 11,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "go_goroutines{job=\"kube-controller-manager\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Goroutines",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": "label_values(process_cpu_seconds_total{job=\"kube-controller-manager\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Kubernetes / Controller Manager",
|
||||
"uid": "72e0e05bef5099e5f049b05fdc429ed4",
|
||||
"version": 0
|
||||
}
|
2242
assets/kubernetes/dashboards/k8s-resources-cluster.json
Normal file
2242
assets/kubernetes/dashboards/k8s-resources-cluster.json
Normal file
File diff suppressed because it is too large
Load diff
1988
assets/kubernetes/dashboards/k8s-resources-namespace.json
Normal file
1988
assets/kubernetes/dashboards/k8s-resources-namespace.json
Normal file
File diff suppressed because it is too large
Load diff
818
assets/kubernetes/dashboards/k8s-resources-node.json
Normal file
818
assets/kubernetes/dashboards/k8s-resources-node.json
Normal file
|
@ -0,0 +1,818 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{pod}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "CPU Usage",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"styles": [
|
||||
{
|
||||
"alias": "Time",
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"pattern": "Time",
|
||||
"type": "hidden"
|
||||
},
|
||||
{
|
||||
"alias": "CPU Usage",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #A",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "CPU Requests",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #B",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "CPU Requests %",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #C",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "percentunit"
|
||||
},
|
||||
{
|
||||
"alias": "CPU Limits",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #D",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "CPU Limits %",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #E",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "percentunit"
|
||||
},
|
||||
{
|
||||
"alias": "Pod",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "pod",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [ ],
|
||||
"type": "string",
|
||||
"unit": "short"
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "C",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "D",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "E",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Quota",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"transform": "table",
|
||||
"type": "table",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "CPU Quota",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\", container!=\"\"}) by (pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{pod}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Usage (w/o cache)",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Memory Usage",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"styles": [
|
||||
{
|
||||
"alias": "Time",
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"pattern": "Time",
|
||||
"type": "hidden"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Usage",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #A",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Requests",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #B",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Requests %",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #C",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "percentunit"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Limits",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #D",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Limits %",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #E",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "percentunit"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Usage (RSS)",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #F",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Usage (Cache)",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #G",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Memory Usage (Swap)",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "Value #H",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "bytes"
|
||||
},
|
||||
{
|
||||
"alias": "Pod",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"link": false,
|
||||
"linkTooltip": "Drill down",
|
||||
"linkUrl": "",
|
||||
"pattern": "pod",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "short"
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [ ],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [ ],
|
||||
"type": "string",
|
||||
"unit": "short"
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "C",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "D",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "E",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_rss{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "F",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_cache{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "G",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_namespace_pod_container:container_memory_swap{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "H",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Quota",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"transform": "table",
|
||||
"type": "table",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Memory Quota",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 2,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "cluster",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_pod_info, cluster)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "",
|
||||
"value": ""
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "node",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Kubernetes / Compute Resources / Node (Pods)",
|
||||
"uid": "200ac8fdbfbb74b39aff88118e4d1c2c",
|
||||
"version": 0
|
||||
}
|
1538
assets/kubernetes/dashboards/k8s-resources-pod.json
Normal file
1538
assets/kubernetes/dashboards/k8s-resources-pod.json
Normal file
File diff suppressed because it is too large
Load diff
1751
assets/kubernetes/dashboards/k8s-resources-workload.json
Normal file
1751
assets/kubernetes/dashboards/k8s-resources-workload.json
Normal file
File diff suppressed because it is too large
Load diff
1895
assets/kubernetes/dashboards/k8s-resources-workloads-namespace.json
Normal file
1895
assets/kubernetes/dashboards/k8s-resources-workloads-namespace.json
Normal file
File diff suppressed because it is too large
Load diff
2230
assets/kubernetes/dashboards/kubelet.json
Normal file
2230
assets/kubernetes/dashboards/kubelet.json
Normal file
File diff suppressed because it is too large
Load diff
1280
assets/kubernetes/dashboards/namespace-by-pod.json
Normal file
1280
assets/kubernetes/dashboards/namespace-by-pod.json
Normal file
File diff suppressed because it is too large
Load diff
1516
assets/kubernetes/dashboards/namespace-by-workload.json
Normal file
1516
assets/kubernetes/dashboards/namespace-by-workload.json
Normal file
File diff suppressed because it is too large
Load diff
492
assets/kubernetes/dashboards/persistentvolumesusage.json
Normal file
492
assets/kubernetes/dashboards/persistentvolumesusage.json
Normal file
|
@ -0,0 +1,492 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 9,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used Space",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Free Space",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Volume Space Usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"rgba(50, 172, 45, 0.97)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(245, 54, 54, 0.9)"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "percent",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": true,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 3,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "80, 90",
|
||||
"title": "Volume Space Usage",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 9,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Used inodes",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": " Free inodes",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Volume inodes Usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "none",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"rgba(50, 172, 45, 0.97)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(245, 54, 54, 0.9)"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "percent",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": true,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 5,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "80, 90",
|
||||
"title": "Volume inodes Usage",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 2,
|
||||
"includeAll": false,
|
||||
"label": "cluster",
|
||||
"multi": false,
|
||||
"name": "cluster",
|
||||
"options": [ ],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [ ],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "PersistentVolumeClaim",
|
||||
"multi": false,
|
||||
"name": "volume",
|
||||
"options": [ ],
|
||||
"query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\"}, persistentvolumeclaim)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-7d",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Kubernetes / Persistent Volumes",
|
||||
"uid": "919b92a8e8041bd567af9edab12c840c",
|
||||
"version": 0
|
||||
}
|
1075
assets/kubernetes/dashboards/pod-total.json
Normal file
1075
assets/kubernetes/dashboards/pod-total.json
Normal file
File diff suppressed because it is too large
Load diff
1058
assets/kubernetes/dashboards/proxy.json
Normal file
1058
assets/kubernetes/dashboards/proxy.json
Normal file
File diff suppressed because it is too large
Load diff
927
assets/kubernetes/dashboards/scheduler.json
Normal file
927
assets/kubernetes/dashboards/scheduler.json
Normal file
|
@ -0,0 +1,927 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 2,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 2,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(up{job=\"kube-scheduler\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Up",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "min"
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 5,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} e2e",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(scheduler_binding_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} binding",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} scheduling algorithm",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} volume",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Scheduling Rate",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 5,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} e2e",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} binding",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} scheduling algorithm",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} volume",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Scheduling latency 99th Quantile",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "2xx",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "3xx",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "4xx",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "5xx",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Kube API Request Rate",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "ops",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 8,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{verb}} {{url}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Post Request Latency 99th Quantile",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{verb}} {{url}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Get Request Latency 99th Quantile",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{job=\"kube-scheduler\", instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(process_cpu_seconds_total{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU usage",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 10,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "go_goroutines{job=\"kube-scheduler\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Goroutines",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": "label_values(process_cpu_seconds_total{job=\"kube-scheduler\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Kubernetes / Scheduler",
|
||||
"uid": "2e6b6a3b4bddf1427b3a55aa1311c656",
|
||||
"version": 0
|
||||
}
|
836
assets/kubernetes/dashboards/statefulset.json
Normal file
836
assets/kubernetes/dashboards/statefulset.json
Normal file
|
@ -0,0 +1,836 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [ ],
|
||||
"refresh": "",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 2,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "cores",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 4,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": true
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "CPU",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 3,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "GB",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 4,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": true
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_usage_bytes{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Memory",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 4,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "Bps",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 4,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": true
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{job=\"cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Network",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"height": "100px",
|
||||
"panels": [
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 5,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Desired Replicas",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 6,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Replicas of current version",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 7,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Observed Generation",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#299c46",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"#d44a3a"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 8,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "",
|
||||
"title": "Metadata Generation",
|
||||
"tooltip": {
|
||||
"shared": false
|
||||
},
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "0",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "replicas specified",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "replicas created",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "ready",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "replicas of current version",
|
||||
"refId": "D"
|
||||
},
|
||||
{
|
||||
"expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "updated",
|
||||
"refId": "E"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Replicas",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"kubernetes-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 2,
|
||||
"includeAll": false,
|
||||
"label": "cluster",
|
||||
"multi": false,
|
||||
"name": "cluster",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_statefulset_metadata_generation, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Name",
|
||||
"multi": false,
|
||||
"name": "statefulset",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\"}, statefulset)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Kubernetes / StatefulSets",
|
||||
"uid": "a31c1f46e6f727cb37c0d731a7245005",
|
||||
"version": 0
|
||||
}
|
1257
assets/kubernetes/dashboards/workload-total.json
Normal file
1257
assets/kubernetes/dashboards/workload-total.json
Normal file
File diff suppressed because it is too large
Load diff
642
assets/kubernetes/rules.yaml
Normal file
642
assets/kubernetes/rules.yaml
Normal file
|
@ -0,0 +1,642 @@
|
|||
groups:
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- expr: |
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum(rate(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
-
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
- expr: |
|
||||
sum by (code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (code,resource) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
/
|
||||
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
record: cluster:apiserver_request_duration_seconds:mean5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="kube-apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- interval: 3m
|
||||
name: kube-apiserver-availability.rules
|
||||
rules:
|
||||
- expr: |
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
1 - (
|
||||
sum(increase(apiserver_request_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
|
||||
-
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum(code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="LIST",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="GET",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="POST",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PUT",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PATCH",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="DELETE",code=~"2.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="LIST",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="GET",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="POST",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PUT",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PATCH",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="DELETE",code=~"3.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="LIST",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="GET",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="POST",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PUT",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PATCH",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="DELETE",code=~"4.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="LIST",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="GET",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="POST",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PUT",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="PATCH",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code, verb) (increase(apiserver_request_total{job="kube-apiserver",verb="DELETE",code=~"5.."}[30d]))
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
|
||||
labels:
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: |
|
||||
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
labels:
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- name: k8s.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!="POD"}[5m])) by (namespace)
|
||||
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!="POD"}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
container_memory_working_set_bytes{job="cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |
|
||||
container_memory_rss{job="cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |
|
||||
container_memory_cache{job="cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |
|
||||
container_memory_swap{job="cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |
|
||||
sum(container_memory_usage_bytes{job="cadvisor", image!="", container!="POD"}) by (namespace)
|
||||
record: namespace:container_memory_usage_bytes:sum
|
||||
- expr: |
|
||||
sum by (namespace) (
|
||||
sum by (namespace, pod) (
|
||||
max by (namespace, pod, container) (
|
||||
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
|
||||
) * on(namespace, pod) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
|
||||
- expr: |
|
||||
sum by (namespace) (
|
||||
sum by (namespace, pod) (
|
||||
max by (namespace, pod, container) (
|
||||
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
|
||||
) * on(namespace, pod) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
- expr: |
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(min(kube_pod_info{node!=""}) by (cluster, node))
|
||||
record: ':kube_pod_info_node_count:'
|
||||
- expr: |
|
||||
topk by(namespace, pod) (1,
|
||||
max by (node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
))
|
||||
record: node:node_num_cpu:sum
|
||||
- expr: |
|
||||
sum(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"} or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"} +
|
||||
node_memory_Cached_bytes{job="node-exporter"} +
|
||||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"})
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
1
assets/memcached/alerts.yaml
Normal file
1
assets/memcached/alerts.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
1075
assets/memcached/dashboards/memcached-overview.json
Normal file
1075
assets/memcached/dashboards/memcached-overview.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/memcached/rules.yaml
Normal file
1
assets/memcached/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
177
assets/node-exporter/alerts.yaml
Normal file
177
assets/node-exporter/alerts.yaml
Normal file
|
@ -0,0 +1,177 @@
|
|||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: |
|
||||
increase(node_network_receive_errs_total[2m]) > 10
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: |
|
||||
increase(node_network_transmit_errs_total[2m]) > 10
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: |
|
||||
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: |
|
||||
node_textfile_scrape_error{job="node"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
summary: Clock skew detected.
|
||||
expr: |
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
|
||||
summary: Clock not synchronising.
|
||||
expr: |
|
||||
min_over_time(node_timex_sync_status[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
847
assets/node-exporter/dashboards/node-cluster-rsrc-use.json
Normal file
847
assets/node-exporter/dashboards/node-cluster-rsrc-use.json
Normal file
|
@ -0,0 +1,847 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node\"}\n*\n instance:node_num_cpu:sum{job=\"node\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_load1_per_cpu:ratio{job=\"node\"}\n/ scalar(count(instance:node_load1_per_cpu:ratio{job=\"node\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Saturation (load1 per CPU)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "CPU",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_memory_utilisation:ratio{job=\"node\"}\n/ scalar(count(instance:node_memory_utilisation:ratio{job=\"node\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Saturation (Major Page Faults)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "rps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Memory",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "/ Receive/",
|
||||
"stack": "A"
|
||||
},
|
||||
{
|
||||
"alias": "/ Transmit/",
|
||||
"stack": "B",
|
||||
"transform": "negative-Y"
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} Receive",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} Transmit",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Net Utilisation (Bytes Receive/Transmit)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "/ Receive/",
|
||||
"stack": "A"
|
||||
},
|
||||
{
|
||||
"alias": "/ Transmit/",
|
||||
"stack": "B",
|
||||
"transform": "negative-Y"
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} Receive",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} Transmit",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Net Saturation (Drops Receive/Transmit)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "rps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Network",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{device}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk IO Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node\"}))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}} {{device}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk IO Saturation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Disk IO",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum without (device) (\n max without (fstype, mountpoint) (\n node_filesystem_size_bytes{job=\"node\", fstype!=\"\"} - node_filesystem_avail_bytes{job=\"node\", fstype!=\"\"}\n )\n) \n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node\", fstype!=\"\"})))\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{instance}}",
|
||||
"legendLink": "/dashboard/file/node-rsrc-use.json",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk Space Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Disk Space",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "USE Method / Cluster",
|
||||
"uid": "",
|
||||
"version": 0
|
||||
}
|
870
assets/node-exporter/dashboards/node-rsrc-use.json
Normal file
870
assets/node-exporter/dashboards/node-rsrc-use.json
Normal file
|
@ -0,0 +1,870 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": false,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_cpu_utilisation:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Utilisation",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": false,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_load1_per_cpu:ratio{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Saturation",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Saturation (Load1 per CPU)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "CPU",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_memory_utilisation:ratio{job=\"node\", job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Memory",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": false,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Major page faults",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Saturation (Major Page Faults)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Memory",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 5,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "/Receive/",
|
||||
"stack": "A"
|
||||
},
|
||||
{
|
||||
"alias": "/Transmit/",
|
||||
"stack": "B",
|
||||
"transform": "negative-Y"
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Receive",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Transmit",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Net Utilisation (Bytes Receive/Transmit)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "/Receive/",
|
||||
"stack": "A"
|
||||
},
|
||||
{
|
||||
"alias": "/Transmit/",
|
||||
"stack": "B",
|
||||
"transform": "negative-Y"
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Receive drops",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "Transmit drops",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Net Saturation (Drops Receive/Transmit)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "rps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Net",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk IO Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk IO Saturation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Disk IO",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": false,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "1 -\n(\n max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node\", fstype!=\"\", instance=\"$instance\"})\n/\n max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node\", fstype!=\"\", instance=\"$instance\"})\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk Space Utilisation",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Disk Space",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "default",
|
||||
"value": "default"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "prod",
|
||||
"value": "prod"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": "label_values(up{job=\"node\"}, instance)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 2,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "utc",
|
||||
"title": "USE Method / Node",
|
||||
"uid": "",
|
||||
"version": 0
|
||||
}
|
865
assets/node-exporter/dashboards/nodes.json
Normal file
865
assets/node-exporter/dashboards/nodes.json
Normal file
|
@ -0,0 +1,865 @@
|
|||
{
|
||||
"__inputs": [ ],
|
||||
"__requires": [ ],
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": false,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [ ],
|
||||
"refresh": "",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node\", mode=\"idle\", instance=\"$instance\"})\n)\n",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 5,
|
||||
"legendFormat": "{{cpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "CPU Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 0,
|
||||
"gridPos": { },
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "1m load average",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "5m load average",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "node_load15{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "15m load average",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "count(node_cpu_seconds_total{job=\"node\", instance=\"$instance\", mode=\"idle\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "logical cores",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Load Average",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 9,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(\n node_memory_MemTotal_bytes{job=\"node\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node\", instance=\"$instance\"}\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers_bytes{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory buffers",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Cached_bytes{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory cached",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree_bytes{job=\"node\", instance=\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory free",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Memory Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"rgba(50, 172, 45, 0.97)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(245, 54, 54, 0.9)"
|
||||
],
|
||||
"datasource": "$datasource",
|
||||
"format": "percent",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": true,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": { },
|
||||
"id": 5,
|
||||
"interval": null,
|
||||
"links": [ ],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "null",
|
||||
"text": "N/A",
|
||||
"to": "null"
|
||||
}
|
||||
],
|
||||
"span": 3,
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": false,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": false
|
||||
},
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node\", instance=\"$instance\"}\n* 100\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": "80, 90",
|
||||
"title": "Memory Usage",
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "N/A",
|
||||
"value": "null"
|
||||
}
|
||||
],
|
||||
"valueName": "current"
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 0,
|
||||
"gridPos": { },
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "/ read| written/",
|
||||
"yaxis": 1
|
||||
},
|
||||
{
|
||||
"alias": "/ io time/",
|
||||
"yaxis": 2
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"node\", instance=\"$instance\", device!=\"\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}} read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"node\", instance=\"$instance\", device!=\"\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}} written",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_io_time_seconds_total{job=\"node\", instance=\"$instance\", device!=\"\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}} io time",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk I/O",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"gridPos": { },
|
||||
"id": 7,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "used",
|
||||
"color": "#E0B400"
|
||||
},
|
||||
{
|
||||
"alias": "available",
|
||||
"color": "#73BF69"
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "available",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Disk Space Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"collapsed": false,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 0,
|
||||
"gridPos": { },
|
||||
"id": 8,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total{job=\"node\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Network Received",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 0,
|
||||
"gridPos": { },
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": null,
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total{job=\"node\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
|
||||
"format": "time_series",
|
||||
"interval": "1m",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Network Transmitted",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [ ],
|
||||
"query": "label_values(node_exporter_build_info{job=\"node\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "Nodes",
|
||||
"version": 0
|
||||
}
|
58
assets/node-exporter/rules.yaml
Normal file
58
assets/node-exporter/rules.yaml
Normal file
|
@ -0,0 +1,58 @@
|
|||
groups:
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: |
|
||||
count without (cpu) (
|
||||
count without (mode) (
|
||||
node_cpu_seconds_total{job="node"}
|
||||
)
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
- expr: |
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{job="node", mode="idle"}[1m])
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate1m
|
||||
- expr: |
|
||||
(
|
||||
node_load1{job="node"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- expr: |
|
||||
1 - (
|
||||
node_memory_MemAvailable_bytes{job="node"}
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- expr: |
|
||||
rate(node_vmstat_pgmajfault{job="node"}[1m])
|
||||
record: instance:node_vmstat_pgmajfault:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node", device!=""}[1m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[1m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
183
assets/prometheus/alerts.yaml
Normal file
183
assets/prometheus/alerts.yaml
Normal file
|
@ -0,0 +1,183 @@
|
|||
groups:
|
||||
- name: prometheus
|
||||
rules:
|
||||
- alert: PrometheusBadConfig
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has failed to reload its configuration.
|
||||
summary: Failed Prometheus configuration reload.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
expr: |
|
||||
# Without min_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
|
||||
>
|
||||
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
expr: |
|
||||
(
|
||||
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |
|
||||
min without(alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
|
||||
summary: Prometheus is not connected to any Alertmanagers.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is not ingesting samples.
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: |
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |
|
||||
(
|
||||
rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m])
|
||||
/
|
||||
(
|
||||
rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m])
|
||||
+
|
||||
rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m])
|
||||
)
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
|
||||
- on(job, instance) group_right
|
||||
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
|
||||
)
|
||||
> 120
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
|
||||
>
|
||||
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: |
|
||||
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
expr: |
|
||||
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
1426
assets/prometheus/dashboards/prometheus-remote-write.json
Normal file
1426
assets/prometheus/dashboards/prometheus-remote-write.json
Normal file
File diff suppressed because it is too large
Load diff
1059
assets/prometheus/dashboards/prometheus.json
Normal file
1059
assets/prometheus/dashboards/prometheus.json
Normal file
File diff suppressed because it is too large
Load diff
1
assets/prometheus/rules.yaml
Normal file
1
assets/prometheus/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
null
|
11
assets/sealed-secrets/alerts.yaml
Normal file
11
assets/sealed-secrets/alerts.yaml
Normal file
|
@ -0,0 +1,11 @@
|
|||
groups:
|
||||
- name: sealed-secrets
|
||||
rules:
|
||||
- alert: SealedSecretsUnsealErrorRateHigh
|
||||
annotations:
|
||||
message: High rate of errors unsealing Sealed Secrets
|
||||
runbook: https://github.com/bitnami-labs/sealed-secrets
|
||||
expr: |
|
||||
sum(rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0
|
||||
labels:
|
||||
severity: warning
|
302
assets/sealed-secrets/dashboards/sealed-secrets-controller.json
Normal file
302
assets/sealed-secrets/dashboards/sealed-secrets-controller.json
Normal file
|
@ -0,0 +1,302 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Sealed Secrets Controller",
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": 3,
|
||||
"iteration": 1585599163503,
|
||||
"links": [
|
||||
{
|
||||
"icon": "external link",
|
||||
"tags": [ ],
|
||||
"title": "GitHub",
|
||||
"tooltip": "View Project on GitHub",
|
||||
"type": "link",
|
||||
"url": "https://github.com/bitnami-labs/sealed-secrets"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "Rate of requests to unseal a SealedSecret.\n\nThis can include non-obvious operations such as deleting a SealedSecret.",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": [ ]
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(sealed_secrets_controller_unseal_requests_total{}[1m]))",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "rps",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [ ],
|
||||
"timeShift": null,
|
||||
"title": "Unseal Request Rate/s",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "Rate of errors when unsealing a SealedSecret. \n\nReason for error included as label value, eg:\n- unseal = cryptography issue (key/namespace) or RBAC\n- unmanaged = destination Secret wasn't created by SealedSecrets\n- update = potentially RBAC\n- status = potentially RBAC\n- fetch = potentially RBAC\n",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"options": {
|
||||
"dataLinks": [ ]
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(sealed_secrets_controller_unseal_errors_total{pod=~\"$pod\"}[1m])) by (reason)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{ reason }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [ ],
|
||||
"timeShift": null,
|
||||
"title": "Unseal Error Rate/s",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": false,
|
||||
"schemaVersion": 22,
|
||||
"style": "dark",
|
||||
"tags": [ ],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "prometheus",
|
||||
"value": "prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"definition": "label_values(kube_pod_info, pod)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "pod",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_pod_info, pod)",
|
||||
"refresh": 1,
|
||||
"regex": "/^sealed-secrets-controller.*$/",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Sealed Secrets Controller",
|
||||
"uid": "UuEtZCVWz",
|
||||
"version": 2
|
||||
}
|
1
assets/sealed-secrets/rules.yaml
Normal file
1
assets/sealed-secrets/rules.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
groups: []
|
475
assets/thanos/alerts.yaml
Normal file
475
assets/thanos/alerts.yaml
Normal file
|
@ -0,0 +1,475 @@
|
|||
groups:
|
||||
- name: thanos-compact.rules
|
||||
rules:
|
||||
- alert: ThanosCompactMultipleRunning
|
||||
annotations:
|
||||
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
|
||||
expr: sum(up{job=~"thanos-compact.*"}) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosCompactHalted
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has failed to run and now is halted.
|
||||
expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosCompactHighCompactionFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosCompactBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosCompactHasNotRun
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||
expr: (time() - max(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"})) / 60 / 60 > 24
|
||||
labels:
|
||||
severity: warning
|
||||
- name: thanos-query.rules
|
||||
rules:
|
||||
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosQueryGrpcServerErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-query.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosQueryGrpcClientErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
|
||||
) * 100 > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosQueryInstantLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosQueryRangeLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-query.*", handler="query_range"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: thanos-receive.rules
|
||||
rules:
|
||||
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosReceiveHttpRequestLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receive.*", handler="receive"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosReceiveHighForwardRequestFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
>
|
||||
(
|
||||
max by (job) (floor((thanos_receive_replication_factor{job=~"thanos-receive.*"}+1) / 2))
|
||||
/
|
||||
max by (job) (thanos_receive_hashring_nodes{job=~"thanos-receive.*"})
|
||||
)
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
|
||||
> 0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosReceiveConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosReceiveNoUpload
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not uploaded latest data to object storage.
|
||||
expr: increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[2h]) == 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: thanos-sidecar.rules
|
||||
rules:
|
||||
- alert: ThanosSidecarPrometheusDown
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.
|
||||
expr: |
|
||||
sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosSidecarUnhealthy
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
|
||||
expr: |
|
||||
count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
- name: thanos-store.rules
|
||||
rules:
|
||||
- alert: ThanosStoreGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosStoreSeriesGateLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
and
|
||||
sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosStoreBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
and
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: thanos-rule.rules
|
||||
rules:
|
||||
- alert: ThanosRuleQueueIsDroppingAlerts
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosRuleSenderIsFailingAlerts
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosRuleHighRuleEvaluationFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosRuleHighRuleEvaluationWarnings
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosRuleGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosRuleConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: ThanosRuleQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: ThanosRuleNoEvaluationFor10Intervals
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
|
||||
expr: |
|
||||
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: ThanosNoRuleEvaluations
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
|
||||
expr: |
|
||||
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
||||
and
|
||||
sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
- name: thanos-component-absent.rules
|
||||
rules:
|
||||
- alert: ThanosCompactIsDown
|
||||
annotations:
|
||||
message: ThanosCompact has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-compact.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosQueryIsDown
|
||||
annotations:
|
||||
message: ThanosQuery has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-query.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosReceiveIsDown
|
||||
annotations:
|
||||
message: ThanosReceive has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-receive.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosRuleIsDown
|
||||
annotations:
|
||||
message: ThanosRule has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-rule.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosSidecarIsDown
|
||||
annotations:
|
||||
message: ThanosSidecar has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-sidecar.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosStoreIsDown
|
||||
annotations:
|
||||
message: ThanosStore has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-store.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: thanos-bucket-replicate.rules
|
||||
rules:
|
||||
- alert: ThanosBucketReplicateIsDown
|
||||
annotations:
|
||||
message: Thanos Replicate has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-bucket-replicate.*"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosBucketReplicateErrorRate
|
||||
annotations:
|
||||
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
|
||||
/ on (namespace) group_left
|
||||
sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m]))
|
||||
) * 100 >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: ThanosBucketReplicateRunLatency
|
||||
annotations:
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|
||||
and
|
||||
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
515
assets/thanos/dashboards/bucket_replicate.json
Normal file
515
assets/thanos/dashboards/bucket_replicate.json
Normal file
|
@ -0,0 +1,515 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [ ]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"links": [ ],
|
||||
"refresh": "10s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {
|
||||
"error": "#E24D42"
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 10,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_replication_runs_total{result=\"error\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) / sum(rate(thanos_replicate_replication_runs_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "error",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Rate",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows rate of errors.",
|
||||
"fill": 10,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 0,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_replication_runs_total{result=\"error\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (result)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{result}}",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Errors",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percentunit",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"description": "Shows how long has it taken to run a replication cycle.",
|
||||
"fill": 1,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job, le)) * 1",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "P99 {{job}}",
|
||||
"refId": "A",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_replication_run_duration_seconds_sum{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job) * 1 / sum(rate(thanos_replicate_replication_run_duration_seconds_count{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "mean {{job}}",
|
||||
"refId": "B",
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(thanos_replicate_replication_run_duration_seconds_bucket{result=\"success\", namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval])) by (job, le)) * 1",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "P50 {{job}}",
|
||||
"refId": "C",
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Duration",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "s",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Bucket Replicate Runs",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": { },
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "$datasource",
|
||||
"fill": 1,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [ ],
|
||||
"spaceLength": 10,
|
||||
"span": 12,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_origin_iterations_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "iterations",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_origin_meta_loads_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "meta loads",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_origin_partial_meta_reads_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "partial meta reads",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_blocks_already_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "already replicated blocks",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_blocks_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "replicated blocks",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(thanos_replicate_objects_replicated_total{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}[$interval]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "replicated objects",
|
||||
"legendLink": null,
|
||||
"step": 10
|
||||
}
|
||||
],
|
||||
"thresholds": [ ],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Metrics",
|
||||
"tooltip": {
|
||||
"shared": false,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": [ ]
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": 0,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": true,
|
||||
"title": "Bucket Replication",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"thanos-mixin"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": null,
|
||||
"name": "datasource",
|
||||
"options": [ ],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": { },
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "namespace",
|
||||
"multi": false,
|
||||
"name": "namespace",
|
||||
"options": [ ],
|
||||
"query": "label_values(kube_pod_info{}, namespace)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 2,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": "thanos-bucket-replicate.*",
|
||||
"current": {
|
||||
"text": "all",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": "$datasource",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "job",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [ ],
|
||||
"query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-bucket-replicate.*\"}, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 2,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [ ],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"auto": true,
|
||||
"auto_count": 300,
|
||||
"auto_min": "10s",
|
||||
"current": {
|
||||
"text": "5m",
|
||||
"value": "5m"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "interval",
|
||||
"name": "interval",
|
||||
"query": "5m,10m,30m,1h,6h,12h",
|
||||
"refresh": 2,
|
||||
"type": "interval"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Thanos / BucketReplicate",
|
||||
"uid": "49f644ecf8e31dd1a5084ae2a5f10e80",
|
||||
"version": 0
|
||||
}
|
1549
assets/thanos/dashboards/compact.json
Normal file
1549
assets/thanos/dashboards/compact.json
Normal file
File diff suppressed because it is too large
Load diff
2066
assets/thanos/dashboards/overview.json
Normal file
2066
assets/thanos/dashboards/overview.json
Normal file
File diff suppressed because it is too large
Load diff
2481
assets/thanos/dashboards/query.json
Normal file
2481
assets/thanos/dashboards/query.json
Normal file
File diff suppressed because it is too large
Load diff
2336
assets/thanos/dashboards/receive.json
Normal file
2336
assets/thanos/dashboards/receive.json
Normal file
File diff suppressed because it is too large
Load diff
2014
assets/thanos/dashboards/rule.json
Normal file
2014
assets/thanos/dashboards/rule.json
Normal file
File diff suppressed because it is too large
Load diff
1889
assets/thanos/dashboards/sidecar.json
Normal file
1889
assets/thanos/dashboards/sidecar.json
Normal file
File diff suppressed because it is too large
Load diff
3098
assets/thanos/dashboards/store.json
Normal file
3098
assets/thanos/dashboards/store.json
Normal file
File diff suppressed because it is too large
Load diff
125
assets/thanos/rules.yaml
Normal file
125
assets/thanos/rules.yaml
Normal file
|
@ -0,0 +1,125 @@
|
|||
groups:
|
||||
- name: thanos-query.rules
|
||||
rules:
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_client_failures_per_unary:sum_rate
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_client_failures_per_stream:sum_rate
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
|
||||
- expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :query_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :api_range_query_duration_seconds:histogram_quantile
|
||||
- name: thanos-receive.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(
|
||||
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])
|
||||
/
|
||||
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_unary:sum_rate
|
||||
- expr: |
|
||||
sum(
|
||||
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
|
||||
/
|
||||
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_stream:sum_rate
|
||||
- expr: |
|
||||
sum(
|
||||
rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])
|
||||
/
|
||||
rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :http_failure_per_request:sum_rate
|
||||
- expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :http_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_receive_forward_failure_per_requests:sum_rate
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate
|
||||
- name: thanos-store.rules
|
||||
rules:
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_unary:sum_rate
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_stream:sum_rate
|
||||
- expr: |
|
||||
(
|
||||
sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_objstore_bucket_failures_per_operation:sum_rate
|
||||
- expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
|
||||
- name: thanos-bucket-replicate.rules
|
||||
rules: []
|
123
generate.sh
Executable file
123
generate.sh
Executable file
|
@ -0,0 +1,123 @@
|
|||
#!/bin/bash
|
||||
#shellcheck disable=SC2129,SC2164
|
||||
#set -euo pipefail
|
||||
|
||||
MANIFESTS="assets"
|
||||
TOP=$(git rev-parse --show-toplevel)
|
||||
TMPDIR="${TOP}/tmp"
|
||||
|
||||
download_mixin() {
|
||||
local mixin="$1"
|
||||
local repo="$2"
|
||||
local subdir="$3"
|
||||
|
||||
git clone --depth 1 --filter=blob:none "$repo" "${TMPDIR}/$mixin"
|
||||
mkdir -p "${TOP}/${MANIFESTS}/${mixin}/dashboards"
|
||||
(
|
||||
cd "${TMPDIR}/${mixin}/${subdir}"
|
||||
if [ -f "jsonnetfile.json" ]; then
|
||||
jb install
|
||||
fi
|
||||
|
||||
jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' | gojsontoyaml > "${TOP}/${MANIFESTS}/${mixin}/alerts.yaml" || :
|
||||
jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' | gojsontoyaml > "${TOP}/${MANIFESTS}/${mixin}/rules.yaml" || :
|
||||
jsonnet -J vendor -m "${TOP}/${MANIFESTS}/${mixin}/dashboards" -e '(import "mixin.libsonnet").grafanaDashboards' || :
|
||||
)
|
||||
}
|
||||
|
||||
parse_rules() {
|
||||
local source="$1"
|
||||
local type="$2"
|
||||
for group in $(echo "$source" | jq -cr '.groups[].name'); do
|
||||
echo -e "### ${group}\n"
|
||||
for rule in $(echo "$source" | jq -cr ".groups[] | select(.name == \"${group}\") | .rules[] | @base64"); do
|
||||
var=$(echo "$rule" | base64 --decode | gojsontoyaml);
|
||||
name=$(echo -e "$var" | grep "$type" | awk -F ': ' '{print $2}')
|
||||
echo -e "##### ${name}\n"
|
||||
echo -e '{{< code lang="yaml" >}}'
|
||||
echo -e "$var"
|
||||
echo -e '{{< /code >}}\n '
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
panel() {
|
||||
echo -e "{{< panel style=\"$1\" >}}"
|
||||
echo -e "$2"
|
||||
echo -e "{{< /panel >}}\n"
|
||||
}
|
||||
|
||||
mixin_header() {
|
||||
local name="$1"
|
||||
local repo="$2"
|
||||
local url="$3"
|
||||
local description="$4"
|
||||
|
||||
cat << EOF
|
||||
---
|
||||
title: $name
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
$description
|
||||
|
||||
EOF
|
||||
panel "danger" "Jsonnet source code is available at [${repo#*//}]($url)"
|
||||
}
|
||||
|
||||
|
||||
cd "${TOP}" || exit 1
|
||||
# remove generated assets and temporary directory
|
||||
rm -rf "$MANIFESTS" "$TMPDIR"
|
||||
# remove generated site content
|
||||
find site/content/ ! -name '_index.md' -type f -exec rm -rf {} +
|
||||
|
||||
mkdir -p "${TMPDIR}"
|
||||
|
||||
# Generate mixins
|
||||
CONFIG=$(gojsontoyaml -yamltojson < mixins.yaml)
|
||||
|
||||
for mixin in $(echo "$CONFIG" | jq -r '.mixins[].name'); do
|
||||
repo="$(echo "$CONFIG" | jq -r ".mixins[] | select(.name == \"$mixin\") | .source")"
|
||||
subdir="$(echo "$CONFIG" | jq -r ".mixins[] | select(.name == \"$mixin\") | .subdir")"
|
||||
text="$(echo "$CONFIG" | jq -r ".mixins[] | select(.name == \"$mixin\") | .description")"
|
||||
if [ "$text" == "null" ]; then text=""; fi
|
||||
set +u
|
||||
download_mixin "$mixin" "$repo" "$subdir"
|
||||
#set -u
|
||||
|
||||
mkdir -p "site/content/${mixin}"
|
||||
file="site/content/${mixin}/_index.md"
|
||||
# Create header
|
||||
if [ -n "$subdir" ]; then
|
||||
location="$repo/tree/master/$subdir"
|
||||
else
|
||||
location="$repo"
|
||||
fi
|
||||
mixin_header "$mixin" "$repo" "$location" "$text" > "$file"
|
||||
|
||||
dir="$TOP/$MANIFESTS/$mixin"
|
||||
# Alerts
|
||||
if [ -s "$dir/alerts.yaml" ] && [ "$(stat -c%s "$dir/alerts.yaml")" -gt 20 ]; then
|
||||
echo -e "## Alerts\n" >> "$file"
|
||||
panel "warning" "Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/$MANIFESTS/$mixin/alerts.yaml)." >> "$file"
|
||||
parse_rules "$(gojsontoyaml -yamltojson < "$dir/alerts.yaml")" "alert" >> "$file"
|
||||
fi
|
||||
|
||||
# Recording Rules
|
||||
if [ -s "$dir/rules.yaml" ] && [ "$(stat -c%s "$dir/rules.yaml")" -gt 20 ]; then
|
||||
echo -e "## Recording rules\n" >> "$file"
|
||||
panel "warning" "Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/$MANIFESTS/$mixin/rules.yaml)." >> "$file"
|
||||
parse_rules "$(gojsontoyaml -yamltojson < "$dir/rules.yaml")" "record" >> "$file"
|
||||
fi
|
||||
|
||||
# Dashboards
|
||||
if [ "$(ls -A "$dir/dashboards")" ]; then
|
||||
echo -e "## Dashboards\nFollowing dashboards are generated from mixins and hosted on github:\n\n" >> "$file"
|
||||
for dashboard in "$dir/dashboards"/*.json; do
|
||||
d="$(basename "$dashboard")"
|
||||
echo "- [${d%.*}](https://github.com/monitoring-mixins/website/blob/master/$MANIFESTS/$mixin/dashboards/$d)" >> "$file"
|
||||
done
|
||||
fi
|
||||
done
|
52
mixins.yaml
Normal file
52
mixins.yaml
Normal file
|
@ -0,0 +1,52 @@
|
|||
---
|
||||
mixins:
|
||||
- name: "prometheus"
|
||||
source: "https://github.com/prometheus/prometheus"
|
||||
subdir: "documentation/prometheus-mixin"
|
||||
description: "The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus."
|
||||
- name: node-exporter
|
||||
source: "https://github.com/prometheus/node_exporter"
|
||||
subdir: "docs/node-mixin"
|
||||
description: "The Node Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by the Node Exporter. The mixin creates recording and alerting rules for Prometheus and suitable dashboard descriptions for Grafana."
|
||||
- name: kubernetes
|
||||
source: "https://github.com/kubernetes-monitoring/kubernetes-mixin"
|
||||
subdir: ""
|
||||
description: "A set of Grafana dashboards and Prometheus alerts for Kubernetes."
|
||||
- name: kube-state-metrics
|
||||
source: "https://github.com/kubernetes/kube-state-metrics"
|
||||
subdir: "jsonnet/kube-state-metrics-mixin"
|
||||
- name: etcd
|
||||
source: "https://github.com/etcd-io/etcd"
|
||||
subdir: "Documentation/etcd-mixin"
|
||||
description: "A set of customisable Prometheus alerts for etcd."
|
||||
- name: ceph
|
||||
source: "https://github.com/ceph/ceph-mixins"
|
||||
subdir: ""
|
||||
description: |
|
||||
A set of Prometheus alerts for Ceph.
|
||||
|
||||
The scope of this project is to provide Ceph specific Prometheus rule files using Prometheus Mixins.
|
||||
- name: gluster
|
||||
source: "https://github.com/gluster/gluster-mixins"
|
||||
subdir: ""
|
||||
description: |
|
||||
A set of Grafana dashboards and Prometheus alerts for Gluster.
|
||||
|
||||
The scope of this project is to provide Gluster specific Grafana dashboard configs and Prometheus rule files using Prometheus Mixins.
|
||||
- name: consul
|
||||
source: "https://github.com/grafana/jsonnet-libs"
|
||||
subdir: "consul-mixin"
|
||||
description: "Grafana dashboards and Prometheus alerts for operating Consul, in the form of a monitoring mixin."
|
||||
- name: jaeger
|
||||
source: "https://github.com/grafana/jsonnet-libs"
|
||||
subdir: "jaeger-mixin"
|
||||
- name: memcached
|
||||
source: "https://github.com/grafana/jsonnet-libs"
|
||||
subdir: "memcached-mixin"
|
||||
description: "Grafana dashboard for operating Memcached, in the form of a monitoring mixin."
|
||||
- name: sealed-secrets
|
||||
source: "https://github.com/bitnami-labs/sealed-secrets"
|
||||
subdir: "contrib/prometheus-mixin"
|
||||
- name: thanos
|
||||
source: "https://github.com/thanos-io/thanos"
|
||||
subdir: "mixin"
|
31
netlify.toml
Normal file
31
netlify.toml
Normal file
|
@ -0,0 +1,31 @@
|
|||
[build]
|
||||
base = "site/"
|
||||
publish = "public"
|
||||
command = "hugo --gc --minify"
|
||||
|
||||
[context.production.environment]
|
||||
HUGO_VERSION = "0.70.0"
|
||||
HUGO_ENV = "production"
|
||||
HUGO_ENABLEGITINFO = "true"
|
||||
|
||||
[context.split1]
|
||||
command = "hugo --gc --minify --enableGitInfo"
|
||||
|
||||
[context.split1.environment]
|
||||
HUGO_VERSION = "0.70.0"
|
||||
HUGO_ENV = "production"
|
||||
|
||||
[context.deploy-preview]
|
||||
command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"
|
||||
|
||||
[context.deploy-preview.environment]
|
||||
HUGO_VERSION = "0.70.0"
|
||||
|
||||
[context.branch-deploy]
|
||||
command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"
|
||||
|
||||
[context.branch-deploy.environment]
|
||||
HUGO_VERSION = "0.70.0"
|
||||
|
||||
[context.next.environment]
|
||||
HUGO_ENABLEGITINFO = "true"
|
38
site/config.yaml
Normal file
38
site/config.yaml
Normal file
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
baseURL: "https://monitoring.mixins.dev/"
|
||||
# baseURL: ""
|
||||
languageCode: "en-us"
|
||||
title: "Monitoring Mixins"
|
||||
|
||||
theme: 'ace-documentation'
|
||||
|
||||
# Google analytics
|
||||
# googleAnalytics: UA-123456789-1
|
||||
|
||||
permalinks:
|
||||
post: /:filename/
|
||||
|
||||
params:
|
||||
project_name: Monitoring Mixins
|
||||
|
||||
project_tagline: Combination of alerts, recording rules, and dashboards for prometheus exporters
|
||||
|
||||
disableSearch: true
|
||||
disableReadmoreNav: true
|
||||
|
||||
markup:
|
||||
highlight:
|
||||
style: monokailight
|
||||
|
||||
menu:
|
||||
shortcuts:
|
||||
- name: Homepage
|
||||
url: /
|
||||
weight: 1
|
||||
- name: About mixins
|
||||
url: "https://github.com/monitoring-mixins/docs"
|
||||
weight: 2
|
||||
- name: "GitHub"
|
||||
url: "https://github.com/monitoring-mixins/website"
|
||||
weight: 3
|
||||
|
153
site/content/_index.md
Normal file
153
site/content/_index.md
Normal file
|
@ -0,0 +1,153 @@
|
|||
---
|
||||
title: Prometheus Monitoring Mixins
|
||||
---
|
||||
|
||||
A mixin is a set of Grafana dashboards and Prometheus rules and alerts, packaged together in a reuseable and extensible bundle.
|
||||
Mixins are written in [jsonnet](https://jsonnet.org/), and are typically installed and updated with [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler).
|
||||
|
||||
For more information about mixins, see:
|
||||
* [Prometheus Monitoring Mixins Design Doc](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/view). A [cached pdf](design.pdf) is included in monitoring mixins [documentation repository](https://github.com/monitoring-mixins/docs).
|
||||
* For more motivation, see
|
||||
"[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin 2018. The KLUMPs system demo'd became the basis for the kubernetes-mixin.
|
||||
* "[Prometheus Monitoring Mixins: Using Jsonnet to Package Together Dashboards, Alerts and Exporters](https://www.youtube.com/watch?v=b7-DtFfsL6E)" talk from CloudNativeCon Copenhagen 2018.
|
||||
* "[Prometheus Monitoring Mixins: Using Jsonnet to Package Together Dashboards, Alerts and Exporters](https://promcon.io/2018-munich/talks/prometheus-monitoring-mixins/)" talk from PromCon 2018 (slightly updated).
|
||||
|
||||
## How to use mixins.
|
||||
|
||||
Mixins are designed to be vendored into the repo with your infrastructure config.
|
||||
To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler):
|
||||
|
||||
You then have three options for deploying your dashboards
|
||||
1. Generate the config files and deploy them yourself.
|
||||
1. Use ksonnet to deploy this mixin along with Prometheus and Grafana.
|
||||
1. Use kube-prometheus to deploy this mixin.
|
||||
|
||||
## Generate config files
|
||||
|
||||
You can manually generate the alerts, dashboards and rules files, but first you
|
||||
must install some tools:
|
||||
|
||||
```bash
|
||||
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
|
||||
|
||||
# macOS
|
||||
$ brew install jsonnet
|
||||
|
||||
# Archlinux AUR
|
||||
$ yay -S jsonnet
|
||||
```
|
||||
|
||||
Then, grab the mixin and its dependencies:
|
||||
|
||||
```bash
|
||||
$ git clone https://github.com/<mixin org>/<mixin repo>
|
||||
$ cd <mixin repo>
|
||||
$ jb install
|
||||
```
|
||||
|
||||
Finally, build the mixin:
|
||||
|
||||
```bash
|
||||
$ make prometheus_alerts.yaml
|
||||
$ make prometheus_rules.yaml
|
||||
$ make dashboards_out
|
||||
```
|
||||
|
||||
The `prometheus_alerts.yaml` and `prometheus_rules.yaml` file then need to passed
|
||||
to your Prometheus server, and the files in `dashboards_out` need to be imported
|
||||
into you Grafana server. The exact details will depending on how you deploy your
|
||||
monitoring stack to Kubernetes.
|
||||
|
||||
## Using with prometheus-ksonnet
|
||||
|
||||
Alternatively you can also use the mixin with
|
||||
[prometheus-ksonnet](https://github.com/grafana/jsonnet-libs/tree/master/prometheus-ksonnet),
|
||||
a [ksonnet](https://github.com/ksonnet/ksonnet) module to deploy a fully-fledged
|
||||
Prometheus-based monitoring system for Kubernetes:
|
||||
|
||||
Make sure you have the ksonnet v0.8.0:
|
||||
|
||||
```bash
|
||||
$ brew install https://raw.githubusercontent.com/ksonnet/homebrew-tap/82ef24cb7b454d1857db40e38671426c18cd8820/ks.rb
|
||||
$ brew pin ks
|
||||
$ ks version
|
||||
ksonnet version: v0.8.0
|
||||
jsonnet version: v0.9.5
|
||||
client-go version: v1.6.8-beta.0+$Format:%h$
|
||||
```
|
||||
|
||||
In your config repo, if you don't have a ksonnet application, make a new one (will copy credentials from current context):
|
||||
|
||||
```bash
|
||||
$ ks init <application name>
|
||||
$ cd <application name>
|
||||
$ ks env add default
|
||||
```
|
||||
|
||||
Grab the kubernetes-jsonnet module using and its dependencies, which include
|
||||
the kubernetes-mixin:
|
||||
|
||||
```bash
|
||||
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
|
||||
$ jb init
|
||||
$ jb install github.com/kausalco/public/prometheus-ksonnet
|
||||
|
||||
```
|
||||
|
||||
Assuming you want to run in the default namespace ('environment' in ksonnet parlance), add the follow to the file `environments/default/main.jsonnet`:
|
||||
|
||||
{{< code lang="as3" >}}
|
||||
local prometheus = import "prometheus-ksonnet/prometheus-ksonnet.libsonnet";
|
||||
|
||||
prometheus {
|
||||
_config+:: {
|
||||
namespace: "default",
|
||||
},
|
||||
}
|
||||
{{< /code >}}
|
||||
|
||||
Apply your config:
|
||||
|
||||
```bash
|
||||
$ ks apply default
|
||||
```
|
||||
|
||||
## Using kube-prometheus
|
||||
|
||||
See the kube-prometheus docs for [instructions on how to use mixins with kube-prometheus](https://github.com/coreos/kube-prometheus#kube-prometheus).
|
||||
|
||||
## Customising the mixin
|
||||
|
||||
Mixins typically allows you to override the selectors used for various jobs,
|
||||
to match those used in your Prometheus set.
|
||||
|
||||
This example uses the [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin).
|
||||
In a new directory, add a file `mixin.libsonnet`:
|
||||
|
||||
{{< code lang="as3" >}}
|
||||
local kubernetes = import "kubernetes-mixin/mixin.libsonnet";
|
||||
|
||||
kubernetes {
|
||||
_config+:: {
|
||||
kubeStateMetricsSelector: 'job="kube-state-metrics"',
|
||||
cadvisorSelector: 'job="kubernetes-cadvisor"',
|
||||
nodeExporterSelector: 'job="kubernetes-node-exporter"',
|
||||
kubeletSelector: 'job="kubernetes-kubelet"',
|
||||
},
|
||||
}
|
||||
{{< /code >}}
|
||||
|
||||
Then, install the kubernetes-mixin:
|
||||
|
||||
```bash
|
||||
$ jb init
|
||||
$ jb install github.com/kubernetes-monitoring/kubernetes-mixin
|
||||
```
|
||||
|
||||
Generate the alerts, rules and dashboards:
|
||||
|
||||
```bash
|
||||
$ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusAlerts)' > alerts.yml
|
||||
$ jsonnet -J vendor -S -e 'std.manifestYamlDoc((import "mixin.libsonnet").prometheusRules)' >files/rules.yml
|
||||
$ jsonnet -J vendor -m files/dashboards -e '(import "mixin.libsonnet").grafanaDashboards'
|
||||
```
|
388
site/content/ceph/_index.md
Normal file
388
site/content/ceph/_index.md
Normal file
|
@ -0,0 +1,388 @@
|
|||
---
|
||||
title: ceph
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A set of Prometheus alerts for Ceph.
|
||||
|
||||
The scope of this project is to provide Ceph specific Prometheus rule files using Prometheus Mixins.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/ceph/ceph-mixins](https://github.com/ceph/ceph-mixins)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### ceph-mgr-status
|
||||
|
||||
##### CephMgrIsAbsent
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMgrIsAbsent
|
||||
annotations:
|
||||
description: Ceph Manager has disappeared from Prometheus target discovery.
|
||||
message: Storage metrics collector service not available anymore.
|
||||
severity_level: critical
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
absent(up{job="rook-ceph-mgr"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephMgrIsMissingReplicas
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMgrIsMissingReplicas
|
||||
annotations:
|
||||
description: Ceph Manager is missing replicas.
|
||||
message: Storage metrics collector service doesn't have required no of replicas.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(up{job="rook-ceph-mgr"}) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### ceph-mds-status
|
||||
|
||||
##### CephMdsMissingReplicas
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMdsMissingReplicas
|
||||
annotations:
|
||||
description: Minimum required replicas for storage metadata service not available. Might affect the working of storage cluster.
|
||||
message: Insufficient replicas for storage metadata service.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### quorum-alert.rules
|
||||
|
||||
##### CephMonQuorumAtRisk
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMonQuorumAtRisk
|
||||
annotations:
|
||||
description: Storage cluster quorum is low. Contact Support.
|
||||
message: Storage quorum at risk
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= ((count(ceph_mon_metadata{job="rook-ceph-mgr"}) % 2) + 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephMonHighNumberOfLeaderChanges
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMonHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }} has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
|
||||
message: Storage Cluster has seen many leader changes recently.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### ceph-node-alert.rules
|
||||
|
||||
##### CephNodeDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephNodeDown
|
||||
annotations:
|
||||
description: Storage node {{ $labels.node }} went down. Please check the node immediately.
|
||||
message: Storage node {{ $labels.node }} went down
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
cluster:ceph_node_down:join_kube == 0
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### osd-alert.rules
|
||||
|
||||
##### CephOSDCriticallyFull
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDCriticallyFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 85% on host {{ $labels.hostname }}. Immediately free up some space or expand the storage cluster or contact support.
|
||||
message: Back-end storage device is critically full.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.85
|
||||
for: 40s
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephOSDNearFull
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDNearFull
|
||||
annotations:
|
||||
description: Utilization of back-end storage device {{ $labels.ceph_daemon }} has crossed 75% on host {{ $labels.hostname }}. Free up some space or expand the storage cluster or contact support.
|
||||
message: Back-end storage device is nearing full.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
(ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
|
||||
for: 40s
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CephOSDDiskNotResponding
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDDiskNotResponding
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
|
||||
message: Disk not responding
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephOSDDiskUnavailable
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDDiskUnavailable
|
||||
annotations:
|
||||
description: Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
|
||||
message: Disk not accessible
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephDataRecoveryTakingTooLong
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephDataRecoveryTakingTooLong
|
||||
annotations:
|
||||
description: Data recovery has been active for too long. Contact Support.
|
||||
message: Data recovery is slow
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_pg_undersized > 0
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CephPGRepairTakingTooLong
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephPGRepairTakingTooLong
|
||||
annotations:
|
||||
description: Self heal operations taking too long. Contact Support.
|
||||
message: Self heal problems detected
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_pg_inconsistent > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### cluster-state-alert.rules
|
||||
|
||||
##### CephClusterErrorState
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterErrorState
|
||||
annotations:
|
||||
description: Storage cluster is in error state for more than 10m.
|
||||
message: Storage cluster is in error state
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_health_status{job="rook-ceph-mgr"} > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### CephClusterWarningState
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterWarningState
|
||||
annotations:
|
||||
description: Storage cluster is in warning state for more than 10m.
|
||||
message: Storage cluster is in degraded state
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
ceph_health_status{job="rook-ceph-mgr"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CephOSDVersionMismatch
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephOSDVersionMismatch
|
||||
annotations:
|
||||
description: There are {{ $value }} different versions of Ceph OSD components running.
|
||||
message: There are multiple versions of storage services running.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CephMonVersionMismatch
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephMonVersionMismatch
|
||||
annotations:
|
||||
description: There are {{ $value }} different versions of Ceph Mon components running.
|
||||
message: There are multiple versions of storage services running.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### cluster-utilization-alert.rules
|
||||
|
||||
##### CephClusterNearFull
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterNearFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 75%. Free up some space or expand the storage cluster.
|
||||
message: Storage cluster is nearing full. Data deletion or cluster expansion is required.
|
||||
severity_level: warning
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.75
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### CephClusterCriticallyFull
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: CephClusterCriticallyFull
|
||||
annotations:
|
||||
description: Storage cluster utilization has crossed 85%. Free up some space or expand the storage cluster immediately.
|
||||
message: Storage cluster is critically full and needs immediate data deletion or cluster expansion.
|
||||
severity_level: error
|
||||
storage_type: ceph
|
||||
expr: |
|
||||
sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
|
||||
for: 30s
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### ceph.rules
|
||||
|
||||
##### cluster:ceph_node_down:join_kube
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
|
||||
record: cluster:ceph_node_down:join_kube
|
||||
{{< /code >}}
|
||||
|
||||
##### cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
avg(max by(instance) (label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)") * on(instance, device) group_right() (irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
|
||||
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
|
||||
{{< /code >}}
|
||||
|
||||
### telemeter.rules
|
||||
|
||||
##### job:ceph_osd_metadata:count
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
count(ceph_osd_metadata{job="rook-ceph-mgr"})
|
||||
record: job:ceph_osd_metadata:count
|
||||
{{< /code >}}
|
||||
|
||||
##### job:kube_pv:count
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
count(kube_persistentvolume_info)
|
||||
record: job:kube_pv:count
|
||||
{{< /code >}}
|
||||
|
||||
##### job:ceph_pools_iops:total
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"})
|
||||
record: job:ceph_pools_iops:total
|
||||
{{< /code >}}
|
||||
|
||||
##### job:ceph_pools_iops_bytes:total
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"})
|
||||
record: job:ceph_pools_iops_bytes:total
|
||||
{{< /code >}}
|
||||
|
||||
##### job:ceph_versions_running:count
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))
|
||||
record: job:ceph_versions_running:count
|
||||
{{< /code >}}
|
||||
|
64
site/content/consul/_index.md
Normal file
64
site/content/consul/_index.md
Normal file
|
@ -0,0 +1,64 @@
|
|||
---
|
||||
title: consul
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Grafana dashboards and Prometheus alerts for operating Consul, in the form of a monitoring mixin.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/consul-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/consul/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### consul
|
||||
|
||||
##### ConsulUp
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ConsulUp
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' is not up.
|
||||
expr: |
|
||||
consul_up != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ConsulMaster
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ConsulMaster
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' has no master.
|
||||
expr: |
|
||||
consul_raft_leader != 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ConsulPeers
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ConsulPeers
|
||||
annotations:
|
||||
message: Consul '{{ $labels.job }}' does not have 3 peers.
|
||||
expr: |
|
||||
consul_raft_peers != 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [consul-overview](https://github.com/monitoring-mixins/website/blob/master/assets/consul/dashboards/consul-overview.json)
|
227
site/content/etcd/_index.md
Normal file
227
site/content/etcd/_index.md
Normal file
|
@ -0,0 +1,227 @@
|
|||
---
|
||||
title: etcd
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A set of customisable Prometheus alerts for etcd.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/etcd-io/etcd](https://github.com/etcd-io/etcd/tree/master/Documentation/etcd-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/etcd/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### etcd
|
||||
|
||||
##### etcdMembersDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdMembersDown
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
|
||||
expr: |
|
||||
max by (job) (
|
||||
sum by (job) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count by (job,endpoint) (
|
||||
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdInsufficientMembers
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
|
||||
expr: |
|
||||
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdNoLeader
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfLeaderChanges
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
expr: |
|
||||
increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfFailedGRPCRequests
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfFailedGRPCRequests
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdGRPCRequestsSlow
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdMemberCommunicationSlow
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfFailedProposals
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighFsyncDurations
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighCommitDurations
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfFailedHTTPRequests
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHighNumberOfFailedHTTPRequests
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### etcdHTTPRequestsSlow
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [etcd](https://github.com/monitoring-mixins/website/blob/master/assets/etcd/dashboards/etcd.json)
|
204
site/content/gluster/_index.md
Normal file
204
site/content/gluster/_index.md
Normal file
|
@ -0,0 +1,204 @@
|
|||
---
|
||||
title: gluster
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
A set of Grafana dashboards and Prometheus alerts for Gluster.
|
||||
|
||||
The scope of this project is to provide Gluster specific Grafana dashboard configs and Prometheus rule files using Prometheus Mixins.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/gluster/gluster-mixins](https://github.com/gluster/gluster-mixins)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/gluster/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### exporter-absent
|
||||
|
||||
##### GlusterExporterDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterExporterDown
|
||||
annotations:
|
||||
message: GlusterExporter has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job="glusterd2-client"}==1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### status-alert.rules
|
||||
|
||||
##### GlusterBrickStatus
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterBrickStatus
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.hostname}}:{{$labels.brick_path}} is down.
|
||||
expr: |
|
||||
gluster_brick_up{job="glusterd2-client"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterVolumeStatus
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterVolumeStatus
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} is down.
|
||||
expr: |
|
||||
gluster_volume_up{job="glusterd2-client"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### gluster-utilization
|
||||
|
||||
##### GlusterVolumeUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterVolumeUtilization
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} Utilization more than 80%
|
||||
expr: |
|
||||
100 * gluster:volume_capacity_used_bytes_total:sum
|
||||
/ gluster:volume_capacity_total_bytes:sum > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterVolumeUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterVolumeUtilization
|
||||
annotations:
|
||||
message: Gluster Volume {{$labels.volume}} Utilization more than 90%
|
||||
expr: |
|
||||
100 * gluster:volume_capacity_used_bytes_total:sum
|
||||
/ gluster:volume_capacity_total_bytes:sum > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterBrickUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 80%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterBrickUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterBrickUtilization
|
||||
annotations:
|
||||
message: Gluster Brick {{$labels.host}}:{{$labels.brick_path}} Utilization more than 90%
|
||||
expr: |
|
||||
100 * gluster_brick_capacity_used_bytes{job="glusterd2-client"}
|
||||
/ gluster_brick_capacity_bytes_total{job="glusterd2-client"} > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### thinpool-utilization
|
||||
|
||||
##### GlusterThinpoolDataUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 80%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterThinpoolDataUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolDataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Data Utilization more than 90%
|
||||
expr: |
|
||||
gluster_thinpool_data_used_bytes{job="glusterd2-client"} / gluster_thinpool_data_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterThinpoolMetadataUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 80%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### GlusterThinpoolMetadataUtilization
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: GlusterThinpoolMetadataUtilization
|
||||
annotations:
|
||||
message: Gluster Thinpool {{ $labels.thinpool_name }} Metadata Utilization more than 90%
|
||||
expr: |
|
||||
gluster_thinpool_metadata_used_bytes{job="glusterd2-client"} / gluster_thinpool_metadata_total_bytes{job="glusterd2-client"} > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/gluster/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### gluster-volume.rules
|
||||
|
||||
##### gluster:volume_capacity_used_bytes_total:sum
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(max(gluster_subvol_capacity_used_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
|
||||
record: gluster:volume_capacity_used_bytes_total:sum
|
||||
{{< /code >}}
|
||||
|
||||
##### gluster:volume_capacity_total_bytes:sum
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(max(gluster_subvol_capacity_total_bytes{job="glusterd2-client"}) BY (volume, subvolume)) BY (volume)
|
||||
record: gluster:volume_capacity_total_bytes:sum
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [k8s-storage-resources-glusterfs-pv](https://github.com/monitoring-mixins/website/blob/master/assets/gluster/dashboards/k8s-storage-resources-glusterfs-pv.json)
|
176
site/content/jaeger/_index.md
Normal file
176
site/content/jaeger/_index.md
Normal file
|
@ -0,0 +1,176 @@
|
|||
---
|
||||
title: jaeger
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/jaeger-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/jaeger/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### jaeger_alerts
|
||||
|
||||
##### JaegerAgentUDPPacketsBeingDropped
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerAgentUDPPacketsBeingDropped
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }} UDP packets per second.
|
||||
expr: rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerAgentHTTPServerErrs
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerAgentHTTPServerErrs
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
|
||||
expr: 100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerClientSpansDropped
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerClientSpansDropped
|
||||
annotations:
|
||||
message: |
|
||||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerAgentSpansDropped
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerAgentSpansDropped
|
||||
annotations:
|
||||
message: |
|
||||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerCollectorQueueNotDraining
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerCollectorQueueNotDraining
|
||||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is not able to drain the queue.
|
||||
expr: avg_over_time(jaeger_collector_queue_length[10m]) > 1000
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerCollectorDroppingSpans
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerCollectorDroppingSpans
|
||||
annotations:
|
||||
message: |
|
||||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|
||||
expr: 100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerSamplingUpdateFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerSamplingUpdateFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
|
||||
expr: 100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerCollectorPersistenceSlow
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerCollectorPersistenceSlow
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
|
||||
expr: histogram_quantile(0.99, sum by (le) (rate(jaeger_collector_save_latency_bucket[1m]))) > 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerThrottlingUpdateFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerThrottlingUpdateFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
|
||||
expr: 100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerQueryReqsFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerQueryReqsFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerCassandraWritesFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerCassandraWritesFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### JaegerCassandraReadsFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: JaegerCassandraReadsFailing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
|
||||
expr: 100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
52
site/content/kube-state-metrics/_index.md
Normal file
52
site/content/kube-state-metrics/_index.md
Normal file
|
@ -0,0 +1,52 @@
|
|||
---
|
||||
title: kube-state-metrics
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/kubernetes/kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/tree/master/jsonnet/kube-state-metrics-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/kube-state-metrics/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### kube-state-metrics
|
||||
|
||||
##### KubeStateMetricsListErrors
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### KubeStateMetricsWatchErrors
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
1912
site/content/kubernetes/_index.md
Normal file
1912
site/content/kubernetes/_index.md
Normal file
File diff suppressed because it is too large
Load diff
17
site/content/memcached/_index.md
Normal file
17
site/content/memcached/_index.md
Normal file
|
@ -0,0 +1,17 @@
|
|||
---
|
||||
title: memcached
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Grafana dashboard for operating Memcached, in the form of a monitoring mixin.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/grafana/jsonnet-libs](https://github.com/grafana/jsonnet-libs/tree/master/memcached-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [memcached-overview](https://github.com/monitoring-mixins/website/blob/master/assets/memcached/dashboards/memcached-overview.json)
|
389
site/content/node-exporter/_index.md
Normal file
389
site/content/node-exporter/_index.md
Normal file
|
@ -0,0 +1,389 @@
|
|||
---
|
||||
title: node-exporter
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Node Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by the Node Exporter. The mixin creates recording and alerting rules for Prometheus and suitable dashboard descriptions for Grafana.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/prometheus/node_exporter](https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/node-exporter/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### node-exporter
|
||||
|
||||
##### NodeFilesystemSpaceFillingUp
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemSpaceFillingUp
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemAlmostOutOfSpace
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemAlmostOutOfSpace
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemFilesFillingUp
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemFilesFillingUp
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemAlmostOutOfFiles
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeFilesystemAlmostOutOfFiles
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node",fstype!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeNetworkReceiveErrs
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: |
|
||||
increase(node_network_receive_errs_total[2m]) > 10
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeNetworkTransmitErrs
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: |
|
||||
increase(node_network_transmit_errs_total[2m]) > 10
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeHighNumberConntrackEntriesUsed
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: |
|
||||
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeTextFileCollectorScrapeError
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: |
|
||||
node_textfile_scrape_error{job="node"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeClockSkewDetected
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
summary: Clock skew detected.
|
||||
expr: |
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### NodeClockNotSynchronising
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
|
||||
summary: Clock not synchronising.
|
||||
expr: |
|
||||
min_over_time(node_timex_sync_status[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/node-exporter/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### node-exporter.rules
|
||||
|
||||
##### instance:node_num_cpu:sum
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
count without (cpu) (
|
||||
count without (mode) (
|
||||
node_cpu_seconds_total{job="node"}
|
||||
)
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_cpu_utilisation:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{job="node", mode="idle"}[1m])
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_load1_per_cpu:ratio
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
node_load1{job="node"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_memory_utilisation:ratio
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
1 - (
|
||||
node_memory_MemAvailable_bytes{job="node"}
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_vmstat_pgmajfault:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
rate(node_vmstat_pgmajfault{job="node"}[1m])
|
||||
record: instance:node_vmstat_pgmajfault:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance_device:node_disk_io_time_seconds:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node", device!=""}[1m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[1m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_network_receive_bytes_excluding_lo:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_network_transmit_bytes_excluding_lo:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_network_receive_drop_excluding_lo:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
##### instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [node-cluster-rsrc-use](https://github.com/monitoring-mixins/website/blob/master/assets/node-exporter/dashboards/node-cluster-rsrc-use.json)
|
||||
- [node-rsrc-use](https://github.com/monitoring-mixins/website/blob/master/assets/node-exporter/dashboards/node-rsrc-use.json)
|
||||
- [nodes](https://github.com/monitoring-mixins/website/blob/master/assets/node-exporter/dashboards/nodes.json)
|
286
site/content/prometheus/_index.md
Normal file
286
site/content/prometheus/_index.md
Normal file
|
@ -0,0 +1,286 @@
|
|||
---
|
||||
title: prometheus
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus.
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/prometheus/prometheus](https://github.com/prometheus/prometheus/tree/master/documentation/prometheus-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### prometheus
|
||||
|
||||
##### PrometheusBadConfig
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusBadConfig
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has failed to reload its configuration.
|
||||
summary: Failed Prometheus configuration reload.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusNotificationQueueRunningFull
|
||||
Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
|
||||
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||
expr: |
|
||||
# Without min_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
|
||||
>
|
||||
min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
'{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
|
||||
Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
|
||||
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||
expr: |
|
||||
(
|
||||
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
'{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
|
||||
Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||
annotations:
|
||||
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
|
||||
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||
expr: |
|
||||
min without(alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusNotConnectedToAlertmanagers
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
|
||||
summary: Prometheus is not connected to any Alertmanagers.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusTSDBReloadsFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
|
||||
summary: Prometheus has issues reloading blocks from disk.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusTSDBCompactionsFailing
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
|
||||
summary: Prometheus has issues compacting blocks.
|
||||
expr: |
|
||||
increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusNotIngestingSamples
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is not ingesting samples.
|
||||
summary: Prometheus is not ingesting samples.
|
||||
expr: |
|
||||
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusDuplicateTimestamps
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusDuplicateTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
|
||||
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusOutOfOrderTimestamps
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusOutOfOrderTimestamps
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
|
||||
summary: Prometheus drops samples with out-of-order timestamps.
|
||||
expr: |
|
||||
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusRemoteStorageFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |
|
||||
(
|
||||
rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m])
|
||||
/
|
||||
(
|
||||
rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m])
|
||||
+
|
||||
rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m])
|
||||
)
|
||||
)
|
||||
* 100
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusRemoteWriteBehind
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
|
||||
- on(job, instance) group_right
|
||||
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
|
||||
)
|
||||
> 120
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusRemoteWriteDesiredShards
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusRemoteWriteDesiredShards
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
|
||||
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
(
|
||||
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
|
||||
>
|
||||
max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusRuleFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusRuleFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
|
||||
summary: Prometheus is failing rule evaluations.
|
||||
expr: |
|
||||
increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### PrometheusMissingRuleEvaluations
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: PrometheusMissingRuleEvaluations
|
||||
annotations:
|
||||
description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
|
||||
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||
expr: |
|
||||
increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json)
|
||||
- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json)
|
38
site/content/sealed-secrets/_index.md
Normal file
38
site/content/sealed-secrets/_index.md
Normal file
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
title: sealed-secrets
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/bitnami-labs/sealed-secrets](https://github.com/bitnami-labs/sealed-secrets/tree/master/contrib/prometheus-mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### sealed-secrets
|
||||
|
||||
##### SealedSecretsUnsealErrorRateHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: SealedSecretsUnsealErrorRateHigh
|
||||
annotations:
|
||||
message: High rate of errors unsealing Sealed Secrets
|
||||
runbook: https://github.com/bitnami-labs/sealed-secrets
|
||||
expr: |
|
||||
sum(rate(sealed_secrets_controller_unseal_errors_total{}[5m])) > 0
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [sealed-secrets-controller](https://github.com/monitoring-mixins/website/blob/master/assets/sealed-secrets/dashboards/sealed-secrets-controller.json)
|
931
site/content/thanos/_index.md
Normal file
931
site/content/thanos/_index.md
Normal file
|
@ -0,0 +1,931 @@
|
|||
---
|
||||
title: thanos
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
|
||||
{{< panel style="danger" >}}
|
||||
Jsonnet source code is available at [github.com/thanos-io/thanos](https://github.com/thanos-io/thanos/tree/master/mixin)
|
||||
{{< /panel >}}
|
||||
|
||||
## Alerts
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/alerts.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### thanos-compact.rules
|
||||
|
||||
##### ThanosCompactMultipleRunning
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactMultipleRunning
|
||||
annotations:
|
||||
message: No more than one Thanos Compact instance should be running at once. There are {{ $value }}
|
||||
expr: sum(up{job=~"thanos-compact.*"}) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosCompactHalted
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactHalted
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has failed to run and now is halted.
|
||||
expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosCompactHighCompactionFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactHighCompactionFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosCompactBucketHighOperationFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosCompactHasNotRun
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactHasNotRun
|
||||
annotations:
|
||||
message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
|
||||
expr: (time() - max(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"})) / 60 / 60 > 24
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-query.rules
|
||||
|
||||
##### ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryGrpcServerErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryGrpcServerErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-query.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryGrpcClientErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryGrpcClientErrorRate
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryHighDNSFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
|
||||
) * 100 > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryInstantLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryInstantLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryRangeLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryRangeLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-query.*", handler="query_range"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-receive.rules
|
||||
|
||||
##### ThanosReceiveHttpRequestErrorRateHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m]))
|
||||
) * 100 > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveHttpRequestLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHttpRequestLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
|
||||
and
|
||||
sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receive.*", handler="receive"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveHighForwardRequestFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHighForwardRequestFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
>
|
||||
(
|
||||
max by (job) (floor((thanos_receive_replication_factor{job=~"thanos-receive.*"}+1) / 2))
|
||||
/
|
||||
max by (job) (thanos_receive_hashring_nodes{job=~"thanos-receive.*"})
|
||||
)
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveHighHashringFileRefreshFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveHighHashringFileRefreshFailures
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
|
||||
> 0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveConfigReloadFailure
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
|
||||
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveNoUpload
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveNoUpload
|
||||
annotations:
|
||||
message: Thanos Receive {{$labels.job}} has not uploaded latest data to object storage.
|
||||
expr: increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[2h]) == 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-sidecar.rules
|
||||
|
||||
##### ThanosSidecarPrometheusDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosSidecarPrometheusDown
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.
|
||||
expr: |
|
||||
sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosSidecarUnhealthy
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosSidecarUnhealthy
|
||||
annotations:
|
||||
message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
|
||||
expr: |
|
||||
count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-store.rules
|
||||
|
||||
##### ThanosStoreGrpcErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosStoreSeriesGateLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreSeriesGateLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
and
|
||||
sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosStoreBucketHighOperationFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreBucketHighOperationFailures
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosStoreObjstoreOperationLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreObjstoreOperationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
|
||||
and
|
||||
sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-rule.rules
|
||||
|
||||
##### ThanosRuleQueueIsDroppingAlerts
|
||||
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleQueueIsDroppingAlerts
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleSenderIsFailingAlerts
|
||||
Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleSenderIsFailingAlerts
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleHighRuleEvaluationFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleHighRuleEvaluationFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleHighRuleEvaluationWarnings
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleHighRuleEvaluationWarnings
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.
|
||||
expr: |
|
||||
sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleRuleEvaluationLatencyHigh
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleRuleEvaluationLatencyHigh
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.
|
||||
expr: |
|
||||
(
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleGrpcErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleGrpcErrorRate
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 5
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleConfigReloadFailure
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleConfigReloadFailure
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
|
||||
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleQueryHighDNSFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleQueryHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleAlertmanagerHighDNSFailures
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleAlertmanagerHighDNSFailures
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.
|
||||
expr: |
|
||||
(
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
|
||||
/
|
||||
sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
|
||||
* 100 > 1
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleNoEvaluationFor10Intervals
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleNoEvaluationFor10Intervals
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.
|
||||
expr: |
|
||||
time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
|
||||
>
|
||||
10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosNoRuleEvaluations
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosNoRuleEvaluations
|
||||
annotations:
|
||||
message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.
|
||||
expr: |
|
||||
sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
|
||||
and
|
||||
sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-component-absent.rules
|
||||
|
||||
##### ThanosCompactIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosCompactIsDown
|
||||
annotations:
|
||||
message: ThanosCompact has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-compact.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosQueryIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosQueryIsDown
|
||||
annotations:
|
||||
message: ThanosQuery has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-query.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosReceiveIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosReceiveIsDown
|
||||
annotations:
|
||||
message: ThanosReceive has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-receive.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosRuleIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosRuleIsDown
|
||||
annotations:
|
||||
message: ThanosRule has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-rule.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosSidecarIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosSidecarIsDown
|
||||
annotations:
|
||||
message: ThanosSidecar has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-sidecar.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosStoreIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosStoreIsDown
|
||||
annotations:
|
||||
message: ThanosStore has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-store.*"} == 1)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-bucket-replicate.rules
|
||||
|
||||
##### ThanosBucketReplicateIsDown
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosBucketReplicateIsDown
|
||||
annotations:
|
||||
message: Thanos Replicate has disappeared from Prometheus target discovery.
|
||||
expr: |
|
||||
absent(up{job=~"thanos-bucket-replicate.*"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosBucketReplicateErrorRate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosBucketReplicateErrorRate
|
||||
annotations:
|
||||
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
|
||||
/ on (namespace) group_left
|
||||
sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m]))
|
||||
) * 100 >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
##### ThanosBucketReplicateRunLatency
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
alert: ThanosBucketReplicateRunLatency
|
||||
annotations:
|
||||
message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.
|
||||
expr: |
|
||||
(
|
||||
histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
|
||||
and
|
||||
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
|
||||
)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{< /code >}}
|
||||
|
||||
## Recording rules
|
||||
|
||||
{{< panel style="warning" >}}
|
||||
Complete list of pregenerated recording rules is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/rules.yaml).
|
||||
{{< /panel >}}
|
||||
|
||||
### thanos-query.rules
|
||||
|
||||
##### :grpc_client_failures_per_unary:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_client_failures_per_unary:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :grpc_client_failures_per_stream:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_client_failures_per_stream:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :query_duration_seconds:histogram_quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :query_duration_seconds:histogram_quantile
|
||||
{{< /code >}}
|
||||
|
||||
##### :api_range_query_duration_seconds:histogram_quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :api_range_query_duration_seconds:histogram_quantile
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-receive.rules
|
||||
|
||||
##### :grpc_server_failures_per_unary:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(
|
||||
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])
|
||||
/
|
||||
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_unary:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :grpc_server_failures_per_stream:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(
|
||||
rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
|
||||
/
|
||||
rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_stream:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :http_failure_per_request:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
sum(
|
||||
rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])
|
||||
/
|
||||
rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: :http_failure_per_request:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :http_request_duration_seconds:histogram_quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :http_request_duration_seconds:histogram_quantile
|
||||
{{< /code >}}
|
||||
|
||||
##### :thanos_receive_forward_failure_per_requests:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_receive_forward_failure_per_requests:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :thanos_receive_hashring_file_failure_per_refresh:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-store.rules
|
||||
|
||||
##### :grpc_server_failures_per_unary:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_unary:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :grpc_server_failures_per_stream:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
|
||||
/
|
||||
sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :grpc_server_failures_per_stream:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :thanos_objstore_bucket_failures_per_operation:sum_rate
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
(
|
||||
sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
|
||||
/
|
||||
sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: :thanos_objstore_bucket_failures_per_operation:sum_rate
|
||||
{{< /code >}}
|
||||
|
||||
##### :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
|
||||
|
||||
{{< code lang="yaml" >}}
|
||||
expr: |
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le)
|
||||
)
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
|
||||
{{< /code >}}
|
||||
|
||||
### thanos-bucket-replicate.rules
|
||||
|
||||
## Dashboards
|
||||
Following dashboards are generated from mixins and hosted on github:
|
||||
|
||||
|
||||
- [bucket_replicate](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/bucket_replicate.json)
|
||||
- [compact](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/compact.json)
|
||||
- [overview](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/overview.json)
|
||||
- [query](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/query.json)
|
||||
- [receive](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/receive.json)
|
||||
- [rule](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/rule.json)
|
||||
- [sidecar](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/sidecar.json)
|
||||
- [store](https://github.com/monitoring-mixins/website/blob/master/assets/thanos/dashboards/store.json)
|
43
site/layouts/_default/baseof.html
Normal file
43
site/layouts/_default/baseof.html
Normal file
|
@ -0,0 +1,43 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
{{- partial "head.html" . -}}
|
||||
<body>
|
||||
|
||||
{{- partial "header.html" . -}}
|
||||
|
||||
<div class="container-fluid">
|
||||
<div class="row">
|
||||
|
||||
<div class="docs-sidenav order-0 col-12 col-md-3 col-lg-2 col-xl-2 position-sticky border-right">
|
||||
{{- partial "menu.html" . -}}
|
||||
</div>
|
||||
|
||||
{{- if and (ne .Site.Params.toc false) (ne .Params.toc false) }}
|
||||
<div class="docs-toc large order-lg-2 order-md-0 order-xs-1 col-12 col-lg-3 col-xl-3 position-sticky border-left">
|
||||
{{- partial "tableofcontents.html" . -}}
|
||||
</div>
|
||||
<div class="main col-12 order-1 col-md-9 col-lg-9 col-xl-7 py-3">
|
||||
{{else}}
|
||||
<div class="main col-12 order-1 col-md-9 col-lg-10 col-xl-10 py-3">
|
||||
{{end}}
|
||||
|
||||
{{- block "main" . }}{{- end }}
|
||||
|
||||
<div class="row">
|
||||
{{- if and (ne .Site.Params.disableReadmoreNav true) (ne .Params.disableReadmoreNav true) -}}
|
||||
<div class="position-relative mx-auto col-lg-9">
|
||||
{{ partial "next-prev-page.html" . }}
|
||||
</div>
|
||||
{{- end -}}
|
||||
</div> <!-- /end of row -->
|
||||
|
||||
</div>
|
||||
|
||||
</div> <!-- /end of row -->
|
||||
|
||||
</div> <!-- /end of container -->
|
||||
|
||||
{{- partial "footer.html" . -}}
|
||||
|
||||
</body>
|
||||
</html>
|
1
site/themes/ace-documentation
Submodule
1
site/themes/ace-documentation
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 94982f7998875e98de9d00482a35dcbe6a5df5eb
|
Loading…
Reference in a new issue