mirror of
https://github.com/prometheus-operator/prometheus-operator.git
synced 2025-04-21 03:38:43 +00:00
Merge pull request #1450 from brancz/kube-prometheus-troubleshooting
kube-prometheus: Add troubleshooting and alertmanager docs
This commit is contained in:
commit
d08df5777c
6 changed files with 128 additions and 4 deletions
contrib/kube-prometheus
|
@ -26,14 +26,22 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m
|
|||
* [Compiling](#compiling)
|
||||
* [Configuration](#configuration)
|
||||
* [Customization](#customization)
|
||||
* [Alertmanager configuration](#alertmanager-configuration)
|
||||
* [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards)
|
||||
* [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress)
|
||||
* [Minikube Example](#minikube-example)
|
||||
* [Troubleshooting](#troubleshooting)
|
||||
* [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authN and authZ allows more fine grained and easier access control.
|
||||
|
||||
This means the kubelet configuration must contain these flags:
|
||||
|
||||
* `--authentication-token-webhook=true` This flag enables, that a `ServiceAccount` token can be used to authenticate against the kubelet(s).
|
||||
* `--authorization-mode=Webhook` This flag enables, that the kubelet will perform an RBAC request with the API to determine, whether the requesting entity (Prometheus in this case) is allow to access a resource, in specific for this project the `/metrics` endpoint.
|
||||
|
||||
### minikube
|
||||
|
||||
In order to just try out this stack, start minikube with the following command:
|
||||
|
@ -235,6 +243,49 @@ local daemonset = k.apps.v1beta2.daemonSet;
|
|||
}).nodeExporter.daemonset
|
||||
```
|
||||
|
||||
### Alertmanager configuration
|
||||
|
||||
The Alertmanager configuration is located in the `_config.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field.
|
||||
|
||||
[embedmd]:# (examples/alertmanager-config.jsonnet)
|
||||
```jsonnet
|
||||
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
||||
_config+:: {
|
||||
alertmanager+: {
|
||||
config: |||
|
||||
global:
|
||||
resolve_timeout: 10m
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: 'null'
|
||||
routes:
|
||||
- match:
|
||||
alertname: DeadMansSwitch
|
||||
receiver: 'null'
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|||,
|
||||
},
|
||||
},
|
||||
}).alertmanager.secret
|
||||
```
|
||||
|
||||
In the above example the configuration has been inlined, but can just as well be an external file imported in jsonnet via the `importstr` function.
|
||||
|
||||
[embedmd]:# (examples/alertmanager-config-external.jsonnet)
|
||||
```jsonnet
|
||||
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
||||
_config+:: {
|
||||
alertmanager+: {
|
||||
config: importstr 'alertmanager-config.yaml',
|
||||
},
|
||||
},
|
||||
}).alertmanager.secret
|
||||
```
|
||||
|
||||
### Customizing Prometheus alerting/recording rules and Grafana dashboards
|
||||
|
||||
See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide.
|
||||
|
@ -269,3 +320,19 @@ local kp =
|
|||
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
|
||||
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Error retrieving kubelet metrics
|
||||
|
||||
Should the Prometheus `/targets` page show kubelet targets, but not able to successfully scrape the metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets.
|
||||
|
||||
As described in the [prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
|
||||
|
||||
#### Authentication problem
|
||||
|
||||
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
|
||||
|
||||
#### Authorization problem
|
||||
|
||||
The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations.
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
||||
_config+:: {
|
||||
alertmanager+: {
|
||||
config: importstr 'alertmanager-config.yaml',
|
||||
},
|
||||
},
|
||||
}).alertmanager.secret
|
22
contrib/kube-prometheus/examples/alertmanager-config.jsonnet
Normal file
22
contrib/kube-prometheus/examples/alertmanager-config.jsonnet
Normal file
|
@ -0,0 +1,22 @@
|
|||
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
||||
_config+:: {
|
||||
alertmanager+: {
|
||||
config: |||
|
||||
global:
|
||||
resolve_timeout: 10m
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: 'null'
|
||||
routes:
|
||||
- match:
|
||||
alertname: DeadMansSwitch
|
||||
receiver: 'null'
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|||,
|
||||
},
|
||||
},
|
||||
}).alertmanager.secret
|
15
contrib/kube-prometheus/examples/alertmanager-config.yaml
Normal file
15
contrib/kube-prometheus/examples/alertmanager-config.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
# external alertmanager yaml
|
||||
global:
|
||||
resolve_timeout: 10m
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: 'null'
|
||||
routes:
|
||||
- match:
|
||||
alertname: DeadMansSwitch
|
||||
receiver: 'null'
|
||||
receivers:
|
||||
- name: 'null'
|
|
@ -1,7 +1,5 @@
|
|||
local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
|
||||
|
||||
local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'null'\n routes:\n - match:\n alertname: DeadMansSwitch\n receiver: 'null'\nreceivers:\n- name: 'null'\n";
|
||||
|
||||
{
|
||||
_config+:: {
|
||||
namespace: 'default',
|
||||
|
@ -16,7 +14,22 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by
|
|||
|
||||
alertmanager+:: {
|
||||
name: $._config.alertmanager.name,
|
||||
config: alertmanagerConfig,
|
||||
config: |||
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
route:
|
||||
group_by: ['job']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
receiver: 'null'
|
||||
routes:
|
||||
- match:
|
||||
alertname: DeadMansSwitch
|
||||
receiver: 'null'
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|||,
|
||||
replicas: 3,
|
||||
},
|
||||
},
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo=
|
||||
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: alertmanager-main
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue