1
0
Fork 0
mirror of https://github.com/prometheus-operator/prometheus-operator.git synced 2025-04-21 03:38:43 +00:00

Merge pull request from brancz/kube-prometheus-troubleshooting

kube-prometheus: Add troubleshooting and alertmanager docs
This commit is contained in:
Frederic Branczyk 2018-06-07 14:01:35 +02:00 committed by GitHub
commit d08df5777c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 128 additions and 4 deletions

View file

@ -26,14 +26,22 @@ This stack is meant for cluster monitoring, so it is pre-configured to collect m
* [Compiling](#compiling)
* [Configuration](#configuration)
* [Customization](#customization)
* [Alertmanager configuration](#alertmanager-configuration)
* [Customizing Prometheus alerting/recording rules and Grafana dashboards](#customizing-prometheus-alertingrecording-rules-and-grafana-dashboards)
* [Exposing Prometheus/Alermanager/Grafana via Ingress](#exposing-prometheusalermanagergrafana-via-ingress)
* [Minikube Example](#minikube-example)
* [Troubleshooting](#troubleshooting)
* [Error retrieving kubelet metrics](#error-retrieving-kubelet-metrics)
## Prerequisites
You will need a Kubernetes cluster, that's it! By default it is assumed, that the kubelet uses token authN and authZ, as otherwise Prometheus needs a client certificate, which gives it full access to the kubelet, rather than just the metrics. Token authN and authZ allows more fine grained and easier access control.
This means the kubelet configuration must contain these flags:
* `--authentication-token-webhook=true` This flag enables, that a `ServiceAccount` token can be used to authenticate against the kubelet(s).
* `--authorization-mode=Webhook` This flag enables, that the kubelet will perform an RBAC request with the API to determine, whether the requesting entity (Prometheus in this case) is allow to access a resource, in specific for this project the `/metrics` endpoint.
### minikube
In order to just try out this stack, start minikube with the following command:
@ -235,6 +243,49 @@ local daemonset = k.apps.v1beta2.daemonSet;
}).nodeExporter.daemonset
```
### Alertmanager configuration
The Alertmanager configuration is located in the `_config.alertmanager.config` configuration field. In order to set a custom Alertmanager configuration simply set this field.
[embedmd]:# (examples/alertmanager-config.jsonnet)
```jsonnet
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
alertmanager+: {
config: |||
global:
resolve_timeout: 10m
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'
|||,
},
},
}).alertmanager.secret
```
In the above example the configuration has been inlined, but can just as well be an external file imported in jsonnet via the `importstr` function.
[embedmd]:# (examples/alertmanager-config-external.jsonnet)
```jsonnet
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
alertmanager+: {
config: importstr 'alertmanager-config.yaml',
},
},
}).alertmanager.secret
```
### Customizing Prometheus alerting/recording rules and Grafana dashboards
See [developing Prometheus rules and Grafana dashboards](docs/developing-prometheus-rules-and-grafana-dashboards.md) guide.
@ -269,3 +320,19 @@ local kp =
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
```
## Troubleshooting
### Error retrieving kubelet metrics
Should the Prometheus `/targets` page show kubelet targets, but not able to successfully scrape the metrics, then most likely it is a problem with the authentication and authorization setup of the kubelets.
As described in the [prerequisites](#prerequisites) section, in order to retrieve metrics from the kubelet token authentication and authorization must be enabled. Some Kubernetes setup tools do not enable this by default.
#### Authentication problem
The Prometheus `/targets` page will show the kubelet job with the error `403 Unauthorized`, when token authentication is not enabled. Ensure, that the `--authentication-token-webhook=true` flag is enabled on all kubelet configurations.
#### Authorization problem
The Prometheus `/targets` page will show the kubelet job with the error `401 Unauthorized`, when token authorization is not enabled. Ensure that the `--authorization-mode=Webhook` flag is enabled on all kubelet configurations.

View file

@ -0,0 +1,7 @@
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
alertmanager+: {
config: importstr 'alertmanager-config.yaml',
},
},
}).alertmanager.secret

View file

@ -0,0 +1,22 @@
((import 'kube-prometheus/kube-prometheus.libsonnet') + {
_config+:: {
alertmanager+: {
config: |||
global:
resolve_timeout: 10m
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'
|||,
},
},
}).alertmanager.secret

View file

@ -0,0 +1,15 @@
# external alertmanager yaml
global:
resolve_timeout: 10m
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'

View file

@ -1,7 +1,5 @@
local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by: ['job']\n group_wait: 30s\n group_interval: 5m\n repeat_interval: 12h\n receiver: 'null'\n routes:\n - match:\n alertname: DeadMansSwitch\n receiver: 'null'\nreceivers:\n- name: 'null'\n";
{
_config+:: {
namespace: 'default',
@ -16,7 +14,22 @@ local alertmanagerConfig = "\nglobal:\n resolve_timeout: 5m\nroute:\n group_by
alertmanager+:: {
name: $._config.alertmanager.name,
config: alertmanagerConfig,
config: |||
global:
resolve_timeout: 5m
route:
group_by: ['job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- match:
alertname: DeadMansSwitch
receiver: 'null'
receivers:
- name: 'null'
|||,
replicas: 3,
},
},

View file

@ -1,6 +1,6 @@
apiVersion: v1
data:
alertmanager.yaml: Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCnJvdXRlOgogIGdyb3VwX2J5OiBbJ2pvYiddCiAgZ3JvdXBfd2FpdDogMzBzCiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByZWNlaXZlcjogJ251bGwnCiAgcm91dGVzOgogIC0gbWF0Y2g6CiAgICAgIGFsZXJ0bmFtZTogRGVhZE1hbnNTd2l0Y2gKICAgIHJlY2VpdmVyOiAnbnVsbCcKcmVjZWl2ZXJzOgotIG5hbWU6ICdudWxsJwo=
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
kind: Secret
metadata:
name: alertmanager-main