mirror of
https://github.com/prometheus-operator/prometheus-operator.git
synced 2025-04-21 11:48:53 +00:00
pkg/alertmanager: Use lower value for --cluster.reconnect-timeout
Alertmanager in cluster mode resolves the DNS name of each peer and caches its IP address, which it then uses at regular intervals to 'refresh' the connection. In a highly dynamic environment like Kubernetes, Alertmanager pods may come and go frequently. The default timeout value of 6h is not suitable in that case, as Alertmanager will keep trying to reconnect to a non-existent pod over and over until it gives up and removes that peer from the member list. During this period, the cluster is reported to be in a degraded state due to the missing member. As such, it's best to use a lower value, which allows Alertmanager to remove the pod from the list of peers soon after it disappears. Related: https://github.com/prometheus/alertmanager/issues/2250
This commit is contained in:
parent
08a964702c
commit
86102e73e9
1 changed files with 6 additions and 0 deletions
|
@ -384,6 +384,12 @@ func makeStatefulSetSpec(a *monitoringv1.Alertmanager, config Config) (*appsv1.S
|
|||
// below Alertmanager v0.15.0 high availability flags are prefixed with 'mesh' instead of 'cluster'
|
||||
amArgs[i] = strings.Replace(amArgs[i], "--cluster.", "--mesh.", 1)
|
||||
}
|
||||
} else {
|
||||
// reconnect-timeout was added in 0.15 (https://github.com/prometheus/alertmanager/pull/1384)
|
||||
// Override default 6h value to allow AlertManager cluster to
|
||||
// quickly remove a cluster member after its pod restarted or during a
|
||||
// regular rolling update.
|
||||
amArgs = append(amArgs, "--cluster.reconnect-timeout=5m")
|
||||
}
|
||||
if version.Minor < 13 {
|
||||
for i := range amArgs {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue