// Copyright 2016 The prometheus-operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "context"
    "fmt"
    "strconv"
    "strings"
    "testing"
    "time"

    "github.com/golang/protobuf/proto"
    "github.com/pkg/errors"
    appsv1 "k8s.io/api/apps/v1"
    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/util/intstr"
    "k8s.io/apimachinery/pkg/util/wait"

    "github.com/prometheus-operator/prometheus-operator/pkg/alertmanager"
    monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
    testFramework "github.com/prometheus-operator/prometheus-operator/test/framework"
)
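
// testAMCreateDeleteCluster creates a 3-replica Alertmanager cluster, waits
// for it to become ready, then deletes it and waits until it is gone.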
func testAMCreateDeleteCluster(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    if _, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 3)); err != nil {
        t.Fatal(err)
    }

    if err := framework.DeleteAlertmanagerAndWaitUntilGone(ns, name); err != nil {
        t.Fatal(err)
    }
}
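
// testAMScaling scales an Alertmanager cluster from 3 to 5 replicas and back
// to 3, waiting for the cluster to become ready after each resize.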
func testAMScaling(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    a, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 3))
    if err != nil {
        t.Fatal(err)
    }

    a.Spec.Replicas = proto.Int32(5)
    a, err = framework.UpdateAlertmanagerAndWaitUntilReady(ns, a)
    if err != nil {
        t.Fatal(err)
    }

    a.Spec.Replicas = proto.Int32(3)
    if _, err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, a); err != nil {
        t.Fatal(err)
    }
}
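
// testAMVersionMigration upgrades a single-replica Alertmanager from v0.16.2
// to v0.17.0 and downgrades it again, waiting for readiness after each
// version change.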
func testAMVersionMigration(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    am := framework.MakeBasicAlertmanager(name, 1)
    am.Spec.Version = "v0.16.2"
    am, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, am)
    if err != nil {
        t.Fatal(err)
    }

    am.Spec.Version = "v0.17.0"
    am, err = framework.UpdateAlertmanagerAndWaitUntilReady(ns, am)
    if err != nil {
        t.Fatal(err)
    }

    am.Spec.Version = "v0.16.2"
    am, err = framework.UpdateAlertmanagerAndWaitUntilReady(ns, am)
    if err != nil {
        t.Fatal(err)
    }
}
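
// testAMStorageUpdate adds a persistent volume claim template to a running
// Alertmanager and waits until its pod mounts a PVC-backed storage volume.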
func testAMStorageUpdate(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)

    name := "test"

    am := framework.MakeBasicAlertmanager(name, 1)
    am, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, am)
    if err != nil {
        t.Fatal(err)
    }

    am.Spec.Storage = &monitoringv1.StorageSpec{
        VolumeClaimTemplate: monitoringv1.EmbeddedPersistentVolumeClaim{
            Spec: v1.PersistentVolumeClaimSpec{
                AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
                Resources: v1.ResourceRequirements{
                    Requests: v1.ResourceList{
                        v1.ResourceStorage: resource.MustParse("200Mi"),
                    },
                },
            },
        },
    }

    am, err = framework.MonClientV1.Alertmanagers(ns).Update(context.TODO(), am, metav1.UpdateOptions{})
    if err != nil {
        t.Fatal(err)
    }

    err = wait.Poll(5*time.Second, 2*time.Minute, func() (bool, error) {
        pods, err := framework.KubeClient.CoreV1().Pods(ns).List(context.TODO(), alertmanager.ListOptions(name))
        if err != nil {
            return false, err
        }

        if len(pods.Items) != 1 {
            return false, nil
        }

        for _, volume := range pods.Items[0].Spec.Volumes {
            if volume.Name == "alertmanager-"+name+"-db" && volume.PersistentVolumeClaim != nil && volume.PersistentVolumeClaim.ClaimName != "" {
                return true, nil
            }
        }

        return false, nil
    })
    if err != nil {
        t.Fatal(err)
    }
}
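
// testAMExposingWithKubernetesAPI exposes an Alertmanager via a ClusterIP
// service and checks that its web endpoint is reachable through the
// Kubernetes API service proxy.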
func testAMExposingWithKubernetesAPI(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    alertmanager := framework.MakeBasicAlertmanager("test-alertmanager", 1)
    alertmanagerService := framework.MakeAlertmanagerService(alertmanager.Name, "alertmanager-service", v1.ServiceTypeClusterIP)

    if _, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, alertmanagerService); err != nil {
        t.Fatal(err)
    }

    proxyGet := framework.KubeClient.CoreV1().Services(ns).ProxyGet
    request := proxyGet("", alertmanagerService.Name, "web", "/", make(map[string]string))
    _, err := request.DoRaw(context.TODO())
    if err != nil {
        t.Fatal(err)
    }
}
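
// testAMClusterInitialization creates a 3-replica Alertmanager cluster and
// waits until each instance reports itself initialized with the full cluster
// size.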
func testAMClusterInitialization(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    amClusterSize := 3
    alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))
    alertmanagerService := framework.MakeAlertmanagerService(alertmanager.Name, "alertmanager-service", v1.ServiceTypeClusterIP)

    // Print Alertmanager logs on failure.
    defer func() {
        if !t.Failed() {
            return
        }

        for i := 0; i < amClusterSize; i++ {
            err := framework.PrintPodLogs(ns, fmt.Sprintf("alertmanager-test-%v", strconv.Itoa(i)))
            if err != nil {
                t.Fatal(err)
            }
        }
    }()

    if _, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, alertmanagerService); err != nil {
        t.Fatal(err)
    }

    for i := 0; i < amClusterSize; i++ {
        name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
        if err := framework.WaitForAlertmanagerInitialized(ns, name, amClusterSize, alertmanager.Spec.ForceEnableClusterMode); err != nil {
            t.Fatal(err)
        }
    }
}

// testAMClusterAfterRollingUpdate tests whether all Alertmanager instances join
// the cluster after a rolling update, even though DNS records will probably be
// outdated at startup time. See
// https://github.com/prometheus/alertmanager/pull/1428 for more details.
func testAMClusterAfterRollingUpdate(t *testing.T) {
    var err error

    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)

    amClusterSize := 3
    alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))

    if alertmanager, err = framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    for i := 0; i < amClusterSize; i++ {
        name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
        if err := framework.WaitForAlertmanagerInitialized(ns, name, amClusterSize, alertmanager.Spec.ForceEnableClusterMode); err != nil {
            t.Fatal(err)
        }
    }

    // We need to force a rolling update, e.g. by changing one of the command
    // line flags via the Retention.
    alertmanager.Spec.Retention = "1h"
    if _, err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    for i := 0; i < amClusterSize; i++ {
        name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
        if err := framework.WaitForAlertmanagerInitialized(ns, name, amClusterSize, alertmanager.Spec.ForceEnableClusterMode); err != nil {
            t.Fatal(err)
        }
    }
}
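
// testAMClusterGossipSilences creates a silence on one Alertmanager instance
// and waits until it has been gossiped to every other instance in the
// cluster.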
func testAMClusterGossipSilences(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    amClusterSize := 3
    alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))

    if _, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    for i := 0; i < amClusterSize; i++ {
        name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
        if err := framework.WaitForAlertmanagerInitialized(ns, name, amClusterSize, alertmanager.Spec.ForceEnableClusterMode); err != nil {
            t.Fatal(err)
        }
    }

    silId, err := framework.CreateSilence(ns, "alertmanager-test-0")
    if err != nil {
        t.Fatalf("failed to create silence: %v", err)
    }

    for i := 0; i < amClusterSize; i++ {
        err = wait.Poll(time.Second, framework.DefaultTimeout, func() (bool, error) {
            silences, err := framework.GetSilences(ns, "alertmanager-"+alertmanager.Name+"-"+strconv.Itoa(i))
            if err != nil {
                return false, err
            }

            if len(silences) != 1 {
                return false, nil
            }

            if silences[0].ID != silId {
                return false, errors.Errorf("expected silence id on alertmanager %v to match id of created silence '%v' but got %v", i, silId, silences[0].ID)
            }

            return true, nil
        })
        if err != nil {
            t.Fatalf("could not retrieve created silence on alertmanager %v: %v", i, err)
        }
    }
}
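
// testAMReloadConfig verifies that updates to the Alertmanager configuration
// secret as well as to additional template ConfigMaps and Secrets are picked
// up and trigger a configuration reload.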
func testAMReloadConfig(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    alertmanager := framework.MakeBasicAlertmanager("reload-config", 1)
    templateResourceName := fmt.Sprintf("alertmanager-templates-%s", alertmanager.Name)
    alertmanager.Spec.ConfigMaps = []string{templateResourceName}
    alertmanager.Spec.Secrets = []string{templateResourceName}

    firstConfig := `
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://firstConfigWebHook:30500/'
`
    secondConfig := `
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://secondConfigWebHook:30500/'
`
    template := `
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="viewport" content="width=device-width" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>An Alert</title>
<style>
</style>
</head>
`
    secondTemplate := `
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="viewport" content="width=device-width" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>An Alert</title>
<style>
</style>
</head>
<body>
An Alert test
</body>
`

    cfg := &v1.Secret{
        ObjectMeta: metav1.ObjectMeta{
            Name: fmt.Sprintf("alertmanager-%s", alertmanager.Name),
        },
        Data: map[string][]byte{
            "alertmanager.yaml": []byte(firstConfig),
        },
    }

    templateFileKey := "test-emails.tmpl"
    templateSecretFileKey := "test-emails-secret.tmpl"

    templateCfg := &v1.ConfigMap{
        ObjectMeta: metav1.ObjectMeta{
            Name: templateResourceName,
        },
        Data: map[string]string{
            templateFileKey: template,
        },
    }

    templateSecret := &v1.Secret{
        ObjectMeta: metav1.ObjectMeta{
            Name: templateResourceName,
        },
        Data: map[string][]byte{
            templateSecretFileKey: []byte(template),
        },
    }

    if _, err := framework.KubeClient.CoreV1().ConfigMaps(ns).Create(context.TODO(), templateCfg, metav1.CreateOptions{}); err != nil {
        t.Fatal(err)
    }

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Create(context.TODO(), templateSecret, metav1.CreateOptions{}); err != nil {
        t.Fatal(err)
    }

    if _, err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Update(context.TODO(), cfg, metav1.UpdateOptions{}); err != nil {
        t.Fatal(err)
    }

    firstExpectedString := "firstConfigWebHook"
    if err := framework.WaitForAlertmanagerConfigToContainString(ns, alertmanager.Name, firstExpectedString); err != nil {
        t.Fatal(errors.Wrap(err, "failed to wait for first expected config"))
    }

    cfg.Data["alertmanager.yaml"] = []byte(secondConfig)
    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Update(context.TODO(), cfg, metav1.UpdateOptions{}); err != nil {
        t.Fatal(err)
    }

    secondExpectedString := "secondConfigWebHook"
    if err := framework.WaitForAlertmanagerConfigToContainString(ns, alertmanager.Name, secondExpectedString); err != nil {
        t.Fatal(errors.Wrap(err, "failed to wait for second expected config"))
    }

    priorToReloadTime := time.Now()
    templateCfg.Data[templateFileKey] = secondTemplate
    if _, err := framework.KubeClient.CoreV1().ConfigMaps(ns).Update(context.TODO(), templateCfg, metav1.UpdateOptions{}); err != nil {
        t.Fatal(err)
    }

    if err := framework.WaitForAlertmanagerConfigToBeReloaded(ns, alertmanager.Name, priorToReloadTime); err != nil {
        t.Fatal(errors.Wrap(err, "failed to wait for additional configMaps reload"))
    }

    priorToReloadTime = time.Now()
    templateSecret.Data[templateSecretFileKey] = []byte(secondTemplate)
    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Update(context.TODO(), templateSecret, metav1.UpdateOptions{}); err != nil {
        t.Fatal(err)
    }

    if err := framework.WaitForAlertmanagerConfigToBeReloaded(ns, alertmanager.Name, priorToReloadTime); err != nil {
        t.Fatal(errors.Wrap(err, "failed to wait for additional secrets reload"))
    }
}
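
// testAMZeroDowntimeRollingDeployment continuously sends alerts to a
// 3-replica Alertmanager cluster while forcing a rolling update, and uses a
// webhook receiver to check that exactly one notification is delivered, i.e.
// that no notification is lost or duplicated during the update.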
func testAMZeroDowntimeRollingDeployment(t *testing.T) {
    // Don't run Alertmanager tests in parallel. See
    // https://github.com/prometheus/alertmanager/issues/1835 for details.
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)

    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    whReplicas := int32(1)
    whdpl := &appsv1.Deployment{
        ObjectMeta: metav1.ObjectMeta{
            Name: "alertmanager-webhook",
        },
        Spec: appsv1.DeploymentSpec{
            Replicas: &whReplicas,
            Selector: &metav1.LabelSelector{
                MatchLabels: map[string]string{
                    "app": "alertmanager-webhook",
                },
            },
            Template: v1.PodTemplateSpec{
                ObjectMeta: metav1.ObjectMeta{
                    Labels: map[string]string{
                        "app": "alertmanager-webhook",
                    },
                },
                Spec: v1.PodSpec{
                    Containers: []v1.Container{
                        {
                            Name:  "webhook-server",
                            Image: "quay.io/coreos/prometheus-alertmanager-test-webhook",
                            Ports: []v1.ContainerPort{
                                {
                                    Name:          "web",
                                    ContainerPort: 5001,
                                },
                            },
                        },
                    },
                },
            },
        },
    }
    whsvc := &v1.Service{
        ObjectMeta: metav1.ObjectMeta{
            Name: "alertmanager-webhook",
        },
        Spec: v1.ServiceSpec{
            Type: v1.ServiceTypeClusterIP,
            Ports: []v1.ServicePort{
                {
                    Name:       "web",
                    Port:       5001,
                    TargetPort: intstr.FromString("web"),
                },
            },
            Selector: map[string]string{
                "app": "alertmanager-webhook",
            },
        },
    }

    if err := testFramework.CreateDeployment(framework.KubeClient, ns, whdpl); err != nil {
        t.Fatal(err)
    }
    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, whsvc); err != nil {
        t.Fatal(err)
    }
    err := testFramework.WaitForPodsReady(framework.KubeClient, ns, time.Minute*5, 1,
        metav1.ListOptions{
            LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
                "app": "alertmanager-webhook",
            })).String(),
        },
    )
    if err != nil {
        t.Fatal(err)
    }

    alertmanager := framework.MakeBasicAlertmanager("rolling-deploy", 3)
    amsvc := framework.MakeAlertmanagerService(alertmanager.Name, "test", v1.ServiceTypeClusterIP)
    amcfg := &v1.Secret{
        ObjectMeta: metav1.ObjectMeta{
            Name: fmt.Sprintf("alertmanager-%s", alertmanager.Name),
        },
        Data: map[string][]byte{
            "alertmanager.yaml": []byte(fmt.Sprintf(`
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://%s.%s.svc:5001/'
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
`, whsvc.Name, ns)),
        },
    }

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Create(context.TODO(), amcfg, metav1.CreateOptions{}); err != nil {
        t.Fatal(err)
    }

    alertmanager, err = framework.MonClientV1.Alertmanagers(ns).Create(context.TODO(), alertmanager, metav1.CreateOptions{})
    if err != nil {
        t.Fatal(err)
    }

    if err := framework.WaitForAlertmanagerReady(ns, alertmanager.Name, int(*alertmanager.Spec.Replicas), alertmanager.Spec.ForceEnableClusterMode); err != nil {
        t.Fatal(err)
    }

    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, amsvc); err != nil {
        t.Fatal(err)
    }

    // Send alert to each Alertmanager
    for i := 0; i < int(*alertmanager.Spec.Replicas); i++ {
        replica := i
        done := make(chan struct{})
        errc := make(chan error, 1)

        defer func() {
            close(done)
            select {
            case err := <-errc:
                t.Fatal(errors.Wrapf(err, "sending alert to alertmanager %v", replica))
            default:
                return
            }
        }()

        go func() {
            ticker := time.NewTicker(100 * time.Millisecond)
            start := time.Now()
            failures := 0
            for {
                select {
                case <-ticker.C:
                    err := framework.SendAlertToAlertmanager(
                        ns,
                        "alertmanager-rolling-deploy-"+strconv.Itoa(replica),
                        start,
                    )
                    if err != nil {
                        failures++
                        // Allow 50 (~5 Seconds) failures during Alertmanager rolling update.
                        if failures > 50 {
                            errc <- err
                            return
                        }
                    }
                case <-done:
                    return
                }
            }
        }()
    }

    // Wait for alert to propagate
    time.Sleep(30 * time.Second)

    opts := metav1.ListOptions{
        LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
            "app": "alertmanager-webhook",
        })).String(),
    }

    pl, err := framework.KubeClient.CoreV1().Pods(ns).List(context.TODO(), opts)
    if err != nil {
        t.Fatal(err)
    }

    if len(pl.Items) != 1 {
        t.Fatalf("Expected one webhook pod, but got %d", len(pl.Items))
    }

    podName := pl.Items[0].Name
    logs, err := testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
    if err != nil {
        t.Fatal(err)
    }

    c := strings.Count(logs, "Alertmanager Notification Payload Received")
    if c != 1 {
        t.Fatalf("One notification expected, but %d received.\n\n%s", c, logs)
    }

    // We need to force a rolling update, e.g. by changing one of the command
    // line flags via the Retention.
    alertmanager.Spec.Retention = "1h"
    if _, err := framework.MonClientV1.Alertmanagers(ns).Update(context.TODO(), alertmanager, metav1.UpdateOptions{}); err != nil {
        t.Fatal(err)
    }

    // Wait for the change above to take effect.
    time.Sleep(time.Minute)

    if err := framework.WaitForAlertmanagerReady(ns, alertmanager.Name, int(*alertmanager.Spec.Replicas), alertmanager.Spec.ForceEnableClusterMode); err != nil {
        t.Fatal(err)
    }

    time.Sleep(time.Minute)

    logs, err = testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
    if err != nil {
        t.Fatal(err)
    }

    c = strings.Count(logs, "Alertmanager Notification Payload Received")
    if c != 1 {
        t.Fatalf("Only one notification expected, but %d received after rolling update of Alertmanager cluster.\n\n%s", c, logs)
    }
}