mirror of https://github.com/prometheus-operator/prometheus-operator.git
This patch introduces a new Custom Resource Definition to the Prometheus Operator - the Rule CRD. It addresses two main needs:

1. Validation of Prometheus (alerting and recording) rules at creation time via Kubernetes Custom Resource Definition validation.
2. Life-cycle management of an application's Prometheus rules alongside the application itself, inside the application's Kubernetes namespace, not necessarily the namespace of the scraping Prometheus instance.

A user defines Prometheus alerting and recording rules via a Kubernetes Custom Resource Definition. These Custom Resource Definitions can be fully validated by the Kubernetes API server at creation time via automatically generated OpenAPI specifications. Instead of restricting a Prometheus instance to selecting only Rule definitions inside its own namespace, the Prometheus specification is extended to also specify namespaces outside its own namespace in which to look for Rule Custom Resource Definitions.

---

Dependent technical changes:

- prometheus: Use github.com/jimmidyson/configmap-reload to reload rules.
- prometheus: Remove the Prometheus StatefulSet deletion function. Starting with K8s >= 1.8 this is handled via OwnerReferences.
- prometheus: Do not add the rule files' checksum to the Prometheus configuration secret.
- prometheus: Update the StatefulSet only on relevant changes. Instead of updating the Prometheus StatefulSet on every `sync()` run, only update it if the input parameters to `makeStatefulSet` change. Enforce this via a checksum of the parameters saved inside the annotations of the StatefulSet (see the sketch below).
- e2e/prometheus: Check how often resources (Secret, ConfigMap, Prometheus CRD, Service) are updated, to enforce that the Prometheus Operator only updates created resources when necessary.
- contrib/prometheus-config-reloader: Remove the logic for retrieving K8s ConfigMaps; these are now mounted into the pod directly.
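As an illustration of the checksum-based update gating mentioned in the list above, here is a minimal, hypothetical Go sketch, not the operator's actual code: hash the input parameters to `makeStatefulSet`, record the hash in an annotation on the StatefulSet, and skip the update whenever the recorded hash matches the current one. The annotation key and helper names (`inputChecksumAnnotation`, `checksumOf`, `needsUpdate`) are illustrative assumptions.

    package main

    import (
        "crypto/sha256"
        "encoding/json"
        "fmt"
    )

    // Hypothetical annotation key; the operator may use a different one.
    const inputChecksumAnnotation = "prometheus-operator-input-checksum"

    // checksumOf serializes the inputs that makeStatefulSet would receive
    // and hashes them, so any change to the inputs changes the checksum.
    // json.Marshal sorts map keys, so the serialization is deterministic.
    func checksumOf(inputs interface{}) (string, error) {
        b, err := json.Marshal(inputs)
        if err != nil {
            return "", err
        }
        return fmt.Sprintf("%x", sha256.Sum256(b)), nil
    }

    // needsUpdate compares the checksum recorded in the existing
    // StatefulSet's annotations against the checksum of the current inputs.
    func needsUpdate(annotations map[string]string, inputs interface{}) (bool, error) {
        sum, err := checksumOf(inputs)
        if err != nil {
            return false, err
        }
        return annotations[inputChecksumAnnotation] != sum, nil
    }

    func main() {
        existing := map[string]string{inputChecksumAnnotation: "stale-hash"}
        changed, err := needsUpdate(existing, map[string]string{"replicas": "3", "version": "v2.2.1"})
        if err != nil {
            panic(err)
        }
        fmt.Println("update needed:", changed) // true: the recorded checksum is stale
    }

In a sync loop built this way, the update call would only be issued when `needsUpdate` returns true, with the new checksum written back into the annotations; this is the kind of behavior the e2e resource-update-count checks mentioned above can enforce.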
495 lines · 14 KiB · Go
// Copyright 2016 The prometheus-operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "fmt"
    "log"
    "strconv"
    "strings"
    "testing"
    "time"

    "github.com/pkg/errors"
    appsv1 "k8s.io/api/apps/v1beta2"
    "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/util/intstr"
    "k8s.io/apimachinery/pkg/util/wait"

    testFramework "github.com/coreos/prometheus-operator/test/framework"
)

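// TestAlertmanagerCreateDeleteCluster creates a three-replica Alertmanager
// cluster, waits until it is ready, then deletes it and waits until it is gone.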
func TestAlertmanagerCreateDeleteCluster(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 3)); err != nil {
        t.Fatal(err)
    }

    if err := framework.DeleteAlertmanagerAndWaitUntilGone(ns, name); err != nil {
        t.Fatal(err)
    }
}

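// TestAlertmanagerScaling scales an Alertmanager cluster from three replicas
// up to five and back down to three, waiting for readiness after each resize.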
func TestAlertmanagerScaling(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 3)); err != nil {
        t.Fatal(err)
    }

    if err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 5)); err != nil {
        t.Fatal(err)
    }

    if err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, framework.MakeBasicAlertmanager(name, 3)); err != nil {
        t.Fatal(err)
    }
}

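// TestAlertmanagerVersionMigration migrates a single-replica Alertmanager from
// v0.14.0 to v0.15.0-rc.1 and back, waiting for readiness after each change.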
func TestAlertmanagerVersionMigration(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    name := "test"

    am := framework.MakeBasicAlertmanager(name, 1)
    am.Spec.Version = "v0.14.0"
    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, am); err != nil {
        t.Fatal(err)
    }

    am.Spec.Version = "v0.15.0-rc.1"
    if err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, am); err != nil {
        t.Fatal(err)
    }

    am.Spec.Version = "v0.14.0"
    if err := framework.UpdateAlertmanagerAndWaitUntilReady(ns, am); err != nil {
        t.Fatal(err)
    }
}

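// TestExposingAlertmanagerWithKubernetesAPI exposes an Alertmanager through a
// ClusterIP service and verifies it is reachable via the API server's service proxy.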
func TestExposingAlertmanagerWithKubernetesAPI(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    alertmanager := framework.MakeBasicAlertmanager("test-alertmanager", 1)
    alertmanagerService := framework.MakeAlertmanagerService(alertmanager.Name, "alertmanager-service", v1.ServiceTypeClusterIP)

    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, alertmanagerService); err != nil {
        t.Fatal(err)
    }

    proxyGet := framework.KubeClient.CoreV1().Services(ns).ProxyGet
    request := proxyGet("", alertmanagerService.Name, "web", "/", make(map[string]string))
    _, err := request.DoRaw()
    if err != nil {
        t.Fatal(err)
    }
}

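// TestMeshInitialization verifies, for both HA implementations, that every
// replica of a three-node Alertmanager cluster joins the cluster mesh.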
func TestMeshInitialization(t *testing.T) {
    t.Parallel()

    // Starting with Alertmanager v0.15.0, hashicorp/memberlist is used for HA.
    // Make sure both memberlist (>= v0.15.0) and mesh (< v0.15.0) work.
    amVersions := []string{"v0.14.0", "v0.15.0-rc.1"}

    for _, v := range amVersions {
        version := v
        t.Run(
            fmt.Sprintf("amVersion%v", strings.Replace(version, ".", "-", -1)),
            func(t *testing.T) {
                t.Parallel()
                ctx := framework.NewTestCtx(t)
                defer ctx.Cleanup(t)
                ns := ctx.CreateNamespace(t, framework.KubeClient)
                ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

                amClusterSize := 3
                alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))
                alertmanager.Spec.Version = version
                alertmanagerService := framework.MakeAlertmanagerService(alertmanager.Name, "alertmanager-service", v1.ServiceTypeClusterIP)

                if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
                    t.Fatal(err)
                }

                if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, alertmanagerService); err != nil {
                    t.Fatal(err)
                }

                for i := 0; i < amClusterSize; i++ {
                    name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
                    if err := framework.WaitForAlertmanagerInitializedMesh(ns, name, amClusterSize); err != nil {
                        t.Fatal(err)
                    }
                }
            },
        )
    }
}

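// TestAlertmanagerClusterGossipSilences creates a silence on one member of a
// three-node Alertmanager cluster and verifies that it is gossiped to all members.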
func TestAlertmanagerClusterGossipSilences(t *testing.T) {
    t.Parallel()
    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    amClusterSize := 3
    alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))
    alertmanager.Spec.Version = "v0.15.0-rc.1"

    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    for i := 0; i < amClusterSize; i++ {
        name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
        if err := framework.WaitForAlertmanagerInitializedMesh(ns, name, amClusterSize); err != nil {
            t.Fatal(err)
        }
    }

    silId, err := framework.CreateSilence(ns, "alertmanager-test-0")
    if err != nil {
        t.Fatalf("failed to create silence: %v", err)
    }

    for i := 0; i < amClusterSize; i++ {
        err = wait.Poll(time.Second, framework.DefaultTimeout, func() (bool, error) {
            silences, err := framework.GetSilences(ns, "alertmanager-"+alertmanager.Name+"-"+strconv.Itoa(i))
            if err != nil {
                return false, err
            }

            if len(silences) != 1 {
                return false, nil
            }

            if silences[0].ID != silId {
                return false, errors.Errorf("expected silence id on alertmanager %v to match id of created silence '%v' but got %v", i, silId, silences[0].ID)
            }
            return true, nil
        })
        if err != nil {
            t.Fatalf("could not retrieve created silence on alertmanager %v: %v", i, err)
        }
    }
}

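// TestAlertmanagerReloadConfig updates the Alertmanager configuration secret
// twice and verifies that the running Alertmanager picks up each configuration.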
func TestAlertmanagerReloadConfig(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    alertmanager := framework.MakeBasicAlertmanager("reload-config", 1)

    firstConfig := `
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://firstConfigWebHook:30500/'
`
    secondConfig := `
global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://secondConfigWebHook:30500/'
`

    cfg := &v1.Secret{
        ObjectMeta: metav1.ObjectMeta{
            Name: fmt.Sprintf("alertmanager-%s", alertmanager.Name),
        },
        Data: map[string][]byte{
            "alertmanager.yaml": []byte(firstConfig),
        },
    }

    if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
        t.Fatal(err)
    }

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Update(cfg); err != nil {
        t.Fatal(err)
    }

    firstExpectedString := "firstConfigWebHook"
    log.Println("waiting for first expected config")
    if err := framework.WaitForAlertmanagerConfigToContainString(ns, alertmanager.Name, firstExpectedString); err != nil {
        t.Fatal(err)
    }
    log.Println("first expected config found")

    cfg.Data["alertmanager.yaml"] = []byte(secondConfig)

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Update(cfg); err != nil {
        t.Fatal(err)
    }

    secondExpectedString := "secondConfigWebHook"

    log.Println("waiting for second expected config")
    if err := framework.WaitForAlertmanagerConfigToContainString(ns, alertmanager.Name, secondExpectedString); err != nil {
        t.Fatal(err)
    }
    log.Println("second expected config found")
}

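// TestAlertmanagerZeroDowntimeRollingDeployment lets Prometheus fire an alert
// at a three-node Alertmanager cluster, performs a rolling version update of
// the cluster, and verifies that the webhook receiver got exactly one
// notification, i.e. the update neither dropped nor duplicated the notification.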
func TestAlertmanagerZeroDowntimeRollingDeployment(t *testing.T) {
    t.Parallel()

    ctx := framework.NewTestCtx(t)
    defer ctx.Cleanup(t)
    ns := ctx.CreateNamespace(t, framework.KubeClient)
    ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)

    alertName := "ExampleAlert"

    // Deploy the webhook receiver that records incoming Alertmanager notifications.
    whReplicas := int32(1)
    whdpl := &appsv1.Deployment{
        ObjectMeta: metav1.ObjectMeta{
            Name: "alertmanager-webhook",
        },
        Spec: appsv1.DeploymentSpec{
            Replicas: &whReplicas,
            Selector: &metav1.LabelSelector{
                MatchLabels: map[string]string{
                    "app": "alertmanager-webhook",
                },
            },
            Template: v1.PodTemplateSpec{
                ObjectMeta: metav1.ObjectMeta{
                    Labels: map[string]string{
                        "app": "alertmanager-webhook",
                    },
                },
                Spec: v1.PodSpec{
                    Containers: []v1.Container{
                        {
                            Name:  "webhook-server",
                            Image: "quay.io/coreos/prometheus-alertmanager-test-webhook",
                            Ports: []v1.ContainerPort{
                                {
                                    Name:          "web",
                                    ContainerPort: 5001,
                                },
                            },
                        },
                    },
                },
            },
        },
    }
    whsvc := &v1.Service{
        ObjectMeta: metav1.ObjectMeta{
            Name: "alertmanager-webhook",
        },
        Spec: v1.ServiceSpec{
            Type: v1.ServiceTypeClusterIP,
            Ports: []v1.ServicePort{
                {
                    Name:       "web",
                    Port:       5001,
                    TargetPort: intstr.FromString("web"),
                },
            },
            Selector: map[string]string{
                "app": "alertmanager-webhook",
            },
        },
    }
    if err := testFramework.CreateDeployment(framework.KubeClient, ns, whdpl); err != nil {
        t.Fatal(err)
    }
    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, whsvc); err != nil {
        t.Fatal(err)
    }
    err := testFramework.WaitForPodsReady(framework.KubeClient, ns, time.Minute*5, 1,
        metav1.ListOptions{
            LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
                "app": "alertmanager-webhook",
            })).String(),
        },
    )
    if err != nil {
        t.Fatal(err)
    }

    // Create the Alertmanager cluster with a config that routes everything to the webhook.
    alertmanager := framework.MakeBasicAlertmanager("rolling-deploy", 3)
    alertmanager.Spec.Version = "v0.13.0"
    amsvc := framework.MakeAlertmanagerService(alertmanager.Name, "test", v1.ServiceTypeClusterIP)
    amcfg := &v1.Secret{
        ObjectMeta: metav1.ObjectMeta{
            Name: fmt.Sprintf("alertmanager-%s", alertmanager.Name),
        },
        Data: map[string][]byte{
            "alertmanager.yaml": []byte(fmt.Sprintf(`
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://%s.%s.svc:5001/'
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
`, whsvc.Name, ns)),
        },
    }

    if _, err := framework.KubeClient.CoreV1().Secrets(ns).Create(amcfg); err != nil {
        t.Fatal(err)
    }
    if _, err := framework.MonClientV1.Alertmanagers(ns).Create(alertmanager); err != nil {
        t.Fatal(err)
    }
    if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, amsvc); err != nil {
        t.Fatal(err)
    }

    // Create a Prometheus instance with a rule that fires immediately.
    p := framework.MakeBasicPrometheus(ns, "test", "test", 3)
    p.Spec.EvaluationInterval = "100ms"
    framework.AddAlertingToPrometheus(p, ns, alertmanager.Name)

    _, err = framework.MakeAndCreateFiringRuleFile(ns, p.Name, alertName)
    if err != nil {
        t.Fatal(err)
    }

    if err := framework.CreatePrometheusAndWaitUntilReady(ns, p); err != nil {
        t.Fatal(err)
    }

    pSVC := framework.MakePrometheusService(p.Name, "not-relevant", v1.ServiceTypeClusterIP)
    if finalizerFn, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, pSVC); err != nil {
        t.Fatal(errors.Wrap(err, "creating Prometheus service failed"))
    } else {
        ctx.AddFinalizerFn(finalizerFn)
    }

    // The Prometheus config reloader reloads Prometheus periodically, not on
    // alert rule change. Thereby one has to wait for Prometheus to actually
    // fire the alert.
    err = framework.WaitForPrometheusFiringAlert(p.Namespace, pSVC.Name, alertName)
    if err != nil {
        t.Fatal(err)
    }

    // Wait for the alert to propagate to Alertmanager and the webhook.
    time.Sleep(10 * time.Second)

    opts := metav1.ListOptions{
        LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
            "app": "alertmanager-webhook",
        })).String(),
    }
    pl, err := framework.KubeClient.CoreV1().Pods(ns).List(opts)
    if err != nil {
        t.Fatal(err)
    }

    if len(pl.Items) != 1 {
        t.Fatalf("Expected one webhook pod, but got %d", len(pl.Items))
    }

    podName := pl.Items[0].Name
    logs, err := testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
    if err != nil {
        t.Fatal(err)
    }

    c := strings.Count(logs, "Alertmanager Notification Payload Received")
    if c != 1 {
        t.Fatalf("One notification expected, but %d received.\n\n%s", c, logs)
    }

    // Trigger a rolling update of the Alertmanager cluster.
    alertmanager.Spec.Version = "v0.14.0"
    if _, err := framework.MonClientV1.Alertmanagers(ns).Update(alertmanager); err != nil {
        t.Fatal(err)
    }

    time.Sleep(1 * time.Minute)

    logs, err = testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
    if err != nil {
        t.Fatal(err)
    }

    c = strings.Count(logs, "Alertmanager Notification Payload Received")
    if c != 1 {
        t.Fatalf("Only one notification expected, but %d received after rolling update of Alertmanager cluster.\n\n%s", c, logs)
    }
}