// Copyright 2016 The prometheus-operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package prometheus

import (
	"bytes"
	"crypto/md5"
	"encoding/json"
	"fmt"
	"reflect"
	"strings"
	"time"

	"github.com/coreos/prometheus-operator/pkg/client/monitoring"
	monitoringv1 "github.com/coreos/prometheus-operator/pkg/client/monitoring/v1"
	"github.com/coreos/prometheus-operator/pkg/k8sutil"

	"github.com/go-kit/kit/log"
	"github.com/go-kit/kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	appsv1 "k8s.io/api/apps/v1beta2"
	"k8s.io/api/core/v1"
	extensionsobj "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
	apiextensionsclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
)

const (
	resyncPeriod = 5 * time.Minute
)

// Operator manages the life cycle of Prometheus deployments and
// monitoring configurations.
type Operator struct {
	kclient   kubernetes.Interface
	mclient   monitoring.Interface
	crdclient apiextensionsclient.Interface
	logger    log.Logger

	promInf cache.SharedIndexInformer
	smonInf cache.SharedIndexInformer
	ruleInf cache.SharedIndexInformer
	cmapInf cache.SharedIndexInformer
	secrInf cache.SharedIndexInformer
	ssetInf cache.SharedIndexInformer
	nsInf   cache.SharedIndexInformer

	queue workqueue.RateLimitingInterface

	statefulsetErrors prometheus.Counter

	host                   string
	kubeletObjectName      string
	kubeletObjectNamespace string
	kubeletSyncEnabled     bool
	config                 Config
}

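// Labels is a flag.Value implementation holding a comma-separated list of
// key=value pairs, kept both as the raw string and as a parsed map.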
type Labels struct {
	LabelsString string
	LabelsMap    map[string]string
}

// String implements the flag.Value interface.
func (labels *Labels) String() string {
	return labels.LabelsString
}

// Merge returns a new map with the given labels merged into the receiver's
// labels. The receiver's values take precedence on key conflicts.
func (labels *Labels) Merge(otherLabels map[string]string) map[string]string {
	mergedLabels := map[string]string{}

	for key, value := range otherLabels {
		mergedLabels[key] = value
	}

	for key, value := range labels.LabelsMap {
		mergedLabels[key] = value
	}
	return mergedLabels
}

// Set implements the flag.Value interface.
func (labels *Labels) Set(value string) error {
	m := map[string]string{}
	if value != "" {
		splited := strings.Split(value, ",")
		for _, pair := range splited {
			sp := strings.Split(pair, "=")
			m[sp[0]] = sp[1]
		}
	}
	(*labels).LabelsMap = m
	(*labels).LabelsString = value
	return nil
}

// Config defines configuration parameters for the Operator.
type Config struct {
	Host                         string
	KubeletObject                string
	TLSInsecure                  bool
	TLSConfig                    rest.TLSClientConfig
	ConfigReloaderImage          string
	PrometheusConfigReloader     string
	AlertmanagerDefaultBaseImage string
	PrometheusDefaultBaseImage   string
	ThanosDefaultBaseImage       string
	Namespace                    string
	Labels                       Labels
	CrdGroup                     string
	CrdKinds                     monitoringv1.CrdKinds
	EnableValidation             bool
	DisableAutoUserGroup         bool
	LocalHost                    string
	LogLevel                     string
	LogFormat                    string
	ManageCRDs                   bool
}

// BasicAuthCredentials holds a username/password pair extracted from a
// Kubernetes Secret for HTTP basic authentication.
type BasicAuthCredentials struct {
	username string
	password string
}

// New creates a new controller.
func New(conf Config, logger log.Logger) (*Operator, error) {
	cfg, err := k8sutil.NewClusterConfig(conf.Host, conf.TLSInsecure, &conf.TLSConfig)
	if err != nil {
		return nil, err
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return nil, err
	}

	mclient, err := monitoring.NewForConfig(&conf.CrdKinds, conf.CrdGroup, cfg)
	if err != nil {
		return nil, err
	}

	crdclient, err := apiextensionsclient.NewForConfig(cfg)
	if err != nil {
		return nil, errors.Wrap(err, "instantiating apiextensions client failed")
	}

	kubeletObjectName := ""
	kubeletObjectNamespace := ""
	kubeletSyncEnabled := false

	if conf.KubeletObject != "" {
		parts := strings.Split(conf.KubeletObject, "/")
		if len(parts) != 2 {
			return nil, fmt.Errorf("malformed kubelet object string, must be in format \"namespace/name\"")
		}
		kubeletObjectNamespace = parts[0]
		kubeletObjectName = parts[1]
		kubeletSyncEnabled = true
	}

	c := &Operator{
		kclient:                client,
		mclient:                mclient,
		crdclient:              crdclient,
		logger:                 logger,
		queue:                  workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "prometheus"),
		host:                   cfg.Host,
		kubeletObjectName:      kubeletObjectName,
		kubeletObjectNamespace: kubeletObjectNamespace,
		kubeletSyncEnabled:     kubeletSyncEnabled,
		config:                 conf,
	}

	c.promInf = cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc:  mclient.MonitoringV1().Prometheuses(c.config.Namespace).List,
			WatchFunc: mclient.MonitoringV1().Prometheuses(c.config.Namespace).Watch,
		},
		&monitoringv1.Prometheus{}, resyncPeriod, cache.Indexers{},
	)
	c.promInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleAddPrometheus,
		DeleteFunc: c.handleDeletePrometheus,
		UpdateFunc: c.handleUpdatePrometheus,
	})

	c.smonInf = cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc:  mclient.MonitoringV1().ServiceMonitors(c.config.Namespace).List,
			WatchFunc: mclient.MonitoringV1().ServiceMonitors(c.config.Namespace).Watch,
		},
		&monitoringv1.ServiceMonitor{}, resyncPeriod, cache.Indexers{},
	)
	c.smonInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleSmonAdd,
		DeleteFunc: c.handleSmonDelete,
		UpdateFunc: c.handleSmonUpdate,
	})

	c.ruleInf = cache.NewSharedIndexInformer(
		&cache.ListWatch{
			ListFunc:  mclient.MonitoringV1().PrometheusRules(c.config.Namespace).List,
			WatchFunc: mclient.MonitoringV1().PrometheusRules(c.config.Namespace).Watch,
		},
		&monitoringv1.PrometheusRule{}, resyncPeriod, cache.Indexers{},
	)
	c.ruleInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleRuleAdd,
		DeleteFunc: c.handleRuleDelete,
		UpdateFunc: c.handleRuleUpdate,
	})

	c.cmapInf = cache.NewSharedIndexInformer(
		cache.NewListWatchFromClient(c.kclient.Core().RESTClient(), "configmaps", c.config.Namespace, fields.Everything()),
		&v1.ConfigMap{}, resyncPeriod, cache.Indexers{},
	)
	c.cmapInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleConfigMapAdd,
		DeleteFunc: c.handleConfigMapDelete,
		UpdateFunc: c.handleConfigMapUpdate,
	})
	c.secrInf = cache.NewSharedIndexInformer(
		cache.NewListWatchFromClient(c.kclient.Core().RESTClient(), "secrets", c.config.Namespace, fields.Everything()),
		&v1.Secret{}, resyncPeriod, cache.Indexers{},
	)
	c.secrInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleSecretAdd,
		DeleteFunc: c.handleSecretDelete,
		UpdateFunc: c.handleSecretUpdate,
	})

	c.ssetInf = cache.NewSharedIndexInformer(
		cache.NewListWatchFromClient(c.kclient.AppsV1beta2().RESTClient(), "statefulsets", c.config.Namespace, fields.Everything()),
		&appsv1.StatefulSet{}, resyncPeriod, cache.Indexers{},
	)
	c.ssetInf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    c.handleAddStatefulSet,
		DeleteFunc: c.handleDeleteStatefulSet,
		UpdateFunc: c.handleUpdateStatefulSet,
	})

	c.nsInf = cache.NewSharedIndexInformer(
		cache.NewListWatchFromClient(c.kclient.Core().RESTClient(), "namespaces", metav1.NamespaceAll, fields.Everything()),
		&v1.Namespace{}, resyncPeriod, cache.Indexers{},
	)

	return c, nil
}

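// RegisterMetrics registers the operator's reconciliation error counter and a
// collector over the cached Prometheus objects with the given registerer.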
func (c *Operator) RegisterMetrics(r prometheus.Registerer) {
	c.statefulsetErrors = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_operator_prometheus_reconcile_errors_total",
		Help: "Number of errors that occurred while reconciling the prometheus statefulset",
	})

	r.MustRegister(
		c.statefulsetErrors,
		NewPrometheusCollector(c.promInf.GetStore()),
	)
}

// Run the controller.
func (c *Operator) Run(stopc <-chan struct{}) error {
	defer c.queue.ShutDown()

	errChan := make(chan error)
	go func() {
		v, err := c.kclient.Discovery().ServerVersion()
		if err != nil {
			errChan <- errors.Wrap(err, "communicating with server failed")
			return
		}
		level.Info(c.logger).Log("msg", "connection established", "cluster-version", v)

		if c.config.ManageCRDs {
			if err := c.createCRDs(); err != nil {
				errChan <- errors.Wrap(err, "creating CRDs failed")
				return
			}
		}
		errChan <- nil
	}()

	select {
	case err := <-errChan:
		if err != nil {
			return err
		}
		level.Info(c.logger).Log("msg", "CRD API endpoints ready")
	case <-stopc:
		return nil
	}

	go c.worker()

	go c.promInf.Run(stopc)
	go c.smonInf.Run(stopc)
	go c.ruleInf.Run(stopc)
	go c.cmapInf.Run(stopc)
	go c.secrInf.Run(stopc)
	go c.ssetInf.Run(stopc)
	if c.config.Namespace == v1.NamespaceAll {
		go c.nsInf.Run(stopc)
	}

	if c.kubeletSyncEnabled {
		go c.reconcileNodeEndpoints(stopc)
	}

	<-stopc
	return nil
}

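// keyFunc derives a cache key ("namespace/name") for the given object,
// returning false if the key cannot be constructed.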
func (c *Operator) keyFunc(obj interface{}) (string, bool) {
	k, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
	if err != nil {
		level.Error(c.logger).Log("msg", "creating key failed", "err", err)
		return k, false
	}
	return k, true
}

func (c *Operator) handleAddPrometheus(obj interface{}) {
	key, ok := c.keyFunc(obj)
	if !ok {
		return
	}

	level.Debug(c.logger).Log("msg", "Prometheus added", "key", key)
	c.enqueue(key)
}

func (c *Operator) handleDeletePrometheus(obj interface{}) {
	key, ok := c.keyFunc(obj)
	if !ok {
		return
	}

	level.Debug(c.logger).Log("msg", "Prometheus deleted", "key", key)
	c.enqueue(key)
}

func (c *Operator) handleUpdatePrometheus(old, cur interface{}) {
	key, ok := c.keyFunc(cur)
	if !ok {
		return
	}

	level.Debug(c.logger).Log("msg", "Prometheus updated", "key", key)
	c.enqueue(key)
}

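// reconcileNodeEndpoints periodically synchronizes the kubelet Service and
// Endpoints objects with the current set of nodes until stopc is closed.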
func (c *Operator) reconcileNodeEndpoints(stopc <-chan struct{}) {
	ticker := time.NewTicker(3 * time.Minute)
	defer ticker.Stop()
	for {
		select {
		case <-stopc:
			return
		case <-ticker.C:
			err := c.syncNodeEndpoints()
			if err != nil {
				level.Error(c.logger).Log("msg", "syncing nodes into Endpoints object failed", "err", err)
			}
		}
	}
}

// nodeAddress returns the provided node's address, based on the priority:
// 1. NodeInternalIP
// 2. NodeExternalIP
// 3. NodeLegacyHostIP
// 4. NodeHostName
//
// Copied from github.com/prometheus/prometheus/discovery/kubernetes/node.go
func nodeAddress(node v1.Node) (string, map[v1.NodeAddressType][]string, error) {
	m := map[v1.NodeAddressType][]string{}
	for _, a := range node.Status.Addresses {
		m[a.Type] = append(m[a.Type], a.Address)
	}

	if addresses, ok := m[v1.NodeInternalIP]; ok {
		return addresses[0], m, nil
	}
	if addresses, ok := m[v1.NodeExternalIP]; ok {
		return addresses[0], m, nil
	}
	if addresses, ok := m[v1.NodeHostName]; ok {
		return addresses[0], m, nil
	}
	return "", m, fmt.Errorf("host address unknown")
}

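// syncNodeEndpoints lists all nodes and writes a headless kubelet Service and
// a matching Endpoints object, with one address per node and ports for the
// kubelet metrics and cAdvisor endpoints.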
func (c *Operator) syncNodeEndpoints() error {
	eps := &v1.Endpoints{
		ObjectMeta: metav1.ObjectMeta{
			Name: c.kubeletObjectName,
			Labels: c.config.Labels.Merge(map[string]string{
				"k8s-app": "kubelet",
			}),
		},
		Subsets: []v1.EndpointSubset{
			{
				Ports: []v1.EndpointPort{
					{
						Name: "https-metrics",
						Port: 10250,
					},
					{
						Name: "http-metrics",
						Port: 10255,
					},
					{
						Name: "cadvisor",
						Port: 4194,
					},
				},
			},
		},
	}

	nodes, err := c.kclient.CoreV1().Nodes().List(metav1.ListOptions{})
	if err != nil {
		return errors.Wrap(err, "listing nodes failed")
	}

	for _, n := range nodes.Items {
		address, _, err := nodeAddress(n)
		if err != nil {
			return errors.Wrapf(err, "failed to determine hostname for node (%s)", n.Name)
		}
		eps.Subsets[0].Addresses = append(eps.Subsets[0].Addresses, v1.EndpointAddress{
			IP: address,
			TargetRef: &v1.ObjectReference{
				Kind:       "Node",
				Name:       n.Name,
				UID:        n.UID,
				APIVersion: n.APIVersion,
			},
		})
	}

	svc := &v1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name: c.kubeletObjectName,
			Labels: c.config.Labels.Merge(map[string]string{
				"k8s-app": "kubelet",
			}),
		},
		Spec: v1.ServiceSpec{
			Type:      v1.ServiceTypeClusterIP,
			ClusterIP: "None",
			Ports: []v1.ServicePort{
				{
					Name: "https-metrics",
					Port: 10250,
				},
			},
		},
	}

	err = k8sutil.CreateOrUpdateService(c.kclient.CoreV1().Services(c.kubeletObjectNamespace), svc)
	if err != nil {
		return errors.Wrap(err, "synchronizing kubelet service object failed")
	}

	err = k8sutil.CreateOrUpdateEndpoints(c.kclient.CoreV1().Endpoints(c.kubeletObjectNamespace), eps)
	if err != nil {
		return errors.Wrap(err, "synchronizing kubelet endpoints object failed")
	}

	return nil
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleSmonAdd(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleSmonUpdate(old, cur interface{}) {
	o, ok := c.getObject(cur)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleSmonDelete(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleRuleAdd(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleRuleUpdate(old, cur interface{}) {
	o, ok := c.getObject(cur)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Don't enqueue just for the namespace
func (c *Operator) handleRuleDelete(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Do we need to enqueue secrets just for the namespace or in general?
func (c *Operator) handleSecretDelete(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

func (c *Operator) handleSecretUpdate(old, cur interface{}) {
	o, ok := c.getObject(cur)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

func (c *Operator) handleSecretAdd(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

// TODO: Do we need to enqueue configmaps just for the namespace or in general?
func (c *Operator) handleConfigMapAdd(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

func (c *Operator) handleConfigMapDelete(obj interface{}) {
	o, ok := c.getObject(obj)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

func (c *Operator) handleConfigMapUpdate(old, cur interface{}) {
	o, ok := c.getObject(cur)
	if ok {
		c.enqueueForNamespace(o.GetNamespace())
	}
}

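// getObject unwraps a possible DeletedFinalStateUnknown tombstone and returns
// the object's metadata accessor, or false if it cannot be obtained.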
func (c *Operator) getObject(obj interface{}) (metav1.Object, bool) {
	ts, ok := obj.(cache.DeletedFinalStateUnknown)
	if ok {
		obj = ts.Obj
	}

	o, err := meta.Accessor(obj)
	if err != nil {
		level.Error(c.logger).Log("msg", "get object failed", "err", err)
		return nil, false
	}
	return o, true
}

// enqueue adds a key to the queue. If obj is a key already it gets added
// directly. Otherwise, the key is extracted via keyFunc.
func (c *Operator) enqueue(obj interface{}) {
	if obj == nil {
		return
	}

	key, ok := obj.(string)
	if !ok {
		key, ok = c.keyFunc(obj)
		if !ok {
			return
		}
	}

	c.queue.Add(key)
}

// enqueueForNamespace enqueues all Prometheus object keys that belong to the given namespace.
func (c *Operator) enqueueForNamespace(ns string) {
	cache.ListAll(c.promInf.GetStore(), labels.Everything(), func(obj interface{}) {
		p := obj.(*monitoringv1.Prometheus)
		if p.Namespace == ns {
			c.enqueue(p)
		}
	})
}

// worker runs a worker thread that just dequeues items, processes them, and
// marks them done. It enforces that the syncHandler is never invoked
// concurrently with the same key.
func (c *Operator) worker() {
	for c.processNextWorkItem() {
	}
}

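// processNextWorkItem dequeues a single key, syncs it, and requeues it with
// rate limiting on failure. It returns false once the queue is shut down.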
func (c *Operator) processNextWorkItem() bool {
	key, quit := c.queue.Get()
	if quit {
		return false
	}
	defer c.queue.Done(key)

	err := c.sync(key.(string))
	if err == nil {
		c.queue.Forget(key)
		return true
	}

	c.statefulsetErrors.Inc()
	utilruntime.HandleError(errors.Wrap(err, fmt.Sprintf("Sync %q failed", key)))
	c.queue.AddRateLimited(key)

	return true
}

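// prometheusForStatefulSet returns the Prometheus object that governs the
// given StatefulSet, or nil if none is found in the cache.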
func (c *Operator) prometheusForStatefulSet(sset interface{}) *monitoringv1.Prometheus {
	key, ok := c.keyFunc(sset)
	if !ok {
		return nil
	}

	promKey := statefulSetKeyToPrometheusKey(key)
	p, exists, err := c.promInf.GetStore().GetByKey(promKey)
	if err != nil {
		level.Error(c.logger).Log("msg", "Prometheus lookup failed", "err", err)
		return nil
	}
	if !exists {
		return nil
	}
	return p.(*monitoringv1.Prometheus)
}

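// The helpers below translate between Prometheus object names/keys and the
// names/keys of the StatefulSets created for them, which are prefixed with
// "prometheus-".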
func prometheusNameFromStatefulSetName(name string) string {
	return strings.TrimPrefix(name, "prometheus-")
}

func statefulSetNameFromPrometheusName(name string) string {
	return "prometheus-" + name
}

func statefulSetKeyToPrometheusKey(key string) string {
	keyParts := strings.Split(key, "/")
	return keyParts[0] + "/" + strings.TrimPrefix(keyParts[1], "prometheus-")
}

func prometheusKeyToStatefulSetKey(key string) string {
	keyParts := strings.Split(key, "/")
	return keyParts[0] + "/prometheus-" + keyParts[1]
}

func (c *Operator) handleDeleteStatefulSet(obj interface{}) {
	if ps := c.prometheusForStatefulSet(obj); ps != nil {
		c.enqueue(ps)
	}
}

func (c *Operator) handleAddStatefulSet(obj interface{}) {
	if ps := c.prometheusForStatefulSet(obj); ps != nil {
		c.enqueue(ps)
	}
}

func (c *Operator) handleUpdateStatefulSet(oldo, curo interface{}) {
	old := oldo.(*appsv1.StatefulSet)
	cur := curo.(*appsv1.StatefulSet)

	level.Debug(c.logger).Log("msg", "update handler", "old", old.ResourceVersion, "cur", cur.ResourceVersion)

	// Periodic resync may resend the StatefulSet without changes in-between.
	// Also breaks loops created by updating the resource ourselves.
	if old.ResourceVersion == cur.ResourceVersion {
		return
	}

	if ps := c.prometheusForStatefulSet(cur); ps != nil {
		c.enqueue(ps)
	}
}

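// sync reconciles a single Prometheus object identified by key: it regenerates
// the configuration Secret, ensures the governing Service exists, and creates
// or updates the corresponding StatefulSet.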
func (c *Operator) sync(key string) error {
	obj, exists, err := c.promInf.GetIndexer().GetByKey(key)
	if err != nil {
		return err
	}
	if !exists {
		// Dependent resources are cleaned up by K8s via OwnerReferences
		return nil
	}

	p := obj.(*monitoringv1.Prometheus)
	if p.Spec.Paused {
		return nil
	}

	level.Info(c.logger).Log("msg", "sync prometheus", "key", key)

	// TODO: Remove migration with Prometheus Operator v0.21.0
	err = c.migrateRuleConfigMapsToRuleCRDs(p)
	if err != nil {
		return err
	}

	ruleConfigMapNames, err := c.createOrUpdateRuleConfigMaps(p)
	if err != nil {
		return err
	}

	// If no service monitor selectors are configured, the user wants to manage
	// configuration themselves.
	if p.Spec.ServiceMonitorSelector != nil {
		// We just always regenerate the configuration to be safe.
		if err := c.createOrUpdateConfigurationSecret(p, ruleConfigMapNames); err != nil {
			return errors.Wrap(err, "creating config failed")
		}
	}

	// Create empty Secret if it doesn't exist. See comment above.
	s, err := makeEmptyConfigurationSecret(p, c.config)
	if err != nil {
		return errors.Wrap(err, "generating empty config secret failed")
	}
	sClient := c.kclient.CoreV1().Secrets(p.Namespace)
	_, err = sClient.Get(s.Name, metav1.GetOptions{})
	if apierrors.IsNotFound(err) {
		if _, err := c.kclient.Core().Secrets(p.Namespace).Create(s); err != nil && !apierrors.IsAlreadyExists(err) {
			return errors.Wrap(err, "creating empty config file failed")
		}
		return err
	}

	// Create governing service if it doesn't exist.
	svcClient := c.kclient.Core().Services(p.Namespace)
	if err := k8sutil.CreateOrUpdateService(svcClient, makeStatefulSetService(p, c.config)); err != nil {
		return errors.Wrap(err, "synchronizing governing service failed")
	}

	ssetClient := c.kclient.AppsV1beta2().StatefulSets(p.Namespace)
	// Ensure we have a StatefulSet running Prometheus deployed.
	obj, exists, err = c.ssetInf.GetIndexer().GetByKey(prometheusKeyToStatefulSetKey(key))
	if err != nil {
		return errors.Wrap(err, "retrieving statefulset failed")
	}

	newSSetInputChecksum, err := createSSetInputChecksum(*p, c.config, ruleConfigMapNames)
	if err != nil {
		return err
	}

	if !exists {
		level.Debug(c.logger).Log("msg", "no current Prometheus statefulset found")
		sset, err := makeStatefulSet(*p, "", &c.config, ruleConfigMapNames, newSSetInputChecksum)
		if err != nil {
			return errors.Wrap(err, "making statefulset failed")
		}

		level.Debug(c.logger).Log("msg", "creating Prometheus statefulset")
		if _, err := ssetClient.Create(sset); err != nil {
			return errors.Wrap(err, "creating statefulset failed")
		}
		return nil
	}

	oldSSetInputChecksum := obj.(*appsv1.StatefulSet).ObjectMeta.Annotations[sSetInputChecksumName]
	if newSSetInputChecksum == oldSSetInputChecksum {
		level.Debug(c.logger).Log("msg", "new statefulset generation inputs match current, skipping any actions")
		return nil
	}

	sset, err := makeStatefulSet(*p, obj.(*appsv1.StatefulSet).Spec.PodManagementPolicy, &c.config, ruleConfigMapNames, newSSetInputChecksum)
	if err != nil {
		return errors.Wrap(err, "making statefulset failed")
	}

	level.Debug(c.logger).Log("msg", "updating current Prometheus statefulset")
	if _, err := ssetClient.Update(sset); err != nil {
		return errors.Wrap(err, "updating statefulset failed")
	}

	return nil
}

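// createSSetInputChecksum computes an MD5 checksum over the Prometheus spec,
// operator config, and rule ConfigMap names; it is stored as a StatefulSet
// annotation so sync can skip regeneration when the inputs are unchanged.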
func createSSetInputChecksum(p monitoringv1.Prometheus, c Config, ruleConfigMapNames []string) (string, error) {
	json, err := json.Marshal(
		struct {
			P monitoringv1.Prometheus
			C Config
			R []string
		}{p, c, ruleConfigMapNames},
	)
	if err != nil {
		return "", errors.Wrap(err, "failed to marshal Prometheus CRD and config to json")
	}

	return fmt.Sprintf("%x", md5.Sum(json)), nil
}

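// ListOptions returns list options selecting objects labeled as part of the
// Prometheus deployment with the given name.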
func ListOptions(name string) metav1.ListOptions {
	return metav1.ListOptions{
		LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
			"app":        "prometheus",
			"prometheus": name,
		})).String(),
	}
}

// PrometheusStatus evaluates the current status of a Prometheus deployment with
// respect to its specified resource object. It returns the status and a list of
// pods that are not updated.
func PrometheusStatus(kclient kubernetes.Interface, p *monitoringv1.Prometheus) (*monitoringv1.PrometheusStatus, []v1.Pod, error) {
	res := &monitoringv1.PrometheusStatus{Paused: p.Spec.Paused}

	pods, err := kclient.Core().Pods(p.Namespace).List(ListOptions(p.Name))
	if err != nil {
		return nil, nil, errors.Wrap(err, "retrieving pods failed")
	}
	sset, err := kclient.AppsV1beta2().StatefulSets(p.Namespace).Get(statefulSetNameFromPrometheusName(p.Name), metav1.GetOptions{})
	if err != nil {
		return nil, nil, errors.Wrap(err, "retrieving stateful set failed")
	}

	res.Replicas = int32(len(pods.Items))

	var oldPods []v1.Pod
	for _, pod := range pods.Items {
		ready, err := k8sutil.PodRunningAndReady(pod)
		if err != nil {
			return nil, nil, errors.Wrap(err, "cannot determine pod ready state")
		}
		if ready {
			res.AvailableReplicas++
			// TODO(fabxc): detect other fields of the pod template that are mutable.
			if needsUpdate(&pod, sset.Spec.Template) {
				oldPods = append(oldPods, pod)
			} else {
				res.UpdatedReplicas++
			}
			continue
		}
		res.UnavailableReplicas++
	}

	return res, oldPods, nil
}

// needsUpdate checks whether the given pod conforms with the pod template spec
// for various attributes that are influenced by the Prometheus CRD settings.
func needsUpdate(pod *v1.Pod, tmpl v1.PodTemplateSpec) bool {
	c1 := pod.Spec.Containers[0]
	c2 := tmpl.Spec.Containers[0]

	if c1.Image != c2.Image {
		return true
	}
	if !reflect.DeepEqual(c1.Args, c2.Args) {
		return true
	}

	return false
}

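// loadAdditionalScrapeConfigsSecret looks up the referenced key in the given
// Secret list and returns its raw contents, or nil if no selector is set.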
func loadAdditionalScrapeConfigsSecret(additionalScrapeConfigs *v1.SecretKeySelector, s *v1.SecretList) ([]byte, error) {
	if additionalScrapeConfigs != nil {
		for _, secret := range s.Items {
			if secret.Name == additionalScrapeConfigs.Name {
				if c, ok := secret.Data[additionalScrapeConfigs.Key]; ok {
					return c, nil
				}

				return nil, fmt.Errorf("key %v could not be found in Secret %v.", additionalScrapeConfigs.Key, additionalScrapeConfigs.Name)
			}
		}
		return nil, fmt.Errorf("secret %v could not be found.", additionalScrapeConfigs.Name)
	}
	return nil, nil
}

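// loadBasicAuthSecret resolves the username and password referenced by the
// given BasicAuth spec from the provided Secret list.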
func loadBasicAuthSecret(basicAuth *monitoringv1.BasicAuth, s *v1.SecretList) (BasicAuthCredentials, error) {
	var username string
	var password string

	for _, secret := range s.Items {
		if secret.Name == basicAuth.Username.Name {
			if u, ok := secret.Data[basicAuth.Username.Key]; ok {
				username = string(u)
			} else {
				return BasicAuthCredentials{}, fmt.Errorf("Secret username key %q in secret %q not found.", basicAuth.Username.Key, secret.Name)
			}
		}

		if secret.Name == basicAuth.Password.Name {
			if p, ok := secret.Data[basicAuth.Password.Key]; ok {
				password = string(p)
			} else {
				return BasicAuthCredentials{}, fmt.Errorf("Secret password key %q in secret %q not found.", basicAuth.Password.Key, secret.Name)
			}
		}
		if username != "" && password != "" {
			break
		}
	}

	if username == "" && password == "" {
		return BasicAuthCredentials{}, fmt.Errorf("BasicAuth username and password secret not found.")
	}

	return BasicAuthCredentials{username: username, password: password}, nil
}

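// loadBasicAuthSecrets collects basic auth credentials for all ServiceMonitor
// endpoints as well as remote read/write configurations, keyed by the
// identifier used in the generated Prometheus configuration.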
func (c *Operator) loadBasicAuthSecrets(mons map[string]*monitoringv1.ServiceMonitor, remoteReads []monitoringv1.RemoteReadSpec, remoteWrites []monitoringv1.RemoteWriteSpec, s *v1.SecretList) (map[string]BasicAuthCredentials, error) {
	secrets := map[string]BasicAuthCredentials{}

	for _, mon := range mons {
		for i, ep := range mon.Spec.Endpoints {
			if ep.BasicAuth != nil {
				if credentials, err := loadBasicAuthSecret(ep.BasicAuth, s); err != nil {
					return nil, fmt.Errorf("Could not generate basicAuth for servicemonitor %s. %s", mon.Name, err)
				} else {
					secrets[fmt.Sprintf("serviceMonitor/%s/%s/%d", mon.Namespace, mon.Name, i)] = credentials
				}
			}
		}
	}

	for i, remote := range remoteReads {
		if remote.BasicAuth != nil {
			if credentials, err := loadBasicAuthSecret(remote.BasicAuth, s); err != nil {
				return nil, fmt.Errorf("Could not generate basicAuth for remote_read config %d. %s", i, err)
			} else {
				secrets[fmt.Sprintf("remoteRead/%d", i)] = credentials
			}
		}
	}

	for i, remote := range remoteWrites {
		if remote.BasicAuth != nil {
			if credentials, err := loadBasicAuthSecret(remote.BasicAuth, s); err != nil {
				return nil, fmt.Errorf("Could not generate basicAuth for remote_write config %d. %s", i, err)
			} else {
				secrets[fmt.Sprintf("remoteWrite/%d", i)] = credentials
			}
		}
	}

	return secrets, nil
}

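// createOrUpdateConfigurationSecret regenerates the Prometheus configuration
// from the selected ServiceMonitors and writes it into the configuration
// Secret, creating the Secret if necessary and skipping the update when the
// content is unchanged.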
func (c *Operator) createOrUpdateConfigurationSecret(p *monitoringv1.Prometheus, ruleConfigMapNames []string) error {
	smons, err := c.selectServiceMonitors(p)
	if err != nil {
		return errors.Wrap(err, "selecting ServiceMonitors failed")
	}

	sClient := c.kclient.CoreV1().Secrets(p.Namespace)

	listSecrets, err := sClient.List(metav1.ListOptions{})
	if err != nil {
		return err
	}

	basicAuthSecrets, err := c.loadBasicAuthSecrets(smons, p.Spec.RemoteRead, p.Spec.RemoteWrite, listSecrets)
	if err != nil {
		return err
	}

	additionalScrapeConfigs, err := loadAdditionalScrapeConfigsSecret(p.Spec.AdditionalScrapeConfigs, listSecrets)
	if err != nil {
		return errors.Wrap(err, "loading additional scrape configs from Secret failed")
	}
	additionalAlertManagerConfigs, err := loadAdditionalScrapeConfigsSecret(p.Spec.AdditionalAlertManagerConfigs, listSecrets)
	if err != nil {
		return errors.Wrap(err, "loading additional alert manager configs from Secret failed")
	}

	// Update secret based on the most recent configuration.
	conf, err := generateConfig(
		p,
		smons,
		basicAuthSecrets,
		additionalScrapeConfigs,
		additionalAlertManagerConfigs,
		ruleConfigMapNames,
	)
	if err != nil {
		return errors.Wrap(err, "generating config failed")
	}

	s := makeConfigSecret(p, c.config)
	s.ObjectMeta.Annotations = map[string]string{
		"generated": "true",
	}
	s.Data[configFilename] = []byte(conf)

	curSecret, err := sClient.Get(s.Name, metav1.GetOptions{})
	if apierrors.IsNotFound(err) {
		level.Debug(c.logger).Log("msg", "creating configuration")
		_, err = sClient.Create(s)
		return err
	}

	var (
		generatedConf             = s.Data[configFilename]
		curConfig, curConfigFound = curSecret.Data[configFilename]
	)
	if curConfigFound {
		if bytes.Equal(curConfig, generatedConf) {
			level.Debug(c.logger).Log("msg", "updating Prometheus configuration secret skipped, no configuration change")
			return nil
		} else {
			level.Debug(c.logger).Log("msg", "current Prometheus configuration has changed")
		}
	} else {
		level.Debug(c.logger).Log("msg", "no current Prometheus configuration secret found", "currentConfigFound", curConfigFound)
	}

	level.Debug(c.logger).Log("msg", "updating Prometheus configuration secret")
	_, err = sClient.Update(s)
	return err
}

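// selectServiceMonitors returns the ServiceMonitors matching the Prometheus
// object's selectors, keyed by "namespace/name". If no namespace selector is
// set, only the Prometheus object's own namespace is searched.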
func (c *Operator) selectServiceMonitors(p *monitoringv1.Prometheus) (map[string]*monitoringv1.ServiceMonitor, error) {
	namespaces := []string{}
	// Selectors might overlap. Deduplicate them along the keyFunc.
	res := make(map[string]*monitoringv1.ServiceMonitor)

	servMonSelector, err := metav1.LabelSelectorAsSelector(p.Spec.ServiceMonitorSelector)
	if err != nil {
		return nil, err
	}

	// If 'ServiceMonitorNamespaceSelector' is nil, only check own namespace.
	if p.Spec.ServiceMonitorNamespaceSelector == nil {
		namespaces = append(namespaces, p.Namespace)
	} else {
		servMonNSSelector, err := metav1.LabelSelectorAsSelector(p.Spec.ServiceMonitorNamespaceSelector)
		if err != nil {
			return nil, err
		}

		cache.ListAll(c.nsInf.GetStore(), servMonNSSelector, func(obj interface{}) {
			namespaces = append(namespaces, obj.(*v1.Namespace).Name)
		})
	}

	level.Debug(c.logger).Log("msg", "filtering namespaces to select ServiceMonitors from", "namespaces", strings.Join(namespaces, ","), "namespace", p.Namespace, "prometheus", p.Name)

	for _, ns := range namespaces {
		cache.ListAllByNamespace(c.smonInf.GetIndexer(), ns, servMonSelector, func(obj interface{}) {
			k, ok := c.keyFunc(obj)
			if ok {
				res[k] = obj.(*monitoringv1.ServiceMonitor)
			}
		})
	}

	serviceMonitors := []string{}
	for k := range res {
		serviceMonitors = append(serviceMonitors, k)
	}
	level.Debug(c.logger).Log("msg", "selected ServiceMonitors", "servicemonitors", strings.Join(serviceMonitors, ","), "namespace", p.Namespace, "prometheus", p.Name)

	return res, nil
}

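// createCRDs creates or updates the Prometheus, ServiceMonitor, and
// PrometheusRule CustomResourceDefinitions and waits until their API
// endpoints are ready.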
func (c *Operator) createCRDs() error {
	crds := []*extensionsobj.CustomResourceDefinition{
		k8sutil.NewCustomResourceDefinition(c.config.CrdKinds.Prometheus, c.config.CrdGroup, c.config.Labels.LabelsMap, c.config.EnableValidation),
		k8sutil.NewCustomResourceDefinition(c.config.CrdKinds.ServiceMonitor, c.config.CrdGroup, c.config.Labels.LabelsMap, c.config.EnableValidation),
		k8sutil.NewCustomResourceDefinition(c.config.CrdKinds.PrometheusRule, c.config.CrdGroup, c.config.Labels.LabelsMap, c.config.EnableValidation),
	}

	crdClient := c.crdclient.ApiextensionsV1beta1().CustomResourceDefinitions()

	for _, crd := range crds {
		oldCRD, err := crdClient.Get(crd.Name, metav1.GetOptions{})
		if err != nil && !apierrors.IsNotFound(err) {
			return errors.Wrapf(err, "getting CRD: %s", crd.Spec.Names.Kind)
		}
		if apierrors.IsNotFound(err) {
			if _, err := crdClient.Create(crd); err != nil {
				return errors.Wrapf(err, "creating CRD: %s", crd.Spec.Names.Kind)
			}
			level.Info(c.logger).Log("msg", "CRD created", "crd", crd.Spec.Names.Kind)
		}
		if err == nil {
			crd.ResourceVersion = oldCRD.ResourceVersion
			if _, err := crdClient.Update(crd); err != nil {
				return errors.Wrapf(err, "updating CRD: %s", crd.Spec.Names.Kind)
			}
			level.Info(c.logger).Log("msg", "CRD updated", "crd", crd.Spec.Names.Kind)
		}
	}

	crdListFuncs := []struct {
		name     string
		listFunc func(opts metav1.ListOptions) (runtime.Object, error)
	}{
		{"Prometheus", c.mclient.MonitoringV1().Prometheuses(c.config.Namespace).List},
		{"ServiceMonitor", c.mclient.MonitoringV1().ServiceMonitors(c.config.Namespace).List},
		{"PrometheusRule", c.mclient.MonitoringV1().PrometheusRules(c.config.Namespace).List},
	}

	for _, crdListFunc := range crdListFuncs {
		err := k8sutil.WaitForCRDReady(crdListFunc.listFunc)
		if err != nil {
			return errors.Wrapf(err, "waiting for %v crd failed", crdListFunc.name)
		}
	}

	return nil
}