mirror of
https://github.com/arangodb/kube-arangodb.git
synced 2024-12-14 11:57:37 +00:00
[Feature] [Metrics] Member restarts (#1047)
This commit is contained in:
parent
cde52bb084
commit
64201dfe4f
14 changed files with 517 additions and 160 deletions
|
@ -28,6 +28,7 @@
|
|||
- (Bugfix) Extend Agency HealthCheck for replace
|
||||
- (Bugfix) Allow to remove resources (CPU & Memory) on the managed pods
|
||||
- (Bugfix) Add DistributeShardsLike support
|
||||
- (Feature) Member restarts metric
|
||||
|
||||
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
|
||||
- (Bugfix) Fix arangosync members state inspection
|
||||
|
|
|
@ -2,20 +2,21 @@
|
|||
|
||||
## List
|
||||
|
||||
| Name | Namespace | Group | Type | Description |
|
||||
|:---------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:---------------------------------------------------|
|
||||
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
|
||||
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
|
||||
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
|
||||
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
|
||||
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
|
||||
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
|
||||
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
|
||||
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
|
||||
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
|
||||
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
|
||||
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
|
||||
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
|
||||
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
|
||||
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
|
||||
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |
|
||||
| Name | Namespace | Group | Type | Description |
|
||||
|:---------------------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:--------------------------------------------------------------------------------------|
|
||||
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
|
||||
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
|
||||
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
|
||||
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
|
||||
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
|
||||
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
|
||||
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
|
||||
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
|
||||
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
|
||||
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
|
||||
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
|
||||
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
|
||||
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
|
||||
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
|
||||
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
|
||||
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# arangodb_operator_members_unexpected_container_exit_codes (Counter)
|
||||
|
||||
## Description
|
||||
|
||||
Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)
|
||||
|
||||
## Labels
|
||||
|
||||
| Label | Description |
|
||||
|:--------------:|:-------------------------------------------|
|
||||
| namespace | Deployment Namespace |
|
||||
| name | Deployment Name |
|
||||
| member | Member ID |
|
||||
| container | Container Name |
|
||||
| container_type | Container/InitContainer/EphemeralContainer |
|
||||
| code | ExitCode |
|
|
@ -331,7 +331,17 @@ func generateMetricsGO(root string, in MetricsDoc) error {
|
|||
keys = append(keys, "value")
|
||||
|
||||
for _, label := range details.Labels {
|
||||
k := strings.ToLower(label.Key)
|
||||
v := strings.Split(strings.ToLower(label.Key), "_")
|
||||
for id := range v {
|
||||
if id == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
v[id] = strings.Title(v[id])
|
||||
}
|
||||
|
||||
k := strings.Join(v, "")
|
||||
|
||||
keys = append(keys, k)
|
||||
|
||||
if t := label.Type; t != nil {
|
||||
|
|
|
@ -148,4 +148,22 @@ namespaces:
|
|||
- key: namespace
|
||||
description: "Deployment Namespace"
|
||||
- key: name
|
||||
description: "Deployment Name"
|
||||
description: "Deployment Name"
|
||||
members:
|
||||
unexpected_container_exit_codes:
|
||||
shortDescription: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
|
||||
description: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
|
||||
type: "Counter"
|
||||
labels:
|
||||
- key: namespace
|
||||
description: "Deployment Namespace"
|
||||
- key: name
|
||||
description: "Deployment Name"
|
||||
- key: member
|
||||
description: "Member ID"
|
||||
- key: container
|
||||
description: "Container Name"
|
||||
- key: container_type
|
||||
description: "Container/InitContainer/EphemeralContainer"
|
||||
- key: code
|
||||
description: "ExitCode"
|
|
@ -137,13 +137,7 @@ type Deployment struct {
|
|||
|
||||
memberState memberState.StateInspector
|
||||
|
||||
metrics struct {
|
||||
agency struct {
|
||||
errors uint64
|
||||
fetches uint64
|
||||
index uint64
|
||||
}
|
||||
}
|
||||
metrics Metrics
|
||||
}
|
||||
|
||||
func (d *Deployment) WithArangoMember(cache inspectorInterface.Inspector, timeout time.Duration, name string) reconciler.ArangoMemberModContext {
|
||||
|
|
|
@ -254,12 +254,12 @@ func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterva
|
|||
nextInterval = interval
|
||||
}
|
||||
|
||||
d.metrics.agency.fetches++
|
||||
d.metrics.Agency.Fetches++
|
||||
if offset, err := d.RefreshAgencyCache(ctx); err != nil {
|
||||
d.metrics.agency.errors++
|
||||
d.metrics.Agency.Errors++
|
||||
d.log.Err(err).Error("Unable to refresh agency")
|
||||
} else {
|
||||
d.metrics.agency.index = offset
|
||||
d.metrics.Agency.Index = offset
|
||||
}
|
||||
|
||||
// Refresh maintenance lock
|
||||
|
|
|
@ -21,145 +21,22 @@
|
|||
package deployment
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/throttle"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
const (
|
||||
// Component name for metrics of this package
|
||||
metricsComponent = "deployment"
|
||||
)
|
||||
|
||||
func init() {
|
||||
localInventory = inventory{
|
||||
deployments: map[string]map[string]*Deployment{},
|
||||
deploymentsMetric: metrics.NewDescription("arangodb_operator_deployments", "Number of active deployments", []string{"namespace", "deployment"}, nil),
|
||||
deploymentMetricsMembersMetric: metrics.NewDescription("arango_operator_deployment_members", "List of members", []string{"namespace", "deployment", "role", "id"}, nil),
|
||||
deploymentAgencyStateMetric: metrics.NewDescription("arango_operator_deployment_agency_state", "Reachability of agency", []string{"namespace", "deployment"}, nil),
|
||||
deploymentShardLeadersMetric: metrics.NewDescription("arango_operator_deployment_shard_leaders", "Deployment leader shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
|
||||
deploymentShardsMetric: metrics.NewDescription("arango_operator_deployment_shards", "Deployment shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
|
||||
|
||||
operatorStateRefreshMetric: metrics.NewDescription("arango_operator_deployment_state_refresh_count", "Number of refreshes in deployment", []string{"namespace", "deployment", "type"}, nil),
|
||||
type Metrics struct {
|
||||
Agency struct {
|
||||
Errors uint64
|
||||
Fetches uint64
|
||||
Index uint64
|
||||
}
|
||||
|
||||
prometheus.MustRegister(&localInventory)
|
||||
}
|
||||
|
||||
var localInventory inventory
|
||||
|
||||
var _ prometheus.Collector = &inventory{}
|
||||
|
||||
type inventory struct {
|
||||
lock sync.Mutex
|
||||
deployments map[string]map[string]*Deployment
|
||||
|
||||
deploymentsMetric, deploymentMetricsMembersMetric, deploymentAgencyStateMetric, deploymentShardsMetric, deploymentShardLeadersMetric metrics.Description
|
||||
|
||||
operatorStateRefreshMetric metrics.Description
|
||||
}
|
||||
|
||||
func (i *inventory) Describe(descs chan<- *prometheus.Desc) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
pd := metrics.NewPushDescription(descs)
|
||||
pd.Push(i.deploymentsMetric, i.deploymentMetricsMembersMetric, i.deploymentAgencyStateMetric, i.deploymentShardLeadersMetric, i.deploymentShardsMetric, i.operatorStateRefreshMetric)
|
||||
|
||||
metric_descriptions.Descriptions(pd)
|
||||
}
|
||||
|
||||
func (i *inventory) Collect(m chan<- prometheus.Metric) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
p := metrics.NewPushMetric(m)
|
||||
for _, deployments := range i.deployments {
|
||||
for _, deployment := range deployments {
|
||||
p.Push(i.deploymentsMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
|
||||
|
||||
deployment.CollectMetrics(p)
|
||||
|
||||
if state := deployment.acs.CurrentClusterCache(); state != nil {
|
||||
t := state.GetThrottles()
|
||||
|
||||
for _, c := range throttle.AllComponents() {
|
||||
p.Push(i.operatorStateRefreshMetric.Gauge(float64(t.Get(c).Count()), deployment.GetNamespace(), deployment.GetName(), string(c)))
|
||||
}
|
||||
}
|
||||
|
||||
spec := deployment.GetSpec()
|
||||
status, _ := deployment.GetStatus()
|
||||
|
||||
for _, member := range status.Members.AsList() {
|
||||
p.Push(i.deploymentMetricsMembersMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName(), member.Group.AsRole(), member.Member.ID))
|
||||
}
|
||||
|
||||
if spec.Mode.Get().HasAgents() {
|
||||
agency, agencyOk := deployment.GetAgencyCache()
|
||||
if !agencyOk {
|
||||
p.Push(i.deploymentAgencyStateMetric.Gauge(0, deployment.GetNamespace(), deployment.GetName()))
|
||||
continue
|
||||
}
|
||||
|
||||
p.Push(i.deploymentAgencyStateMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
|
||||
|
||||
if spec.Mode.Get() == api.DeploymentModeCluster {
|
||||
for db, collections := range agency.Current.Collections {
|
||||
for collection, shards := range collections {
|
||||
for shard, details := range shards {
|
||||
for id, server := range details.Servers {
|
||||
name := "UNKNOWN"
|
||||
if _, ok := agency.Plan.Collections[db]; ok {
|
||||
if _, ok := agency.Plan.Collections[db][collection]; ok {
|
||||
name = agency.Plan.Collections[db][collection].GetName(name)
|
||||
}
|
||||
}
|
||||
|
||||
m := []string{
|
||||
deployment.GetNamespace(),
|
||||
deployment.GetName(),
|
||||
db,
|
||||
name,
|
||||
shard,
|
||||
string(server),
|
||||
}
|
||||
|
||||
if id == 0 {
|
||||
p.Push(i.deploymentShardLeadersMetric.Gauge(1, m...))
|
||||
}
|
||||
p.Push(i.deploymentShardsMetric.Gauge(1, m...))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (i *inventory) Add(d *Deployment) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
name, namespace := d.GetName(), d.GetNamespace()
|
||||
|
||||
if _, ok := i.deployments[namespace]; !ok {
|
||||
i.deployments[namespace] = map[string]*Deployment{}
|
||||
}
|
||||
|
||||
i.deployments[namespace][name] = d
|
||||
}
|
||||
|
||||
func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.agency.errors), d.namespace, d.name))
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.agency.fetches), d.namespace, d.name))
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.agency.index), d.namespace, d.name))
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.Agency.Errors), d.namespace, d.name))
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.Agency.Fetches), d.namespace, d.name))
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.Agency.Index), d.namespace, d.name))
|
||||
|
||||
if c := d.agencyCache; c != nil {
|
||||
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(1, d.namespace, d.name))
|
||||
|
@ -174,7 +51,13 @@ func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
|
|||
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(0, d.namespace, d.name))
|
||||
}
|
||||
|
||||
// Reconcile
|
||||
if c := d.reconciler; c != nil {
|
||||
c.CollectMetrics(m)
|
||||
}
|
||||
|
||||
// Resources
|
||||
if r := d.resources; r != nil {
|
||||
r.CollectMetrics(m)
|
||||
}
|
||||
}
|
||||
|
|
157
pkg/deployment/old_metrics.go
Normal file
157
pkg/deployment/old_metrics.go
Normal file
|
@ -0,0 +1,157 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package deployment
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
|
||||
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/throttle"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// metricsComponent is the component name used for metrics of this package.
const metricsComponent = "deployment"
|
||||
|
||||
func init() {
|
||||
localInventory = inventory{
|
||||
deployments: map[string]map[string]*Deployment{},
|
||||
deploymentsMetric: metrics.NewDescription("arangodb_operator_deployments", "Number of active deployments", []string{"namespace", "deployment"}, nil),
|
||||
deploymentMetricsMembersMetric: metrics.NewDescription("arango_operator_deployment_members", "List of members", []string{"namespace", "deployment", "role", "id"}, nil),
|
||||
deploymentAgencyStateMetric: metrics.NewDescription("arango_operator_deployment_agency_state", "Reachability of agency", []string{"namespace", "deployment"}, nil),
|
||||
deploymentShardLeadersMetric: metrics.NewDescription("arango_operator_deployment_shard_leaders", "Deployment leader shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
|
||||
deploymentShardsMetric: metrics.NewDescription("arango_operator_deployment_shards", "Deployment shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
|
||||
|
||||
operatorStateRefreshMetric: metrics.NewDescription("arango_operator_deployment_state_refresh_count", "Number of refreshes in deployment", []string{"namespace", "deployment", "type"}, nil),
|
||||
}
|
||||
|
||||
prometheus.MustRegister(&localInventory)
|
||||
}
|
||||
|
||||
var localInventory inventory
|
||||
|
||||
var _ prometheus.Collector = &inventory{}
|
||||
|
||||
type inventory struct {
|
||||
lock sync.Mutex
|
||||
deployments map[string]map[string]*Deployment
|
||||
|
||||
deploymentsMetric, deploymentMetricsMembersMetric, deploymentAgencyStateMetric, deploymentShardsMetric, deploymentShardLeadersMetric metrics.Description
|
||||
|
||||
operatorStateRefreshMetric metrics.Description
|
||||
}
|
||||
|
||||
func (i *inventory) Describe(descs chan<- *prometheus.Desc) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
pd := metrics.NewPushDescription(descs)
|
||||
pd.Push(i.deploymentsMetric, i.deploymentMetricsMembersMetric, i.deploymentAgencyStateMetric, i.deploymentShardLeadersMetric, i.deploymentShardsMetric, i.operatorStateRefreshMetric)
|
||||
|
||||
metric_descriptions.Descriptions(pd)
|
||||
}
|
||||
|
||||
func (i *inventory) Collect(m chan<- prometheus.Metric) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
p := metrics.NewPushMetric(m)
|
||||
for _, deployments := range i.deployments {
|
||||
for _, deployment := range deployments {
|
||||
p.Push(i.deploymentsMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
|
||||
|
||||
deployment.CollectMetrics(p)
|
||||
|
||||
if state := deployment.acs.CurrentClusterCache(); state != nil {
|
||||
t := state.GetThrottles()
|
||||
|
||||
for _, c := range throttle.AllComponents() {
|
||||
p.Push(i.operatorStateRefreshMetric.Gauge(float64(t.Get(c).Count()), deployment.GetNamespace(), deployment.GetName(), string(c)))
|
||||
}
|
||||
}
|
||||
|
||||
spec := deployment.GetSpec()
|
||||
status, _ := deployment.GetStatus()
|
||||
|
||||
for _, member := range status.Members.AsList() {
|
||||
p.Push(i.deploymentMetricsMembersMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName(), member.Group.AsRole(), member.Member.ID))
|
||||
}
|
||||
|
||||
if spec.Mode.Get().HasAgents() {
|
||||
agency, agencyOk := deployment.GetAgencyCache()
|
||||
if !agencyOk {
|
||||
p.Push(i.deploymentAgencyStateMetric.Gauge(0, deployment.GetNamespace(), deployment.GetName()))
|
||||
continue
|
||||
}
|
||||
|
||||
p.Push(i.deploymentAgencyStateMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
|
||||
|
||||
if spec.Mode.Get() == api.DeploymentModeCluster {
|
||||
for db, collections := range agency.Current.Collections {
|
||||
for collection, shards := range collections {
|
||||
for shard, details := range shards {
|
||||
for id, server := range details.Servers {
|
||||
name := "UNKNOWN"
|
||||
if _, ok := agency.Plan.Collections[db]; ok {
|
||||
if _, ok := agency.Plan.Collections[db][collection]; ok {
|
||||
name = agency.Plan.Collections[db][collection].GetName(name)
|
||||
}
|
||||
}
|
||||
|
||||
m := []string{
|
||||
deployment.GetNamespace(),
|
||||
deployment.GetName(),
|
||||
db,
|
||||
name,
|
||||
shard,
|
||||
string(server),
|
||||
}
|
||||
|
||||
if id == 0 {
|
||||
p.Push(i.deploymentShardLeadersMetric.Gauge(1, m...))
|
||||
}
|
||||
p.Push(i.deploymentShardsMetric.Gauge(1, m...))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (i *inventory) Add(d *Deployment) {
|
||||
i.lock.Lock()
|
||||
defer i.lock.Unlock()
|
||||
|
||||
name, namespace := d.GetName(), d.GetNamespace()
|
||||
|
||||
if _, ok := i.deployments[namespace]; !ok {
|
||||
i.deployments[namespace] = map[string]*Deployment{}
|
||||
}
|
||||
|
||||
i.deployments[namespace][name] = d
|
||||
}
|
|
@ -20,7 +20,146 @@
|
|||
|
||||
package resources
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"fmt"
|
||||
|
||||
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
|
||||
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
|
||||
)
|
||||
|
||||
// metricsComponent is the component name used for metrics of this package.
const metricsComponent = "deployment_resources"
|
||||
|
||||
// Metrics accumulates container restart statistics per deployment member.
//
// The zero value is ready to use. The IncMember* methods serialize on lock and
// are no-ops when called on a nil *Metrics.
type Metrics struct {
	// lock guards Members and every map reachable from it.
	lock sync.Mutex

	// Members holds the restart counters, keyed by member ID.
	Members map[string]MetricMember
}

// IncMemberContainerRestarts records one unexpected exit with the given exit
// code for the regular container named container of member id.
func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) {
	m.incMemberRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.ContainerRestarts
	})
}

// IncMemberInitContainerRestarts records one unexpected exit with the given
// exit code for the init container named container of member id.
func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int32) {
	m.incMemberRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.InitContainerRestarts
	})
}

// IncMemberEphemeralContainerRestarts records one unexpected exit with the
// given exit code for the ephemeral container named container of member id.
func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code int32) {
	m.incMemberRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.EphemeralContainerRestarts
	})
}

// incMemberRestarts is the shared implementation of the IncMember* methods.
// pick selects which restart map of the member is updated; every missing map
// on the path (members, containers, exit codes) is created on demand.
func (m *Metrics) incMemberRestarts(id, container string, code int32, pick func(*MetricMember) *map[string]MetricMemberRestarts) {
	if m == nil {
		return
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	if m.Members == nil {
		m.Members = map[string]MetricMember{}
	}

	// MetricMember is stored by value, so work on a copy and write it back.
	member := m.Members[id]

	restarts := pick(&member)
	if *restarts == nil {
		*restarts = map[string]MetricMemberRestarts{}
	}

	codes := (*restarts)[container]
	if codes == nil {
		codes = MetricMemberRestarts{}
	}

	codes[code]++

	(*restarts)[container] = codes

	m.Members[id] = member
}

// MetricMember groups the restart counters of a single member by container
// kind. Each map is keyed by container name.
type MetricMember struct {
	ContainerRestarts          map[string]MetricMemberRestarts
	InitContainerRestarts      map[string]MetricMemberRestarts
	EphemeralContainerRestarts map[string]MetricMemberRestarts
}

// MetricMemberRestarts counts restarts per container exit code.
type MetricMemberRestarts map[int32]uint64
|
||||
|
||||
func (d *Resources) CollectMetrics(m metrics.PushMetric) {
|
||||
for member, info := range d.metrics.Members {
|
||||
// Containers
|
||||
for container, restarts := range info.ContainerRestarts {
|
||||
for code, count := range restarts {
|
||||
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code)))
|
||||
}
|
||||
}
|
||||
// InitContainers
|
||||
for container, restarts := range info.InitContainerRestarts {
|
||||
for code, count := range restarts {
|
||||
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code)))
|
||||
}
|
||||
}
|
||||
// EphemeralContainers
|
||||
for container, restarts := range info.EphemeralContainerRestarts {
|
||||
for code, count := range restarts {
|
||||
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
93
pkg/deployment/resources/metrics_test.go
Normal file
93
pkg/deployment/resources/metrics_test.go
Normal file
|
@ -0,0 +1,93 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package resources
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func Test_MetricsInc_Container(t *testing.T) {
|
||||
var m Metrics
|
||||
|
||||
m.IncMemberContainerRestarts("ID", "server", 137)
|
||||
|
||||
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberContainerRestarts("ID", "server", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberContainerRestarts("ID", "server2", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137])
|
||||
|
||||
m.IncMemberContainerRestarts("ID2", "server", 137)
|
||||
m.IncMemberContainerRestarts("ID", "server", 138)
|
||||
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID2"].ContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][138])
|
||||
}
|
||||
|
||||
func Test_MetricsInc_InitContainer(t *testing.T) {
|
||||
var m Metrics
|
||||
|
||||
m.IncMemberInitContainerRestarts("ID", "server", 137)
|
||||
|
||||
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberInitContainerRestarts("ID", "server", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberInitContainerRestarts("ID", "server2", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137])
|
||||
|
||||
m.IncMemberInitContainerRestarts("ID2", "server", 137)
|
||||
m.IncMemberInitContainerRestarts("ID", "server", 138)
|
||||
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID2"].InitContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][138])
|
||||
}
|
||||
|
||||
func Test_MetricsInc_EphemeralContainer(t *testing.T) {
|
||||
var m Metrics
|
||||
|
||||
m.IncMemberEphemeralContainerRestarts("ID", "server", 137)
|
||||
|
||||
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberEphemeralContainerRestarts("ID", "server", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137])
|
||||
|
||||
m.IncMemberEphemeralContainerRestarts("ID", "server2", 137)
|
||||
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137])
|
||||
|
||||
m.IncMemberEphemeralContainerRestarts("ID2", "server", 137)
|
||||
m.IncMemberEphemeralContainerRestarts("ID", "server", 138)
|
||||
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID2"].EphemeralContainerRestarts["server"][137])
|
||||
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][138])
|
||||
}
|
|
@ -178,6 +178,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
|
|||
Time("started", t.StartedAt.Time).
|
||||
Time("finished", t.FinishedAt.Time).
|
||||
Warn("Pod failed in unexpected way: Init Container failed")
|
||||
|
||||
r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.ExitCode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -198,6 +200,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
|
|||
Time("started", t.StartedAt.Time).
|
||||
Time("finished", t.FinishedAt.Time).
|
||||
Warn("Pod failed in unexpected way: Core Container failed")
|
||||
|
||||
r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.ExitCode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,8 @@ type Resources struct {
|
|||
log logging.Logger
|
||||
namespace, name string
|
||||
context Context
|
||||
|
||||
metrics Metrics
|
||||
}
|
||||
|
||||
// NewResources creates a new Resources service, used to
|
||||
|
|
39
pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go
generated
Normal file
39
pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go
generated
Normal file
|
@ -0,0 +1,39 @@
|
|||
//
|
||||
// DISCLAIMER
|
||||
//
|
||||
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
//
|
||||
|
||||
package metric_descriptions
|
||||
|
||||
import "github.com/arangodb/kube-arangodb/pkg/util/metrics"
|
||||
|
||||
var (
|
||||
arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`}, nil)
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerDescription(arangodbOperatorMembersUnexpectedContainerExitCodes)
|
||||
}
|
||||
|
||||
func ArangodbOperatorMembersUnexpectedContainerExitCodes() metrics.Description {
|
||||
return arangodbOperatorMembersUnexpectedContainerExitCodes
|
||||
}
|
||||
|
||||
func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string) metrics.Metric {
|
||||
return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code)
|
||||
}
|
Loading…
Reference in a new issue