1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] [Metrics] Member restarts (#1047)

This commit is contained in:
Adam Janikowski 2022-07-07 14:09:17 +02:00 committed by GitHub
parent cde52bb084
commit 64201dfe4f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 517 additions and 160 deletions

View file

@ -28,6 +28,7 @@
- (Bugfix) Extend Agency HealthCheck for replace
- (Bugfix) Allow to remove resources (CPU & Memory) on the managed pods
- (Bugfix) Add DistributeShardsLike support
- (Feature) Member restarts metric
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
- (Bugfix) Fix arangosync members state inspection

View file

@ -2,20 +2,21 @@
## List
| Name | Namespace | Group | Type | Description |
|:---------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:---------------------------------------------------|
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |
| Name | Namespace | Group | Type | Description |
|:---------------------------------------------------------------------------------------------------------------------------:|:-----------------:|:------------:|:-------:|:--------------------------------------------------------------------------------------|
| [arangodb_operator_agency_errors](./arangodb_operator_agency_errors.md) | arangodb_operator | agency | Counter | Current count of agency cache fetch errors |
| [arangodb_operator_agency_fetches](./arangodb_operator_agency_fetches.md) | arangodb_operator | agency | Counter | Current count of agency cache fetches |
| [arangodb_operator_agency_index](./arangodb_operator_agency_index.md) | arangodb_operator | agency | Gauge | Current index of the agency cache |
| [arangodb_operator_agency_cache_health_present](./arangodb_operator_agency_cache_health_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache health is present |
| [arangodb_operator_agency_cache_healthy](./arangodb_operator_agency_cache_healthy.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is healthy |
| [arangodb_operator_agency_cache_leaders](./arangodb_operator_agency_cache_leaders.md) | arangodb_operator | agency_cache | Gauge | Determines agency leader vote count |
| [arangodb_operator_agency_cache_member_commit_offset](./arangodb_operator_agency_cache_member_commit_offset.md) | arangodb_operator | agency_cache | Gauge | Determines agency member commit offset |
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
| [arangodb_operator_rebalancer_moves_failed](./arangodb_operator_rebalancer_moves_failed.md) | arangodb_operator | rebalancer | Counter | Define how many moves failed |
| [arangodb_operator_rebalancer_moves_generated](./arangodb_operator_rebalancer_moves_generated.md) | arangodb_operator | rebalancer | Counter | Define how many moves were generated |
| [arangodb_operator_rebalancer_moves_succeeded](./arangodb_operator_rebalancer_moves_succeeded.md) | arangodb_operator | rebalancer | Counter | Define how many moves succeeded |

View file

@ -0,0 +1,16 @@
# arangodb_operator_members_unexpected_container_exit_codes (Counter)
## Description
Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)
## Labels
| Label | Description |
|:--------------:|:-------------------------------------------|
| namespace | Deployment Namespace |
| name | Deployment Name |
| member | Member ID |
| container | Container Name |
| container_type | Container/InitContainer/EphemeralContainer |
| code | ExitCode |

View file

@ -331,7 +331,17 @@ func generateMetricsGO(root string, in MetricsDoc) error {
keys = append(keys, "value")
for _, label := range details.Labels {
k := strings.ToLower(label.Key)
v := strings.Split(strings.ToLower(label.Key), "_")
for id := range v {
if id == 0 {
continue
}
v[id] = strings.Title(v[id])
}
k := strings.Join(v, "")
keys = append(keys, k)
if t := label.Type; t != nil {

View file

@ -148,4 +148,22 @@ namespaces:
- key: namespace
description: "Deployment Namespace"
- key: name
description: "Deployment Name"
description: "Deployment Name"
members:
unexpected_container_exit_codes:
shortDescription: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
description: "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)"
type: "Counter"
labels:
- key: namespace
description: "Deployment Namespace"
- key: name
description: "Deployment Name"
- key: member
description: "Member ID"
- key: container
description: "Container Name"
- key: container_type
description: "Container/InitContainer/EphemeralContainer"
- key: code
description: "ExitCode"

View file

@ -137,13 +137,7 @@ type Deployment struct {
memberState memberState.StateInspector
metrics struct {
agency struct {
errors uint64
fetches uint64
index uint64
}
}
metrics Metrics
}
func (d *Deployment) WithArangoMember(cache inspectorInterface.Inspector, timeout time.Duration, name string) reconciler.ArangoMemberModContext {

View file

@ -254,12 +254,12 @@ func (d *Deployment) inspectDeploymentWithError(ctx context.Context, lastInterva
nextInterval = interval
}
d.metrics.agency.fetches++
d.metrics.Agency.Fetches++
if offset, err := d.RefreshAgencyCache(ctx); err != nil {
d.metrics.agency.errors++
d.metrics.Agency.Errors++
d.log.Err(err).Error("Unable to refresh agency")
} else {
d.metrics.agency.index = offset
d.metrics.Agency.Index = offset
}
// Refresh maintenance lock

View file

@ -21,145 +21,22 @@
package deployment
import (
"sync"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/throttle"
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
"github.com/prometheus/client_golang/prometheus"
)
const (
// Component name for metrics of this package
metricsComponent = "deployment"
)
func init() {
localInventory = inventory{
deployments: map[string]map[string]*Deployment{},
deploymentsMetric: metrics.NewDescription("arangodb_operator_deployments", "Number of active deployments", []string{"namespace", "deployment"}, nil),
deploymentMetricsMembersMetric: metrics.NewDescription("arango_operator_deployment_members", "List of members", []string{"namespace", "deployment", "role", "id"}, nil),
deploymentAgencyStateMetric: metrics.NewDescription("arango_operator_deployment_agency_state", "Reachability of agency", []string{"namespace", "deployment"}, nil),
deploymentShardLeadersMetric: metrics.NewDescription("arango_operator_deployment_shard_leaders", "Deployment leader shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
deploymentShardsMetric: metrics.NewDescription("arango_operator_deployment_shards", "Deployment shards distribution", []string{"namespace", "deployment", "database", "collection", "shard", "server"}, nil),
operatorStateRefreshMetric: metrics.NewDescription("arango_operator_deployment_state_refresh_count", "Number of refreshes in deployment", []string{"namespace", "deployment", "type"}, nil),
type Metrics struct {
Agency struct {
Errors uint64
Fetches uint64
Index uint64
}
prometheus.MustRegister(&localInventory)
}
var localInventory inventory
var _ prometheus.Collector = &inventory{}
type inventory struct {
lock sync.Mutex
deployments map[string]map[string]*Deployment
deploymentsMetric, deploymentMetricsMembersMetric, deploymentAgencyStateMetric, deploymentShardsMetric, deploymentShardLeadersMetric metrics.Description
operatorStateRefreshMetric metrics.Description
}
func (i *inventory) Describe(descs chan<- *prometheus.Desc) {
i.lock.Lock()
defer i.lock.Unlock()
pd := metrics.NewPushDescription(descs)
pd.Push(i.deploymentsMetric, i.deploymentMetricsMembersMetric, i.deploymentAgencyStateMetric, i.deploymentShardLeadersMetric, i.deploymentShardsMetric, i.operatorStateRefreshMetric)
metric_descriptions.Descriptions(pd)
}
func (i *inventory) Collect(m chan<- prometheus.Metric) {
i.lock.Lock()
defer i.lock.Unlock()
p := metrics.NewPushMetric(m)
for _, deployments := range i.deployments {
for _, deployment := range deployments {
p.Push(i.deploymentsMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
deployment.CollectMetrics(p)
if state := deployment.acs.CurrentClusterCache(); state != nil {
t := state.GetThrottles()
for _, c := range throttle.AllComponents() {
p.Push(i.operatorStateRefreshMetric.Gauge(float64(t.Get(c).Count()), deployment.GetNamespace(), deployment.GetName(), string(c)))
}
}
spec := deployment.GetSpec()
status, _ := deployment.GetStatus()
for _, member := range status.Members.AsList() {
p.Push(i.deploymentMetricsMembersMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName(), member.Group.AsRole(), member.Member.ID))
}
if spec.Mode.Get().HasAgents() {
agency, agencyOk := deployment.GetAgencyCache()
if !agencyOk {
p.Push(i.deploymentAgencyStateMetric.Gauge(0, deployment.GetNamespace(), deployment.GetName()))
continue
}
p.Push(i.deploymentAgencyStateMetric.Gauge(1, deployment.GetNamespace(), deployment.GetName()))
if spec.Mode.Get() == api.DeploymentModeCluster {
for db, collections := range agency.Current.Collections {
for collection, shards := range collections {
for shard, details := range shards {
for id, server := range details.Servers {
name := "UNKNOWN"
if _, ok := agency.Plan.Collections[db]; ok {
if _, ok := agency.Plan.Collections[db][collection]; ok {
name = agency.Plan.Collections[db][collection].GetName(name)
}
}
m := []string{
deployment.GetNamespace(),
deployment.GetName(),
db,
name,
shard,
string(server),
}
if id == 0 {
p.Push(i.deploymentShardLeadersMetric.Gauge(1, m...))
}
p.Push(i.deploymentShardsMetric.Gauge(1, m...))
}
}
}
}
}
}
}
}
}
func (i *inventory) Add(d *Deployment) {
i.lock.Lock()
defer i.lock.Unlock()
name, namespace := d.GetName(), d.GetNamespace()
if _, ok := i.deployments[namespace]; !ok {
i.deployments[namespace] = map[string]*Deployment{}
}
i.deployments[namespace][name] = d
}
func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.agency.errors), d.namespace, d.name))
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.agency.fetches), d.namespace, d.name))
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.agency.index), d.namespace, d.name))
m.Push(metric_descriptions.ArangodbOperatorAgencyErrorsCounter(float64(d.metrics.Agency.Errors), d.namespace, d.name))
m.Push(metric_descriptions.ArangodbOperatorAgencyFetchesCounter(float64(d.metrics.Agency.Fetches), d.namespace, d.name))
m.Push(metric_descriptions.ArangodbOperatorAgencyIndexGauge(float64(d.metrics.Agency.Index), d.namespace, d.name))
if c := d.agencyCache; c != nil {
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(1, d.namespace, d.name))
@ -174,7 +51,13 @@ func (d *Deployment) CollectMetrics(m metrics.PushMetric) {
m.Push(metric_descriptions.ArangodbOperatorAgencyCachePresentGauge(0, d.namespace, d.name))
}
// Reconcile
if c := d.reconciler; c != nil {
c.CollectMetrics(m)
}
// Resources
if r := d.resources; r != nil {
r.CollectMetrics(m)
}
}

View file

@ -0,0 +1,157 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package deployment
import (
"sync"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/throttle"
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
"github.com/prometheus/client_golang/prometheus"
)
const (
// Component name for metrics of this package.
// NOTE(review): no reference to this constant is visible in this file;
// confirm it is still used elsewhere in the package before relying on it.
metricsComponent = "deployment"
)
// init prepares the package-wide deployment inventory: it allocates the
// deployment registry, creates the description of every metric the inventory
// emits on its own and finally registers the inventory as a Prometheus
// collector.
func init() {
	localInventory.deployments = map[string]map[string]*Deployment{}

	localInventory.deploymentsMetric = metrics.NewDescription(
		"arangodb_operator_deployments",
		"Number of active deployments",
		[]string{"namespace", "deployment"},
		nil,
	)
	localInventory.deploymentMetricsMembersMetric = metrics.NewDescription(
		"arango_operator_deployment_members",
		"List of members",
		[]string{"namespace", "deployment", "role", "id"},
		nil,
	)
	localInventory.deploymentAgencyStateMetric = metrics.NewDescription(
		"arango_operator_deployment_agency_state",
		"Reachability of agency",
		[]string{"namespace", "deployment"},
		nil,
	)
	localInventory.deploymentShardLeadersMetric = metrics.NewDescription(
		"arango_operator_deployment_shard_leaders",
		"Deployment leader shards distribution",
		[]string{"namespace", "deployment", "database", "collection", "shard", "server"},
		nil,
	)
	localInventory.deploymentShardsMetric = metrics.NewDescription(
		"arango_operator_deployment_shards",
		"Deployment shards distribution",
		[]string{"namespace", "deployment", "database", "collection", "shard", "server"},
		nil,
	)
	localInventory.operatorStateRefreshMetric = metrics.NewDescription(
		"arango_operator_deployment_state_refresh_count",
		"Number of refreshes in deployment",
		[]string{"namespace", "deployment", "type"},
		nil,
	)

	prometheus.MustRegister(&localInventory)
}
// localInventory is the single inventory instance of this package; init fills
// it in and registers it with Prometheus.
var localInventory inventory

// Compile-time check that *inventory implements prometheus.Collector.
var _ prometheus.Collector = (*inventory)(nil)

// inventory is a Prometheus collector that keeps track of the deployments
// added through Add and emits metrics for each of them on Collect.
type inventory struct {
	// lock serializes Describe, Collect and Add.
	lock sync.Mutex

	// deployments holds the tracked deployments, keyed by namespace and then
	// by deployment name.
	deployments map[string]map[string]*Deployment

	// Descriptions of the metrics emitted directly by the inventory.
	deploymentsMetric              metrics.Description
	deploymentMetricsMembersMetric metrics.Description
	deploymentAgencyStateMetric    metrics.Description
	deploymentShardsMetric         metrics.Description
	deploymentShardLeadersMetric   metrics.Description
	operatorStateRefreshMetric     metrics.Description
}
// Describe implements prometheus.Collector. It announces the descriptions
// owned by the inventory followed by everything registered in the
// metric_descriptions package.
func (i *inventory) Describe(descs chan<- *prometheus.Desc) {
	i.lock.Lock()
	defer i.lock.Unlock()

	out := metrics.NewPushDescription(descs)

	out.Push(
		i.deploymentsMetric,
		i.deploymentMetricsMembersMetric,
		i.deploymentAgencyStateMetric,
		i.deploymentShardLeadersMetric,
		i.deploymentShardsMetric,
		i.operatorStateRefreshMetric,
	)

	metric_descriptions.Descriptions(out)
}
// Collect implements prometheus.Collector. For every tracked deployment it
// emits, in this order:
//   - the "deployment exists" gauge,
//   - the metrics the deployment reports through CollectMetrics,
//   - one refresh counter per throttle component (when a cluster cache exists),
//   - one gauge per member,
//   - for modes with agents: the agency reachability gauge and, in cluster
//     mode with a reachable agency, the shard distribution gauges.
func (i *inventory) Collect(m chan<- prometheus.Metric) {
	i.lock.Lock()
	defer i.lock.Unlock()

	out := metrics.NewPushMetric(m)

	for _, byName := range i.deployments {
		for _, depl := range byName {
			out.Push(i.deploymentsMetric.Gauge(1, depl.GetNamespace(), depl.GetName()))

			depl.CollectMetrics(out)

			if cache := depl.acs.CurrentClusterCache(); cache != nil {
				throttles := cache.GetThrottles()
				for _, component := range throttle.AllComponents() {
					out.Push(i.operatorStateRefreshMetric.Gauge(float64(throttles.Get(component).Count()), depl.GetNamespace(), depl.GetName(), string(component)))
				}
			}

			spec := depl.GetSpec()
			status, _ := depl.GetStatus()

			for _, member := range status.Members.AsList() {
				out.Push(i.deploymentMetricsMembersMetric.Gauge(1, depl.GetNamespace(), depl.GetName(), member.Group.AsRole(), member.Member.ID))
			}

			// Everything below is agency related.
			if !spec.Mode.Get().HasAgents() {
				continue
			}

			agency, agencyOk := depl.GetAgencyCache()
			if !agencyOk {
				out.Push(i.deploymentAgencyStateMetric.Gauge(0, depl.GetNamespace(), depl.GetName()))
				continue
			}

			out.Push(i.deploymentAgencyStateMetric.Gauge(1, depl.GetNamespace(), depl.GetName()))

			// Shard distribution is only reported for cluster deployments.
			if spec.Mode.Get() != api.DeploymentModeCluster {
				continue
			}

			for db, collections := range agency.Current.Collections {
				for collection, shards := range collections {
					for shard, details := range shards {
						for position, server := range details.Servers {
							// Use the planned collection name when the plan knows the collection.
							collectionName := "UNKNOWN"
							if planned, ok := agency.Plan.Collections[db][collection]; ok {
								collectionName = planned.GetName(collectionName)
							}

							labels := []string{
								depl.GetNamespace(),
								depl.GetName(),
								db,
								collectionName,
								shard,
								string(server),
							}

							// The first server in the list is additionally reported as shard leader.
							if position == 0 {
								out.Push(i.deploymentShardLeadersMetric.Gauge(1, labels...))
							}

							out.Push(i.deploymentShardsMetric.Gauge(1, labels...))
						}
					}
				}
			}
		}
	}
}
// Add starts tracking the given deployment. A deployment already stored under
// the same namespace and name is replaced.
func (i *inventory) Add(d *Deployment) {
	i.lock.Lock()
	defer i.lock.Unlock()

	name, namespace := d.GetName(), d.GetNamespace()

	byName, known := i.deployments[namespace]
	if !known {
		// First deployment seen in this namespace.
		byName = map[string]*Deployment{}
		i.deployments[namespace] = byName
	}

	byName[name] = d
}

View file

@ -20,7 +20,146 @@
package resources
import (
"sync"
"fmt"
"github.com/arangodb/kube-arangodb/pkg/generated/metric_descriptions"
"github.com/arangodb/kube-arangodb/pkg/util/metrics"
)
const (
// Component name for metrics of this package.
// NOTE(review): no reference to this constant is visible in the shown part
// of this file; confirm it is still used elsewhere in the package.
metricsComponent = "deployment_resources"
)
// Metrics accumulates, per deployment member, how often containers were
// reported with a given exit code. The zero value is ready to use and all
// Inc* methods are safe to call on a nil *Metrics (they do nothing then).
type Metrics struct {
	// lock guards Members and every map reachable from it.
	lock sync.Mutex

	// Members maps a member ID to the counters recorded for that member.
	Members map[string]MetricMember
}

// IncMemberContainerRestarts records one more exit with the given code for a
// regular container of member id.
func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) {
	m.incRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.ContainerRestarts
	})
}

// IncMemberInitContainerRestarts records one more exit with the given code for
// an init container of member id.
func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int32) {
	m.incRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.InitContainerRestarts
	})
}

// IncMemberEphemeralContainerRestarts records one more exit with the given
// code for an ephemeral container of member id.
func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code int32) {
	m.incRestarts(id, container, code, func(member *MetricMember) *map[string]MetricMemberRestarts {
		return &member.EphemeralContainerRestarts
	})
}

// incRestarts increments the counter for (id, container, code) inside the
// restart group chosen by group, lazily allocating every missing map on the
// way. It is a no-op on a nil receiver.
func (m *Metrics) incRestarts(id, container string, code int32, group func(member *MetricMember) *map[string]MetricMemberRestarts) {
	if m == nil {
		return
	}

	m.lock.Lock()
	defer m.lock.Unlock()

	if m.Members == nil {
		m.Members = map[string]MetricMember{}
	}

	// MetricMember is stored by value, so work on a copy and write it back.
	member := m.Members[id]

	restarts := group(&member)
	if *restarts == nil {
		*restarts = map[string]MetricMemberRestarts{}
	}

	codes := (*restarts)[container]
	if codes == nil {
		codes = MetricMemberRestarts{}
		(*restarts)[container] = codes
	}
	codes[code]++

	m.Members[id] = member
}

// MetricMember holds the exit code counters of a single member, split by
// container type and keyed by container name.
type MetricMember struct {
	ContainerRestarts          map[string]MetricMemberRestarts
	InitContainerRestarts      map[string]MetricMemberRestarts
	EphemeralContainerRestarts map[string]MetricMemberRestarts
}

// MetricMemberRestarts maps an exit code to the number of times it was recorded.
type MetricMemberRestarts map[int32]uint64
// CollectMetrics pushes one unexpected-container-exit-code metric per
// (member, container, container type, exit code) combination recorded so far.
//
// The counters are mutated by the Metrics.Inc* methods under metrics.lock, so
// the same lock is held here for the whole iteration: reading the maps while
// another goroutine writes to them would be a data race.
func (r *Resources) CollectMetrics(m metrics.PushMetric) {
	r.metrics.lock.Lock()
	defer r.metrics.lock.Unlock()

	for member, info := range r.metrics.Members {
		// Container type label value paired with the counters it describes.
		groups := [...]struct {
			containerType string
			restarts      map[string]MetricMemberRestarts
		}{
			{"container", info.ContainerRestarts},
			{"initContainer", info.InitContainerRestarts},
			{"ephemeralContainer", info.EphemeralContainerRestarts},
		}

		for _, group := range groups {
			for container, codes := range group.restarts {
				for code, count := range codes {
					m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), r.namespace, r.name, member, container, group.containerType, fmt.Sprintf("%d", code)))
				}
			}
		}
	}
}

View file

@ -0,0 +1,93 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package resources
import (
"testing"
"github.com/stretchr/testify/require"
)
// Test_MetricsInc_Container verifies that regular container exit codes are
// counted independently per member, container name and exit code.
func Test_MetricsInc_Container(t *testing.T) {
	var m Metrics

	// expect asserts the counter currently stored for the given key.
	expect := func(want int, id, container string, code int32) {
		require.EqualValues(t, want, m.Members[id].ContainerRestarts[container][code])
	}

	m.IncMemberContainerRestarts("ID", "server", 137)
	expect(1, "ID", "server", 137)

	m.IncMemberContainerRestarts("ID", "server", 137)
	expect(2, "ID", "server", 137)

	// A second container of the same member has its own counter.
	m.IncMemberContainerRestarts("ID", "server2", 137)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)

	// Another member and another exit code do not touch existing counters.
	m.IncMemberContainerRestarts("ID2", "server", 137)
	m.IncMemberContainerRestarts("ID", "server", 138)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)
	expect(1, "ID2", "server", 137)
	expect(1, "ID", "server", 138)
}
// Test_MetricsInc_InitContainer verifies that init container exit codes are
// counted independently per member, container name and exit code.
func Test_MetricsInc_InitContainer(t *testing.T) {
	var m Metrics

	// expect asserts the counter currently stored for the given key.
	expect := func(want int, id, container string, code int32) {
		require.EqualValues(t, want, m.Members[id].InitContainerRestarts[container][code])
	}

	m.IncMemberInitContainerRestarts("ID", "server", 137)
	expect(1, "ID", "server", 137)

	m.IncMemberInitContainerRestarts("ID", "server", 137)
	expect(2, "ID", "server", 137)

	// A second container of the same member has its own counter.
	m.IncMemberInitContainerRestarts("ID", "server2", 137)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)

	// Another member and another exit code do not touch existing counters.
	m.IncMemberInitContainerRestarts("ID2", "server", 137)
	m.IncMemberInitContainerRestarts("ID", "server", 138)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)
	expect(1, "ID2", "server", 137)
	expect(1, "ID", "server", 138)
}
// Test_MetricsInc_EphemeralContainer verifies that ephemeral container exit
// codes are counted independently per member, container name and exit code.
func Test_MetricsInc_EphemeralContainer(t *testing.T) {
	var m Metrics

	// expect asserts the counter currently stored for the given key.
	expect := func(want int, id, container string, code int32) {
		require.EqualValues(t, want, m.Members[id].EphemeralContainerRestarts[container][code])
	}

	m.IncMemberEphemeralContainerRestarts("ID", "server", 137)
	expect(1, "ID", "server", 137)

	m.IncMemberEphemeralContainerRestarts("ID", "server", 137)
	expect(2, "ID", "server", 137)

	// A second container of the same member has its own counter.
	m.IncMemberEphemeralContainerRestarts("ID", "server2", 137)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)

	// Another member and another exit code do not touch existing counters.
	m.IncMemberEphemeralContainerRestarts("ID2", "server", 137)
	m.IncMemberEphemeralContainerRestarts("ID", "server", 138)
	expect(2, "ID", "server", 137)
	expect(1, "ID", "server2", 137)
	expect(1, "ID2", "server", 137)
	expect(1, "ID", "server", 138)
}

View file

@ -178,6 +178,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
Time("started", t.StartedAt.Time).
Time("finished", t.FinishedAt.Time).
Warn("Pod failed in unexpected way: Init Container failed")
r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.ExitCode)
}
}
}
@ -198,6 +200,8 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
Time("started", t.StartedAt.Time).
Time("finished", t.FinishedAt.Time).
Warn("Pod failed in unexpected way: Core Container failed")
r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.ExitCode)
}
}
}

View file

@ -28,6 +28,8 @@ type Resources struct {
log logging.Logger
namespace, name string
context Context
metrics Metrics
}
// NewResources creates a new Resources service, used to

View file

@ -0,0 +1,39 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
package metric_descriptions
import "github.com/arangodb/kube-arangodb/pkg/util/metrics"
var (
// arangodbOperatorMembersUnexpectedContainerExitCodes describes the
// arangodb_operator_members_unexpected_container_exit_codes metric and fixes
// the order of its labels: namespace, name, member, container,
// container_type, code.
// NOTE(review): this file looks like generator output; presumably changes
// belong in the metrics definition rather than here - confirm.
arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`}, nil)
)
// init hands the metric description to registerDescription so that it becomes
// part of the package's set of registered descriptions.
func init() {
registerDescription(arangodbOperatorMembersUnexpectedContainerExitCodes)
}
// ArangodbOperatorMembersUnexpectedContainerExitCodes returns the description
// of the arangodb_operator_members_unexpected_container_exit_codes metric.
func ArangodbOperatorMembersUnexpectedContainerExitCodes() metrics.Description {
return arangodbOperatorMembersUnexpectedContainerExitCodes
}
// ArangodbOperatorMembersUnexpectedContainerExitCodesCounter builds a sample of
// the arangodb_operator_members_unexpected_container_exit_codes metric with the
// given value; the remaining arguments are the label values, passed on in the
// order namespace, name, member, container, containerType, code.
//
// NOTE(review): although the helper is named ...Counter and the metric is
// documented as a Counter, the sample is produced through Description.Gauge.
// Confirm whether the metric is meant to be exposed with the gauge type.
func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string) metrics.Metric {
return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code)
}