1
0
Fork 0
mirror of https://github.com/arangodb/kube-arangodb.git synced 2024-12-14 11:57:37 +00:00

[Feature] Add Reason in OOM Metric (#1308)

This commit is contained in:
Adam Janikowski 2023-05-08 17:18:45 +07:00 committed by GitHub
parent 8fb93b85f9
commit 9107c053a7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 98 additions and 60 deletions

View file

@ -2,6 +2,7 @@
## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
- (Feature) ArangoBackup create retries and MaxIterations limit
- (Feature) Add Reason in OOM Metric
## [1.2.27](https://github.com/arangodb/kube-arangodb/tree/1.2.27) (2023-04-27)
- (Feature) Add InSync Cache

View file

@ -14,3 +14,4 @@ Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContai
| container | Container Name |
| container_type | Container/InitContainer/EphemeralContainer |
| code | ExitCode |
| reason | Reason |

View file

@ -219,6 +219,8 @@ namespaces:
description: "Container/InitContainer/EphemeralContainer" description: "Container/InitContainer/EphemeralContainer"
- key: code - key: code
description: "ExitCode" description: "ExitCode"
- key: reason
description: "Reason"
engine: engine:
panics_recovered: panics_recovered:
shortDescription: "Number of Panics recovered inside Operator reconciliation loop" shortDescription: "Number of Panics recovered inside Operator reconciliation loop"

View file

@ -39,7 +39,7 @@ type Metrics struct {
Members map[string]MetricMember Members map[string]MetricMember
} }
func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) { func (m *Metrics) IncMemberContainerRestarts(id, container, reason string, code int32) {
if m == nil { if m == nil {
return return
} }
@ -63,14 +63,22 @@ func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) {
cr = MetricMemberRestarts{} cr = MetricMemberRestarts{}
} }
cr[code]++ cd := cr[code]
if cd == nil {
cd = MetricMemberRestartReason{}
}
cd[reason]++
cr[code] = cd
v.ContainerRestarts[container] = cr v.ContainerRestarts[container] = cr
m.Members[id] = v m.Members[id] = v
} }
func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int32) { func (m *Metrics) IncMemberInitContainerRestarts(id, container, reason string, code int32) {
if m == nil { if m == nil {
return return
} }
@ -94,14 +102,22 @@ func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int3
cr = MetricMemberRestarts{} cr = MetricMemberRestarts{}
} }
cr[code]++ cd := cr[code]
if cd == nil {
cd = MetricMemberRestartReason{}
}
cd[reason]++
cr[code] = cd
v.InitContainerRestarts[container] = cr v.InitContainerRestarts[container] = cr
m.Members[id] = v m.Members[id] = v
} }
func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code int32) { func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container, reason string, code int32) {
if m == nil { if m == nil {
return return
} }
@ -125,7 +141,15 @@ func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code
cr = MetricMemberRestarts{} cr = MetricMemberRestarts{}
} }
cr[code]++ cd := cr[code]
if cd == nil {
cd = MetricMemberRestartReason{}
}
cd[reason]++
cr[code] = cd
v.EphemeralContainerRestarts[container] = cr v.EphemeralContainerRestarts[container] = cr
@ -133,31 +157,41 @@ func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code
} }
type MetricMember struct { type MetricMember struct {
ContainerRestarts map[string]MetricMemberRestarts ContainerRestarts MetricContainerRestarts
InitContainerRestarts map[string]MetricMemberRestarts InitContainerRestarts MetricContainerRestarts
EphemeralContainerRestarts map[string]MetricMemberRestarts EphemeralContainerRestarts MetricContainerRestarts
} }
type MetricMemberRestarts map[int32]uint64 type MetricContainerRestarts map[string]MetricMemberRestarts
type MetricMemberRestarts map[int32]MetricMemberRestartReason
type MetricMemberRestartReason map[string]uint64
func (d *Resources) CollectMetrics(m metrics.PushMetric) { func (d *Resources) CollectMetrics(m metrics.PushMetric) {
for member, info := range d.metrics.Members { for member, info := range d.metrics.Members {
// Containers // Containers
for container, restarts := range info.ContainerRestarts { for container, restarts := range info.ContainerRestarts {
for code, count := range restarts { for code, reasons := range restarts {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code))) for reason, count := range reasons {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code), reason))
}
} }
} }
// InitContainers // InitContainers
for container, restarts := range info.InitContainerRestarts { for container, restarts := range info.InitContainerRestarts {
for code, count := range restarts { for code, reasons := range restarts {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code))) for reason, count := range reasons {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code), reason))
}
} }
} }
// EphemeralContainers // EphemeralContainers
for container, restarts := range info.EphemeralContainerRestarts { for container, restarts := range info.EphemeralContainerRestarts {
for code, count := range restarts { for code, reasons := range restarts {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code))) for reason, count := range reasons {
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code), reason))
}
} }
} }
} }

View file

@ -29,65 +29,65 @@ import (
func Test_MetricsInc_Container(t *testing.T) { func Test_MetricsInc_Container(t *testing.T) {
var m Metrics var m Metrics
m.IncMemberContainerRestarts("ID", "server", 137) m.IncMemberContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"])
m.IncMemberContainerRestarts("ID", "server", 137) m.IncMemberContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"])
m.IncMemberContainerRestarts("ID", "server2", 137) m.IncMemberContainerRestarts("ID", "server2", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]["OOMKill"])
m.IncMemberContainerRestarts("ID2", "server", 137) m.IncMemberContainerRestarts("ID2", "server", "OOMKill", 137)
m.IncMemberContainerRestarts("ID", "server", 138) m.IncMemberContainerRestarts("ID", "server", "OOMKill", 138)
require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID2"].ContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID2"].ContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][138]) require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][138]["OOMKill"])
} }
func Test_MetricsInc_InitContainer(t *testing.T) { func Test_MetricsInc_InitContainer(t *testing.T) {
var m Metrics var m Metrics
m.IncMemberInitContainerRestarts("ID", "server", 137) m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"])
m.IncMemberInitContainerRestarts("ID", "server", 137) m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"])
m.IncMemberInitContainerRestarts("ID", "server2", 137) m.IncMemberInitContainerRestarts("ID", "server2", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]["OOMKill"])
m.IncMemberInitContainerRestarts("ID2", "server", 137) m.IncMemberInitContainerRestarts("ID2", "server", "OOMKill", 137)
m.IncMemberInitContainerRestarts("ID", "server", 138) m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 138)
require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID2"].InitContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID2"].InitContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][138]) require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][138]["OOMKill"])
} }
func Test_MetricsInc_EphemeralContainer(t *testing.T) { func Test_MetricsInc_EphemeralContainer(t *testing.T) {
var m Metrics var m Metrics
m.IncMemberEphemeralContainerRestarts("ID", "server", 137) m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"])
m.IncMemberEphemeralContainerRestarts("ID", "server", 137) m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"])
m.IncMemberEphemeralContainerRestarts("ID", "server2", 137) m.IncMemberEphemeralContainerRestarts("ID", "server2", "OOMKill", 137)
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]["OOMKill"])
m.IncMemberEphemeralContainerRestarts("ID2", "server", 137) m.IncMemberEphemeralContainerRestarts("ID2", "server", "OOMKill", 137)
m.IncMemberEphemeralContainerRestarts("ID", "server", 138) m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 138)
require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]) require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID2"].EphemeralContainerRestarts["server"][137]) require.EqualValues(t, 1, m.Members["ID2"].EphemeralContainerRestarts["server"][137]["OOMKill"])
require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][138]) require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][138]["OOMKill"])
} }

View file

@ -205,7 +205,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
Time("finished", t.FinishedAt.Time). Time("finished", t.FinishedAt.Time).
Warn("Pod failed in unexpected way: Init Container failed") Warn("Pod failed in unexpected way: Init Container failed")
r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.ExitCode) r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.Reason, t.ExitCode)
} }
} }
} }
@ -227,7 +227,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
Time("finished", t.FinishedAt.Time). Time("finished", t.FinishedAt.Time).
Warn("Pod failed in unexpected way: Core Container failed") Warn("Pod failed in unexpected way: Core Container failed")
r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.ExitCode) r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.Reason, t.ExitCode)
} }
} }
} }

View file

@ -23,7 +23,7 @@ package metric_descriptions
import "github.com/arangodb/kube-arangodb/pkg/util/metrics" import "github.com/arangodb/kube-arangodb/pkg/util/metrics"
var ( var (
arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`}, nil) arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`, `reason`}, nil)
) )
func init() { func init() {
@ -34,6 +34,6 @@ func ArangodbOperatorMembersUnexpectedContainerExitCodes() metrics.Description {
return arangodbOperatorMembersUnexpectedContainerExitCodes return arangodbOperatorMembersUnexpectedContainerExitCodes
} }
func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string) metrics.Metric { func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string, reason string) metrics.Metric {
return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code) return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code, reason)
} }