diff --git a/CHANGELOG.md b/CHANGELOG.md index 9570190d1..fdbf5276c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A) - (Feature) ArangoBackup create retries and MaxIterations limit +- (Feature) Add Reason in OOM Metric ## [1.2.27](https://github.com/arangodb/kube-arangodb/tree/1.2.27) (2023-04-27) - (Feature) Add InSync Cache diff --git a/docs/generated/metrics/arangodb_operator_members_unexpected_container_exit_codes.md b/docs/generated/metrics/arangodb_operator_members_unexpected_container_exit_codes.md index 14cda09d5..73bbb8fbb 100644 --- a/docs/generated/metrics/arangodb_operator_members_unexpected_container_exit_codes.md +++ b/docs/generated/metrics/arangodb_operator_members_unexpected_container_exit_codes.md @@ -14,3 +14,4 @@ Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContai | container | Container Name | | container_type | Container/InitContainer/EphemeralContainer | | code | ExitCode | +| reason | Reason | diff --git a/internal/metrics.yaml b/internal/metrics.yaml index aecb0bb1a..a6d3b4b38 100644 --- a/internal/metrics.yaml +++ b/internal/metrics.yaml @@ -219,6 +219,8 @@ namespaces: description: "Container/InitContainer/EphemeralContainer" - key: code description: "ExitCode" + - key: reason + description: "Reason" engine: panics_recovered: shortDescription: "Number of Panics recovered inside Operator reconciliation loop" diff --git a/pkg/deployment/resources/metrics.go b/pkg/deployment/resources/metrics.go index 40f50d4a7..8cb1f148b 100644 --- a/pkg/deployment/resources/metrics.go +++ b/pkg/deployment/resources/metrics.go @@ -39,7 +39,7 @@ type Metrics struct { Members map[string]MetricMember } -func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) { +func (m *Metrics) IncMemberContainerRestarts(id, container, reason string, code int32) { if m == nil { return } @@ -63,14 +63,22 @@ func (m *Metrics) IncMemberContainerRestarts(id, container string, code int32) { cr = MetricMemberRestarts{} } - cr[code]++ + cd := cr[code] + + if cd == nil { + cd = MetricMemberRestartReason{} + } + + cd[reason]++ + + cr[code] = cd v.ContainerRestarts[container] = cr m.Members[id] = v } -func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int32) { +func (m *Metrics) IncMemberInitContainerRestarts(id, container, reason string, code int32) { if m == nil { return } @@ -94,14 +102,22 @@ func (m *Metrics) IncMemberInitContainerRestarts(id, container string, code int3 cr = MetricMemberRestarts{} } - cr[code]++ + cd := cr[code] + + if cd == nil { + cd = MetricMemberRestartReason{} + } + + cd[reason]++ + + cr[code] = cd v.InitContainerRestarts[container] = cr m.Members[id] = v } -func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code int32) { +func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container, reason string, code int32) { if m == nil { return } @@ -125,7 +141,15 @@ func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code cr = MetricMemberRestarts{} } - cr[code]++ + cd := cr[code] + + if cd == nil { + cd = MetricMemberRestartReason{} + } + + cd[reason]++ + + cr[code] = cd v.EphemeralContainerRestarts[container] = cr @@ -133,31 +157,41 @@ func (m *Metrics) IncMemberEphemeralContainerRestarts(id, container string, code } type MetricMember struct { - ContainerRestarts map[string]MetricMemberRestarts - InitContainerRestarts map[string]MetricMemberRestarts - EphemeralContainerRestarts map[string]MetricMemberRestarts + ContainerRestarts MetricContainerRestarts + InitContainerRestarts MetricContainerRestarts + EphemeralContainerRestarts MetricContainerRestarts } -type MetricMemberRestarts map[int32]uint64 +type MetricContainerRestarts map[string]MetricMemberRestarts + +type MetricMemberRestarts map[int32]MetricMemberRestartReason + +type MetricMemberRestartReason map[string]uint64 func (d *Resources) CollectMetrics(m metrics.PushMetric) { for member, info := range d.metrics.Members { // Containers for container, restarts := range info.ContainerRestarts { - for code, count := range restarts { - m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code))) + for code, reasons := range restarts { + for reason, count := range reasons { + m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code), reason)) + } } } // InitContainers for container, restarts := range info.InitContainerRestarts { - for code, count := range restarts { - m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code))) + for code, reasons := range restarts { + for reason, count := range reasons { + m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code), reason)) + } } } // EphemeralContainers for container, restarts := range info.EphemeralContainerRestarts { - for code, count := range restarts { - m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code))) + for code, reasons := range restarts { + for reason, count := range reasons { + m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code), reason)) + } } } } diff --git a/pkg/deployment/resources/metrics_test.go b/pkg/deployment/resources/metrics_test.go index 340b327a6..18f09d61a 100644 --- a/pkg/deployment/resources/metrics_test.go +++ b/pkg/deployment/resources/metrics_test.go @@ -29,65 +29,65 @@ import ( func Test_MetricsInc_Container(t *testing.T) { var m Metrics - m.IncMemberContainerRestarts("ID", "server", 137) + m.IncMemberContainerRestarts("ID", "server", "OOMKill", 137) - require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][137]) + require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberContainerRestarts("ID", "server", 137) - require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) + m.IncMemberContainerRestarts("ID", "server", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberContainerRestarts("ID", "server2", 137) - require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]) + m.IncMemberContainerRestarts("ID", "server2", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]["OOMKill"]) - m.IncMemberContainerRestarts("ID2", "server", 137) - m.IncMemberContainerRestarts("ID", "server", 138) - require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]) - require.EqualValues(t, 1, m.Members["ID2"].ContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][138]) + m.IncMemberContainerRestarts("ID2", "server", "OOMKill", 137) + m.IncMemberContainerRestarts("ID", "server", "OOMKill", 138) + require.EqualValues(t, 2, m.Members["ID"].ContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server2"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID2"].ContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].ContainerRestarts["server"][138]["OOMKill"]) } func Test_MetricsInc_InitContainer(t *testing.T) { var m Metrics - m.IncMemberInitContainerRestarts("ID", "server", 137) + m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 137) - require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][137]) + require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberInitContainerRestarts("ID", "server", 137) - require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) + m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberInitContainerRestarts("ID", "server2", 137) - require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]) + m.IncMemberInitContainerRestarts("ID", "server2", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]["OOMKill"]) - m.IncMemberInitContainerRestarts("ID2", "server", 137) - m.IncMemberInitContainerRestarts("ID", "server", 138) - require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]) - require.EqualValues(t, 1, m.Members["ID2"].InitContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][138]) + m.IncMemberInitContainerRestarts("ID2", "server", "OOMKill", 137) + m.IncMemberInitContainerRestarts("ID", "server", "OOMKill", 138) + require.EqualValues(t, 2, m.Members["ID"].InitContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server2"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID2"].InitContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].InitContainerRestarts["server"][138]["OOMKill"]) } func Test_MetricsInc_EphemeralContainer(t *testing.T) { var m Metrics - m.IncMemberEphemeralContainerRestarts("ID", "server", 137) + m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 137) - require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][137]) + require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberEphemeralContainerRestarts("ID", "server", 137) - require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) + m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"]) - m.IncMemberEphemeralContainerRestarts("ID", "server2", 137) - require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]) + m.IncMemberEphemeralContainerRestarts("ID", "server2", "OOMKill", 137) + require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]["OOMKill"]) - m.IncMemberEphemeralContainerRestarts("ID2", "server", 137) - m.IncMemberEphemeralContainerRestarts("ID", "server", 138) - require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]) - require.EqualValues(t, 1, m.Members["ID2"].EphemeralContainerRestarts["server"][137]) - require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][138]) + m.IncMemberEphemeralContainerRestarts("ID2", "server", "OOMKill", 137) + m.IncMemberEphemeralContainerRestarts("ID", "server", "OOMKill", 138) + require.EqualValues(t, 2, m.Members["ID"].EphemeralContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server2"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID2"].EphemeralContainerRestarts["server"][137]["OOMKill"]) + require.EqualValues(t, 1, m.Members["ID"].EphemeralContainerRestarts["server"][138]["OOMKill"]) } diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index eb3e603f8..f759e947f 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -205,7 +205,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter Time("finished", t.FinishedAt.Time). Warn("Pod failed in unexpected way: Init Container failed") - r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.ExitCode) + r.metrics.IncMemberInitContainerRestarts(memberStatus.ID, container, t.Reason, t.ExitCode) } } } @@ -227,7 +227,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter Time("finished", t.FinishedAt.Time). Warn("Pod failed in unexpected way: Core Container failed") - r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.ExitCode) + r.metrics.IncMemberContainerRestarts(memberStatus.ID, container, t.Reason, t.ExitCode) } } } diff --git a/pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go b/pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go index 24974853c..831c50c5a 100644 --- a/pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go +++ b/pkg/generated/metric_descriptions/arangodb_operator_members_unexpected_container_exit_codes.go @@ -23,7 +23,7 @@ package metric_descriptions import "github.com/arangodb/kube-arangodb/pkg/util/metrics" var ( - arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`}, nil) + arangodbOperatorMembersUnexpectedContainerExitCodes = metrics.NewDescription("arangodb_operator_members_unexpected_container_exit_codes", "Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers)", []string{`namespace`, `name`, `member`, `container`, `container_type`, `code`, `reason`}, nil) ) func init() { @@ -34,6 +34,6 @@ func ArangodbOperatorMembersUnexpectedContainerExitCodes() metrics.Description { return arangodbOperatorMembersUnexpectedContainerExitCodes } -func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string) metrics.Metric { - return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code) +func ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(value float64, namespace string, name string, member string, container string, containerType string, code string, reason string) metrics.Metric { + return ArangodbOperatorMembersUnexpectedContainerExitCodes().Gauge(value, namespace, name, member, container, containerType, code, reason) }