From 056aabb5158e81fd3a5057fe276437c45417a3f8 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 30 Mar 2018 15:40:11 +0200 Subject: [PATCH] Adding chaos-monkey for deployments --- Makefile | 7 +- examples/simple-cluster.yaml | 2 + main.go | 8 +- .../templates/deployment/deployment.yaml | 1 + pkg/apis/deployment/v1alpha/chaos_spec.go | 91 +++++++++++++++++++ .../deployment/v1alpha/deployment_spec.go | 7 ++ pkg/apis/deployment/v1alpha/percent.go | 62 +++++++++++++ .../v1alpha/zz_generated.deepcopy.go | 44 +++++++++ pkg/deployment/chaos/context.go | 40 ++++++++ pkg/deployment/chaos/errors.go | 29 ++++++ pkg/deployment/chaos/monkey.go | 90 ++++++++++++++++++ pkg/deployment/deployment.go | 7 ++ pkg/operator/operator.go | 2 +- pkg/operator/operator_deployment.go | 1 + tools/manifests/manifest_builder.go | 4 + tools/release/release.go | 1 + 16 files changed, 391 insertions(+), 5 deletions(-) create mode 100644 pkg/apis/deployment/v1alpha/chaos_spec.go create mode 100644 pkg/apis/deployment/v1alpha/percent.go create mode 100644 pkg/deployment/chaos/context.go create mode 100644 pkg/deployment/chaos/errors.go create mode 100644 pkg/deployment/chaos/monkey.go diff --git a/Makefile b/Makefile index 7228ffb77..9b41671a0 100644 --- a/Makefile +++ b/Makefile @@ -65,6 +65,10 @@ ifndef ENTERPRISEIMAGE ENTERPRISEIMAGE := $(DEFAULTENTERPRISEIMAGE) endif +ifndef ALLOWCHAOS + ALLOWCHAOS := true +endif + BINNAME := $(PROJECT) BIN := $(BINDIR)/$(BINNAME) TESTBINNAME := $(PROJECT)_test @@ -200,7 +204,8 @@ manifests: $(GOBUILDDIR) --output-suffix=$(MANIFESTSUFFIX) \ --image=$(OPERATORIMAGE) \ --image-sha256=$(IMAGESHA256) \ - --namespace=$(DEPLOYMENTNAMESPACE) + --namespace=$(DEPLOYMENTNAMESPACE) \ + --allow-chaos=$(ALLOWCHAOS) # Testing diff --git a/examples/simple-cluster.yaml b/examples/simple-cluster.yaml index 52fb63621..5fcce829e 100644 --- a/examples/simple-cluster.yaml +++ b/examples/simple-cluster.yaml @@ -10,3 +10,5 @@ spec: coordinators: args: - --log.level=true + chaos: 
+ enabled: true diff --git a/main.go b/main.go index a066187a3..9eb38df86 100644 --- a/main.go +++ b/main.go @@ -78,7 +78,9 @@ var ( operatorOptions struct { enableDeployment bool // Run deployment operator enableStorage bool // Run deployment operator - createCRD bool + } + chaosOptions struct { + allowed bool } ) @@ -89,7 +91,7 @@ func init() { f.StringVar(&logLevel, "log.level", defaultLogLevel, "Set initial log level") f.BoolVar(&operatorOptions.enableDeployment, "operator.deployment", false, "Enable to run the ArangoDeployment operator") f.BoolVar(&operatorOptions.enableStorage, "operator.storage", false, "Enable to run the ArangoLocalStorage operator") - f.BoolVar(&operatorOptions.createCRD, "operator.create-crd", true, "Disable to avoid create the custom resource definition") + f.BoolVar(&chaosOptions.allowed, "chaos.allowed", false, "Set to allow chaos in deployments. Only activated when allowed and enabled in deployment") } func main() { @@ -183,7 +185,7 @@ func newOperatorConfigAndDeps(id, namespace, name string) (operator.Config, oper ServiceAccount: serviceAccount, EnableDeployment: operatorOptions.enableDeployment, EnableStorage: operatorOptions.enableStorage, - CreateCRD: operatorOptions.createCRD, + AllowChaos: chaosOptions.allowed, } deps := operator.Dependencies{ LogService: logService, diff --git a/manifests/templates/deployment/deployment.yaml b/manifests/templates/deployment/deployment.yaml index 09166b22b..1ed02f42f 100644 --- a/manifests/templates/deployment/deployment.yaml +++ b/manifests/templates/deployment/deployment.yaml @@ -17,6 +17,7 @@ spec: image: {{ .Image }} args: - --operator.deployment + - --chaos.allowed={{ .Deployment.AllowChaos }} env: - name: MY_POD_NAMESPACE valueFrom: diff --git a/pkg/apis/deployment/v1alpha/chaos_spec.go b/pkg/apis/deployment/v1alpha/chaos_spec.go new file mode 100644 index 000000000..9a8519658 --- /dev/null +++ b/pkg/apis/deployment/v1alpha/chaos_spec.go @@ -0,0 +1,91 @@ +// +// DISCLAIMER +// +// 
Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package v1alpha + +import ( + time "time" + + "github.com/arangodb/kube-arangodb/pkg/util" + "github.com/pkg/errors" +) + +// ChaosSpec holds configuration for the deployment chaos monkey. +type ChaosSpec struct { + // Enabled switches the chaos monkey for a deployment on or off. + Enabled *bool `json:"enabled,omitempty"` + // Interval is the time between events + Interval *time.Duration `json:"interval,omitempty"` + // KillPodProbability is the chance of a pod being killed during an event + KillPodProbability *Percent `json:"kill-pod-probability,omitempty"` +} + +// IsEnabled returns the value of enabled. +func (s ChaosSpec) IsEnabled() bool { + return util.BoolOrDefault(s.Enabled) +} + +// GetInterval returns the value of interval. +func (s ChaosSpec) GetInterval() time.Duration { + return util.DurationOrDefault(s.Interval) +} + +// GetKillPodProbability returns the value of kill-pod-probability. 
+func (s ChaosSpec) GetKillPodProbability() Percent { + return PercentOrDefault(s.KillPodProbability) +} + +// Validate the given spec +func (s ChaosSpec) Validate() error { + if s.IsEnabled() { + if s.GetInterval() <= 0 { + return maskAny(errors.Wrapf(ValidationError, "Interval must be > 0")) + } + if err := s.GetKillPodProbability().Validate(); err != nil { + return maskAny(err) + } + } + return nil +} + +// SetDefaults fills in missing defaults +func (s *ChaosSpec) SetDefaults() { + if s.GetInterval() == 0 { + s.Interval = util.NewDuration(time.Minute) + } + if s.KillPodProbability == nil { // 0 is a valid explicit probability; only default when unset + s.KillPodProbability = NewPercent(50) + } +} + +// SetDefaultsFrom fills unspecified fields with a value from given source spec. +func (s *ChaosSpec) SetDefaultsFrom(source ChaosSpec) { + if s.Enabled == nil { + s.Enabled = util.NewBoolOrNil(source.Enabled) + } + if s.Interval == nil { + s.Interval = util.NewDurationOrNil(source.Interval) + } + if s.KillPodProbability == nil { + s.KillPodProbability = NewPercentOrNil(source.KillPodProbability) + } +} diff --git a/pkg/apis/deployment/v1alpha/deployment_spec.go b/pkg/apis/deployment/v1alpha/deployment_spec.go index 5954719bc..7f8485cb3 100644 --- a/pkg/apis/deployment/v1alpha/deployment_spec.go +++ b/pkg/apis/deployment/v1alpha/deployment_spec.go @@ -62,6 +62,8 @@ type DeploymentSpec struct { Coordinators ServerGroupSpec `json:"coordinators"` SyncMasters ServerGroupSpec `json:"syncmasters"` SyncWorkers ServerGroupSpec `json:"syncworkers"` + + Chaos ChaosSpec `json:"chaos"` } // GetMode returns the value of mode. 
@@ -147,6 +149,7 @@ func (s *DeploymentSpec) SetDefaults(deploymentName string) { s.Coordinators.SetDefaults(ServerGroupCoordinators, s.GetMode().HasCoordinators(), s.GetMode()) s.SyncMasters.SetDefaults(ServerGroupSyncMasters, s.Sync.IsEnabled(), s.GetMode()) s.SyncWorkers.SetDefaults(ServerGroupSyncWorkers, s.Sync.IsEnabled(), s.GetMode()) + s.Chaos.SetDefaults() } // SetDefaultsFrom fills unspecified fields with a value from given source spec. @@ -176,6 +179,7 @@ func (s *DeploymentSpec) SetDefaultsFrom(source DeploymentSpec) { s.Coordinators.SetDefaultsFrom(source.Coordinators) s.SyncMasters.SetDefaultsFrom(source.SyncMasters) s.SyncWorkers.SetDefaultsFrom(source.SyncWorkers) + s.Chaos.SetDefaultsFrom(source.Chaos) } // Validate the specification. @@ -226,6 +230,9 @@ func (s *DeploymentSpec) Validate() error { if err := s.SyncWorkers.Validate(ServerGroupSyncWorkers, s.Sync.IsEnabled(), s.GetMode(), s.GetEnvironment()); err != nil { return maskAny(err) } + if err := s.Chaos.Validate(); err != nil { + return maskAny(errors.Wrap(err, "spec.chaos")) + } return nil } diff --git a/pkg/apis/deployment/v1alpha/percent.go b/pkg/apis/deployment/v1alpha/percent.go new file mode 100644 index 000000000..3d7c9195b --- /dev/null +++ b/pkg/apis/deployment/v1alpha/percent.go @@ -0,0 +1,62 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package v1alpha + +import ( + "github.com/pkg/errors" +) + +// Percent is a percentage between 0 and 100. +type Percent int + +// Validate the given percentage. +func (p Percent) Validate() error { + if p < 0 || p > 100 { + return maskAny(errors.Wrapf(ValidationError, "Percentage must be between 0 and 100, got %d", int(p))) + } + return nil +} + +// NewPercent returns a reference to a percent with given value. +func NewPercent(input Percent) *Percent { + return &input +} + +// NewPercentOrNil returns nil if input is nil, otherwise returns a clone of the given value. +func NewPercentOrNil(input *Percent) *Percent { + if input == nil { + return nil + } + return NewPercent(*input) +} + +// PercentOrDefault returns the default value or 0 if input is nil, otherwise returns the referenced value. +func PercentOrDefault(input *Percent, defaultValue ...Percent) Percent { + if input == nil { + if len(defaultValue) > 0 { + return defaultValue[0] + } + return 0 + } + return *input +} diff --git a/pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go b/pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go index 61ff324f7..d4b0b1756 100644 --- a/pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go +++ b/pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go @@ -144,6 +144,49 @@ func (in *AuthenticationSpec) DeepCopy() *AuthenticationSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ChaosSpec) DeepCopyInto(out *ChaosSpec) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + if *in == nil { + *out = nil + } else { + *out = new(bool) + **out = **in + } + } + if in.Interval != nil { + in, out := &in.Interval, &out.Interval + if *in == nil { + *out = nil + } else { + *out = new(time.Duration) + **out = **in + } + } + if in.KillPodProbability != nil { + in, out := &in.KillPodProbability, &out.KillPodProbability + if *in == nil { + *out = nil + } else { + *out = new(Percent) + **out = **in + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ChaosSpec. +func (in *ChaosSpec) DeepCopy() *ChaosSpec { + if in == nil { + return nil + } + out := new(ChaosSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Condition) DeepCopyInto(out *Condition) { *out = *in @@ -220,6 +263,7 @@ func (in *DeploymentSpec) DeepCopyInto(out *DeploymentSpec) { in.Coordinators.DeepCopyInto(&out.Coordinators) in.SyncMasters.DeepCopyInto(&out.SyncMasters) in.SyncWorkers.DeepCopyInto(&out.SyncWorkers) + in.Chaos.DeepCopyInto(&out.Chaos) return } diff --git a/pkg/deployment/chaos/context.go b/pkg/deployment/chaos/context.go new file mode 100644 index 000000000..d2d2a4d8a --- /dev/null +++ b/pkg/deployment/chaos/context.go @@ -0,0 +1,40 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package chaos + +import ( + "k8s.io/api/core/v1" + + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" +) + +// Context provides methods to the chaos package. +type Context interface { + // GetSpec returns the current specification of the deployment + GetSpec() api.DeploymentSpec + // DeletePod deletes a pod with given name in the namespace + // of the deployment. If the pod does not exist, the error is ignored. + DeletePod(podName string) error + // GetOwnedPods returns a list of all pods owned by the deployment. + GetOwnedPods() ([]v1.Pod, error) +} diff --git a/pkg/deployment/chaos/errors.go b/pkg/deployment/chaos/errors.go new file mode 100644 index 000000000..238d90df4 --- /dev/null +++ b/pkg/deployment/chaos/errors.go @@ -0,0 +1,29 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package chaos + +import "github.com/pkg/errors" + +var ( + maskAny = errors.WithStack +) diff --git a/pkg/deployment/chaos/monkey.go b/pkg/deployment/chaos/monkey.go new file mode 100644 index 000000000..805439517 --- /dev/null +++ b/pkg/deployment/chaos/monkey.go @@ -0,0 +1,90 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package chaos + +import ( + "math/rand" + "time" + + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" +) + +// Monkey is the service that introduces chaos in the deployment +// if allowed and enabled. +type Monkey struct { + log zerolog.Logger + context Context +} + +// NewMonkey creates a new chaos monkey with given context. +func NewMonkey(log zerolog.Logger, context Context) *Monkey { + log = log.With().Str("component", "chaos-monkey").Logger() + return &Monkey{ + log: log, + context: context, + } +} + +// Run the monkey until the given channel is closed. 
+func (m Monkey) Run(stopCh <-chan struct{}) { + for { + spec := m.context.GetSpec() + if spec.Chaos.IsEnabled() { + // Gamble to set if we must introduce chaos + chance := float64(spec.Chaos.GetKillPodProbability()) / 100.0 + if rand.Float64() < chance { + // Let's introduce pod chaos + if err := m.killRandomPod(); err != nil { + log.Info().Err(err).Msg("Failed to kill random pod") + } + } + } + + select { + case <-time.After(spec.Chaos.GetInterval()): + // Continue + case <-stopCh: + // We're done + return + } + } +} + +// killRandomPod fetches all owned pods and tries to kill one. +func (m Monkey) killRandomPod() error { + pods, err := m.context.GetOwnedPods() + if err != nil { + return maskAny(err) + } + if len(pods) <= 1 { + // Not enough pods + return nil + } + p := pods[rand.Intn(len(pods))] + m.log.Info().Str("pod-name", p.GetName()).Msg("Killing pod") + if err := m.context.DeletePod(p.GetName()); err != nil { + return maskAny(err) + } + return nil +} diff --git a/pkg/deployment/deployment.go b/pkg/deployment/deployment.go index f4104e305..57a007bfa 100644 --- a/pkg/deployment/deployment.go +++ b/pkg/deployment/deployment.go @@ -35,6 +35,7 @@ import ( corev1 "k8s.io/client-go/kubernetes/typed/core/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/deployment/chaos" "github.com/arangodb/kube-arangodb/pkg/deployment/reconcile" "github.com/arangodb/kube-arangodb/pkg/deployment/resources" "github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned" @@ -46,6 +47,7 @@ import ( // Config holds configuration settings for a Deployment type Config struct { ServiceAccount string + AllowChaos bool } // Dependencies holds dependent services for a Deployment @@ -93,6 +95,7 @@ type Deployment struct { clusterScalingIntegration *clusterScalingIntegration reconciler *reconcile.Reconciler resources *resources.Resources + chaosMonkey *chaos.Monkey } // New creates a new Deployment from the given API 
object. @@ -127,6 +130,10 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De d.clusterScalingIntegration = ci go ci.ListenForClusterEvents(d.stopCh) } + if config.AllowChaos { + d.chaosMonkey = chaos.NewMonkey(deps.Log, d) + go d.chaosMonkey.Run(d.stopCh) // Run loops until stopCh is closed, so it must not block New + } return d, nil } diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 13d21b504..2ba655823 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -67,7 +67,7 @@ type Config struct { ServiceAccount string EnableDeployment bool EnableStorage bool - CreateCRD bool + AllowChaos bool } type Dependencies struct { diff --git a/pkg/operator/operator_deployment.go b/pkg/operator/operator_deployment.go index f19b92560..967743899 100644 --- a/pkg/operator/operator_deployment.go +++ b/pkg/operator/operator_deployment.go @@ -193,6 +193,7 @@ func (o *Operator) handleDeploymentEvent(event *Event) error { func (o *Operator) makeDeploymentConfigAndDeps(apiObject *api.ArangoDeployment) (deployment.Config, deployment.Dependencies) { cfg := deployment.Config{ ServiceAccount: o.Config.ServiceAccount, + AllowChaos: o.Config.AllowChaos, } deps := deployment.Dependencies{ Log: o.Dependencies.LogService.MustGetLogger("deployment").With(). 
diff --git a/tools/manifests/manifest_builder.go b/tools/manifests/manifest_builder.go index d7951d59c..06c40f010 100644 --- a/tools/manifests/manifest_builder.go +++ b/tools/manifests/manifest_builder.go @@ -48,6 +48,7 @@ var ( DeploymentOperatorName string StorageOperatorName string RBAC bool + AllowChaos bool } deploymentTemplateNames = []string{ "rbac.yaml", @@ -72,6 +73,7 @@ func init() { pflag.StringVar(&options.DeploymentOperatorName, "deployment-operator-name", "arango-deployment-operator", "Name of the ArangoDeployment operator deployment") pflag.StringVar(&options.StorageOperatorName, "storage-operator-name", "arango-storage-operator", "Name of the ArangoLocalStorage operator deployment") pflag.BoolVar(&options.RBAC, "rbac", true, "Use role based access control") + pflag.BoolVar(&options.AllowChaos, "allow-chaos", false, "If set, allows chaos in deployments") pflag.Parse() } @@ -96,6 +98,7 @@ type ResourceOptions struct { User CommonOptions Operator CommonOptions OperatorDeploymentName string + AllowChaos bool } func main() { @@ -149,6 +152,7 @@ func main() { ServiceAccountName: "default", }, OperatorDeploymentName: "arango-deployment-operator", + AllowChaos: options.AllowChaos, }, Storage: ResourceOptions{ User: CommonOptions{ diff --git a/tools/release/release.go b/tools/release/release.go index e51b4b484..368e91960 100644 --- a/tools/release/release.go +++ b/tools/release/release.go @@ -58,6 +58,7 @@ func main() { version := bumpVersion(releaseType) make("clean", nil) make("all", map[string]string{ + "ALLOWCHAOS": "false", "DOCKERNAMESPACE": "arangodb", "IMAGETAG": version, "MANIFESTSUFFIX": "-",