mirror of
https://github.com/TwiN/gatus.git
synced 2024-12-14 11:58:04 +00:00
Close #74: Add maintenance window
This commit is contained in:
parent
dc173b29bc
commit
fa4736c672
6 changed files with 399 additions and 8 deletions
32
README.md
32
README.md
|
@ -47,6 +47,7 @@ For more details, see [Usage](#usage)
|
|||
- [Configuring Twilio alerts](#configuring-twilio-alerts)
|
||||
- [Configuring custom alerts](#configuring-custom-alerts)
|
||||
- [Setting a default alert](#setting-a-default-alert)
|
||||
- [Maintenance](#maintenance)
|
||||
- [Deployment](#deployment)
|
||||
- [Docker](#docker)
|
||||
- [Helm Chart](#helm-chart)
|
||||
|
@ -736,6 +737,37 @@ services:
|
|||
- type: pagerduty
|
||||
```
|
||||
|
||||
### Maintenance
|
||||
If you have maintenance windows, you may not want to be annoyed by alerts.
|
||||
To do that, you'll have to use the maintenance configuration:
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|:----------------------- |:----------------------------------------------------------------------------- |:--------------- |
|
||||
| `maintenance.enabled` | Whether the maintenance period is enabled | `true` |
|
||||
| `maintenance.start` | Time at which the maintenance window starts in `hh:mm` format (e.g. `23:00`) | Required `""` |
|
||||
| `maintenance.duration` | Duration of the maintenance window (e.g. `1h`, `30m`) | Required `""` |
|
||||
| `maintenance.every` | Days on which the maintenance period applies (e.g. `[Monday, Thursday]`).<br />If left empty, the maintenance window applies every day | `[]` |
|
||||
**Note that the maintenance configuration uses UTC.**
|
||||
|
||||
|
||||
Here's an example:
|
||||
```yaml
|
||||
maintenance:
|
||||
start: 23:00
|
||||
duration: 1h
|
||||
every: [Monday, Thursday]
|
||||
```
|
||||
Note that you can also specify each day on separate lines:
|
||||
```yaml
|
||||
maintenance:
|
||||
start: 23:00
|
||||
duration: 1h
|
||||
every:
|
||||
- Monday
|
||||
- Thursday
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Deployment
|
||||
Many examples can be found in the [examples](examples) folder, but this section will focus on the most popular ways of deploying Gatus.
|
||||
|
|
|
@ -10,6 +10,7 @@ import (
|
|||
"github.com/TwinProduction/gatus/alerting"
|
||||
"github.com/TwinProduction/gatus/alerting/alert"
|
||||
"github.com/TwinProduction/gatus/alerting/provider"
|
||||
"github.com/TwinProduction/gatus/config/maintenance"
|
||||
"github.com/TwinProduction/gatus/core"
|
||||
"github.com/TwinProduction/gatus/security"
|
||||
"github.com/TwinProduction/gatus/storage"
|
||||
|
@ -82,6 +83,9 @@ type Config struct {
|
|||
// UI is the configuration for the UI
|
||||
UI *UIConfig `yaml:"ui"`
|
||||
|
||||
// Maintenance is the configuration for creating a maintenance window in which no alerts are sent
|
||||
Maintenance *maintenance.Config `yaml:"maintenance"`
|
||||
|
||||
filePath string // path to the file from which config was loaded from
|
||||
lastFileModTime time.Time // last modification time
|
||||
}
|
||||
|
@ -172,6 +176,9 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) {
|
|||
if err := validateUIConfig(config); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := validateMaintenanceConfig(config); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := validateStorageConfig(config); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -201,6 +208,17 @@ func validateStorageConfig(config *Config) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func validateMaintenanceConfig(config *Config) error {
|
||||
if config.Maintenance == nil {
|
||||
config.Maintenance = maintenance.GetDefaultConfig()
|
||||
} else {
|
||||
if err := config.Maintenance.ValidateAndSetDefaults(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateUIConfig(config *Config) error {
|
||||
if config.UI == nil {
|
||||
config.UI = GetDefaultUIConfig()
|
||||
|
|
|
@ -43,6 +43,11 @@ func TestParseAndValidateConfigBytes(t *testing.T) {
|
|||
config, err := parseAndValidateConfigBytes([]byte(fmt.Sprintf(`
|
||||
storage:
|
||||
file: %s
|
||||
maintenance:
|
||||
enabled: true
|
||||
start: 00:00
|
||||
duration: 4h
|
||||
every: [Monday, Thursday]
|
||||
ui:
|
||||
title: Test
|
||||
services:
|
||||
|
@ -79,6 +84,9 @@ services:
|
|||
if config.UI == nil || config.UI.Title != "Test" {
|
||||
t.Error("Expected Config.UI.Title to be Test")
|
||||
}
|
||||
if mc := config.Maintenance; mc == nil || mc.Start != "00:00" || !mc.IsEnabled() || mc.Duration != 4*time.Hour || len(mc.Every) != 2 {
|
||||
t.Error("Expected Config.Maintenance to be configured properly")
|
||||
}
|
||||
if len(config.Services) != 3 {
|
||||
t.Error("Should have returned two services")
|
||||
}
|
||||
|
|
133
config/maintenance/maintenance.go
Normal file
133
config/maintenance/maintenance.go
Normal file
|
@ -0,0 +1,133 @@
|
|||
package maintenance
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
errInvalidMaintenanceStartFormat = errors.New("invalid maintenance start format: must be hh:mm, between 00:00 and 23:59 inclusively (e.g. 23:00)")
|
||||
errInvalidMaintenanceDuration = errors.New("invalid maintenance duration: must be bigger than 0 (e.g. 30m)")
|
||||
errInvalidDayName = fmt.Errorf("invalid value specified for 'on'. supported values are %s", longDayNames)
|
||||
|
||||
longDayNames = []string{
|
||||
"Sunday",
|
||||
"Monday",
|
||||
"Tuesday",
|
||||
"Wednesday",
|
||||
"Thursday",
|
||||
"Friday",
|
||||
"Saturday",
|
||||
}
|
||||
)
|
||||
|
||||
// Config allows for the configuration of a maintenance period.
|
||||
// During this maintenance period, no alerts will be sent.
|
||||
//
|
||||
// Uses UTC.
|
||||
type Config struct {
|
||||
Enabled *bool `yaml:"enabled"` // Whether the maintenance period is enabled. Enabled by default if nil.
|
||||
Start string `yaml:"start"` // Time at which the maintenance period starts (e.g. 23:00)
|
||||
Duration time.Duration `yaml:"duration"` // Duration of the maintenance period (e.g. 4h)
|
||||
|
||||
// Every is a list of days of the week during which maintenance period applies.
|
||||
// See longDayNames for list of valid values.
|
||||
// Every day if empty.
|
||||
Every []string `yaml:"every"`
|
||||
|
||||
durationToStartFromMidnight time.Duration
|
||||
timeLocation *time.Location
|
||||
}
|
||||
|
||||
func GetDefaultConfig() *Config {
|
||||
defaultValue := false
|
||||
return &Config{
|
||||
Enabled: &defaultValue,
|
||||
}
|
||||
}
|
||||
|
||||
// IsEnabled returns whether maintenance is enabled or not
|
||||
func (c Config) IsEnabled() bool {
|
||||
if c.Enabled == nil {
|
||||
return true
|
||||
}
|
||||
return *c.Enabled
|
||||
}
|
||||
|
||||
// ValidateAndSetDefaults validates the maintenance configuration and sets the default values if necessary.
|
||||
//
|
||||
// Must be called once in the application's lifecycle before IsUnderMaintenance is called, since it
|
||||
// also sets durationToStartFromMidnight.
|
||||
func (c *Config) ValidateAndSetDefaults() error {
|
||||
if c == nil || !c.IsEnabled() {
|
||||
// Don't waste time validating if maintenance is not enabled.
|
||||
return nil
|
||||
}
|
||||
for _, day := range c.Every {
|
||||
isDayValid := false
|
||||
for _, longDayName := range longDayNames {
|
||||
if day == longDayName {
|
||||
isDayValid = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !isDayValid {
|
||||
return errInvalidDayName
|
||||
}
|
||||
}
|
||||
var err error
|
||||
c.durationToStartFromMidnight, err = hhmmToDuration(c.Start)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if c.Duration <= 0 || c.Duration >= 24*time.Hour {
|
||||
return errInvalidMaintenanceDuration
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsUnderMaintenance checks whether the services that Gatus monitors are within the configured maintenance window
|
||||
func (c Config) IsUnderMaintenance() bool {
|
||||
if !c.IsEnabled() {
|
||||
return false
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
dayWhereMaintenancePeriodWouldStart := now.Add(-c.Duration).Truncate(24 * time.Hour)
|
||||
hasMaintenanceEveryDay := len(c.Every) == 0
|
||||
hasMaintenancePeriodScheduledForThatWeekday := sort.SearchStrings(c.Every, dayWhereMaintenancePeriodWouldStart.Weekday().String()) != len(c.Every)
|
||||
if !hasMaintenanceEveryDay && !hasMaintenancePeriodScheduledForThatWeekday {
|
||||
// The day when the maintenance period would start is not scheduled
|
||||
// to have any maintenance, so we can just return false.
|
||||
return false
|
||||
}
|
||||
startOfMaintenancePeriod := dayWhereMaintenancePeriodWouldStart.Add(c.durationToStartFromMidnight)
|
||||
endOfMaintenancePeriod := startOfMaintenancePeriod.Add(c.Duration)
|
||||
return now.After(startOfMaintenancePeriod) && now.Before(endOfMaintenancePeriod)
|
||||
}
|
||||
|
||||
func hhmmToDuration(s string) (time.Duration, error) {
|
||||
if len(s) != 5 {
|
||||
return 0, errInvalidMaintenanceStartFormat
|
||||
}
|
||||
var hours, minutes int
|
||||
var err error
|
||||
if hours, err = extractNumericalValueFromPotentiallyZeroPaddedString(s[:2]); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if minutes, err = extractNumericalValueFromPotentiallyZeroPaddedString(s[3:5]); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
duration := (time.Duration(hours) * time.Hour) + (time.Duration(minutes) * time.Minute)
|
||||
if hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || duration < 0 || duration >= 24*time.Hour {
|
||||
return 0, errInvalidMaintenanceStartFormat
|
||||
}
|
||||
return duration, nil
|
||||
}
|
||||
|
||||
func extractNumericalValueFromPotentiallyZeroPaddedString(s string) (int, error) {
|
||||
return strconv.Atoi(strings.TrimPrefix(s, "0"))
|
||||
}
|
193
config/maintenance/maintenance_test.go
Normal file
193
config/maintenance/maintenance_test.go
Normal file
|
@ -0,0 +1,193 @@
|
|||
package maintenance
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestGetDefaultConfig(t *testing.T) {
|
||||
if *GetDefaultConfig().Enabled {
|
||||
t.Fatal("expected default config to be disabled by default")
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_Validate(t *testing.T) {
|
||||
yes, no := true, false
|
||||
scenarios := []struct {
|
||||
name string
|
||||
cfg *Config
|
||||
expectedError error
|
||||
}{
|
||||
{
|
||||
name: "nil",
|
||||
cfg: nil,
|
||||
expectedError: nil,
|
||||
},
|
||||
{
|
||||
name: "disabled",
|
||||
cfg: &Config{
|
||||
Enabled: &no,
|
||||
},
|
||||
expectedError: nil,
|
||||
},
|
||||
{
|
||||
name: "invalid-day",
|
||||
cfg: &Config{
|
||||
Every: []string{"invalid-day"},
|
||||
},
|
||||
expectedError: errInvalidDayName,
|
||||
},
|
||||
{
|
||||
name: "invalid-day",
|
||||
cfg: &Config{
|
||||
Every: []string{"invalid-day"},
|
||||
},
|
||||
expectedError: errInvalidDayName,
|
||||
},
|
||||
{
|
||||
name: "invalid-start-format",
|
||||
cfg: &Config{
|
||||
Start: "0000",
|
||||
},
|
||||
expectedError: errInvalidMaintenanceStartFormat,
|
||||
},
|
||||
{
|
||||
name: "invalid-start-hours",
|
||||
cfg: &Config{
|
||||
Start: "25:00",
|
||||
},
|
||||
expectedError: errInvalidMaintenanceStartFormat,
|
||||
},
|
||||
{
|
||||
name: "invalid-start-minutes",
|
||||
cfg: &Config{
|
||||
Start: "0:61",
|
||||
},
|
||||
expectedError: errInvalidMaintenanceStartFormat,
|
||||
},
|
||||
{
|
||||
name: "invalid-start-minutes-non-numerical",
|
||||
cfg: &Config{
|
||||
Start: "00:zz",
|
||||
},
|
||||
expectedError: strconv.ErrSyntax,
|
||||
},
|
||||
{
|
||||
name: "invalid-start-hours-non-numerical",
|
||||
cfg: &Config{
|
||||
Start: "zz:00",
|
||||
},
|
||||
expectedError: strconv.ErrSyntax,
|
||||
},
|
||||
{
|
||||
name: "invalid-duration",
|
||||
cfg: &Config{
|
||||
Start: "23:00",
|
||||
Duration: 0,
|
||||
},
|
||||
expectedError: errInvalidMaintenanceDuration,
|
||||
},
|
||||
{
|
||||
name: "every-day-at-2300",
|
||||
cfg: &Config{
|
||||
Start: "23:00",
|
||||
Duration: time.Hour,
|
||||
},
|
||||
expectedError: nil,
|
||||
},
|
||||
{
|
||||
name: "every-monday-at-0000",
|
||||
cfg: &Config{
|
||||
Start: "00:00",
|
||||
Duration: 30 * time.Minute,
|
||||
Every: []string{"Monday"},
|
||||
},
|
||||
expectedError: nil,
|
||||
},
|
||||
{
|
||||
name: "every-friday-and-sunday-at-0000-explicitly-enabled",
|
||||
cfg: &Config{
|
||||
Enabled: &yes,
|
||||
Start: "08:00",
|
||||
Duration: 8 * time.Hour,
|
||||
Every: []string{"Friday", "Sunday"},
|
||||
},
|
||||
expectedError: nil,
|
||||
},
|
||||
}
|
||||
for _, scenario := range scenarios {
|
||||
t.Run(scenario.name, func(t *testing.T) {
|
||||
err := scenario.cfg.ValidateAndSetDefaults()
|
||||
if !errors.Is(err, scenario.expectedError) {
|
||||
t.Errorf("expected %v, got %v", scenario.expectedError, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfig_IsUnderMaintenance(t *testing.T) {
|
||||
yes, no := true, false
|
||||
now := time.Now().UTC()
|
||||
scenarios := []struct {
|
||||
name string
|
||||
cfg *Config
|
||||
expected bool
|
||||
}{
|
||||
{
|
||||
name: "disabled",
|
||||
cfg: &Config{
|
||||
Enabled: &no,
|
||||
},
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "under-maintenance-explicitly-enabled",
|
||||
cfg: &Config{
|
||||
Enabled: &yes,
|
||||
Start: fmt.Sprintf("%02d:00", now.Hour()),
|
||||
Duration: 2 * time.Hour,
|
||||
},
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "under-maintenance",
|
||||
cfg: &Config{
|
||||
Start: fmt.Sprintf("%02d:00", now.Hour()),
|
||||
Duration: 2 * time.Hour,
|
||||
},
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "not-under-maintenance",
|
||||
cfg: &Config{
|
||||
Start: fmt.Sprintf("%02d:00", now.Add(-5*time.Hour).Hour()),
|
||||
Duration: time.Hour,
|
||||
},
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "not-under-maintenance-today",
|
||||
cfg: &Config{
|
||||
Start: fmt.Sprintf("%02d:00", now.Hour()),
|
||||
Duration: time.Hour,
|
||||
Every: []string{now.Add(48 * time.Hour).Weekday().String()},
|
||||
},
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
for _, scenario := range scenarios {
|
||||
t.Run(scenario.name, func(t *testing.T) {
|
||||
if scenario.cfg.ValidateAndSetDefaults() != nil {
|
||||
t.Fatal("validation shouldn't have returned an error")
|
||||
}
|
||||
isUnderMaintenance := scenario.cfg.IsUnderMaintenance()
|
||||
if isUnderMaintenance != scenario.expected {
|
||||
t.Errorf("expected %v, got %v", scenario.expected, isUnderMaintenance)
|
||||
t.Logf("start=%v; duration=%v; now=%v", scenario.cfg.Start, scenario.cfg.Duration, time.Now().UTC())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -8,6 +8,7 @@ import (
|
|||
|
||||
"github.com/TwinProduction/gatus/alerting"
|
||||
"github.com/TwinProduction/gatus/config"
|
||||
"github.com/TwinProduction/gatus/config/maintenance"
|
||||
"github.com/TwinProduction/gatus/core"
|
||||
"github.com/TwinProduction/gatus/metric"
|
||||
"github.com/TwinProduction/gatus/storage"
|
||||
|
@ -27,17 +28,17 @@ func Monitor(cfg *config.Config) {
|
|||
ctx, cancelFunc = context.WithCancel(context.Background())
|
||||
for _, service := range cfg.Services {
|
||||
if service.IsEnabled() {
|
||||
// To prevent multiple requests from running at the same time, we'll wait for a little bit before each iteration
|
||||
// To prevent multiple requests from running at the same time, we'll wait for a little before each iteration
|
||||
time.Sleep(1111 * time.Millisecond)
|
||||
go monitor(service, cfg.Alerting, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx)
|
||||
go monitor(service, cfg.Alerting, cfg.Maintenance, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// monitor monitors a single service in a loop
|
||||
func monitor(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) {
|
||||
func monitor(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) {
|
||||
// Run it immediately on start
|
||||
execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug)
|
||||
execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug)
|
||||
// Loop for the next executions
|
||||
for {
|
||||
select {
|
||||
|
@ -45,12 +46,12 @@ func monitor(service *core.Service, alertingConfig *alerting.Config, disableMoni
|
|||
log.Printf("[watchdog][monitor] Canceling current execution of group=%s; service=%s", service.Group, service.Name)
|
||||
return
|
||||
case <-time.After(service.Interval):
|
||||
execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug)
|
||||
execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func execute(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool) {
|
||||
func execute(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool) {
|
||||
if !disableMonitoringLock {
|
||||
// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which
|
||||
// could cause performance issues and return inaccurate results
|
||||
|
@ -72,7 +73,11 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni
|
|||
len(result.Errors),
|
||||
result.Duration.Round(time.Millisecond),
|
||||
)
|
||||
HandleAlerting(service, result, alertingConfig, debug)
|
||||
if !maintenanceConfig.IsUnderMaintenance() {
|
||||
HandleAlerting(service, result, alertingConfig, debug)
|
||||
} else if debug {
|
||||
log.Println("[watchdog][execute] Not handling alerting because currently in the maintenance window")
|
||||
}
|
||||
if debug {
|
||||
log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s service=%s again", service.Interval, service.Group, service.Name)
|
||||
}
|
||||
|
@ -83,7 +88,9 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni
|
|||
|
||||
// UpdateServiceStatuses updates the slice of service statuses
|
||||
func UpdateServiceStatuses(service *core.Service, result *core.Result) {
|
||||
storage.Get().Insert(service, result)
|
||||
if err := storage.Get().Insert(service, result); err != nil {
|
||||
log.Println("[watchdog][UpdateServiceStatuses] Failed to insert data in storage:", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown stops monitoring all services
|
||||
|
|
Loading…
Reference in a new issue