1
0
Fork 0
mirror of https://github.com/TwiN/gatus.git synced 2024-12-14 11:58:04 +00:00

Support sending notifications when alert is resolved

Add debug parameter for those wishing to filter some noise from the logs
This commit is contained in:
TwinProduction 2020-09-04 21:31:28 -04:00
parent 8a0a2ef51f
commit 139e186ac2
6 changed files with 175 additions and 76 deletions

BIN
.github/assets/slack-alerts.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

View file

@ -67,38 +67,40 @@ This example would look like this:
![Simple example](.github/assets/example.png)
Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`)
Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`)
### Configuration
| Parameter | Description | Default |
| --------------------------------- | --------------------------------------------------------------- | -------------- |
| `metrics` | Whether to expose metrics at /metrics | `false` |
| `services` | List of services to monitor | Required `[]` |
| `services[].name` | Name of the service. Can be anything. | Required `""` |
| `services[].url` | URL to send the request to | Required `""` |
| `services[].conditions` | Conditions used to determine the health of the service | `[]` |
| `services[].interval` | Duration to wait between every status check | `60s` |
| `services[].method` | Request method | `GET` |
| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
| `services[].body` | Request body | `""` |
| `services[].headers` | Request headers | `{}` |
| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
| `services[].alerts[].enabled` | Whether to enable the alert | `false` |
| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
| `alerting` | Configuration for alerting | `{}` |
| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
| `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
| `alerting.twilio.sid` | Twilio account SID | Required `""` |
| `alerting.twilio.token` | Twilio auth token | Required `""` |
| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` |
| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
| `alerting.custom.url` | Custom alerting request url | `""` |
| `alerting.custom.body` | Custom alerting request body. | `""` |
| `alerting.custom.headers` | Custom alerting request headers | `{}` |
| Parameter | Description | Default |
| -------------------------------------- | --------------------------------------------------------------- | -------------- |
| `debug` | Whether to enable debug logs | `false` |
| `metrics` | Whether to expose metrics at /metrics | `false` |
| `services` | List of services to monitor | Required `[]` |
| `services[].name` | Name of the service. Can be anything. | Required `""` |
| `services[].url` | URL to send the request to | Required `""` |
| `services[].conditions` | Conditions used to determine the health of the service | `[]` |
| `services[].interval` | Duration to wait between every status check | `60s` |
| `services[].method` | Request method | `GET` |
| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
| `services[].body` | Request body | `""` |
| `services[].headers` | Request headers | `{}` |
| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
| `services[].alerts[].enabled` | Whether to enable the alert | `false` |
| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` |
| `alerting` | Configuration for alerting | `{}` |
| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
| `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
| `alerting.twilio.sid` | Twilio account SID | Required `""` |
| `alerting.twilio.token` | Twilio auth token | Required `""` |
| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
| `alerting.twilio.to` | Number to send Twilio alerts to | Required `""` |
| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
| `alerting.custom.url` | Custom alerting request url | `""` |
| `alerting.custom.body` | Custom alerting request body. | `""` |
| `alerting.custom.headers` | Custom alerting request headers | `{}` |
### Conditions
@ -121,7 +123,7 @@ Here are some examples of conditions you can use:
## Docker
Building the Docker image is done as following:
Building the Docker image is done as follows:
```
docker build . -t gatus
@ -194,33 +196,37 @@ services:
- type: slack
enabled: true
description: "healthcheck failed 3 times in a row"
send-on-resolved: true
- type: slack
enabled: true
threshold: 5
description: "healthcheck failed 5 times in a row"
send-on-resolved: true
conditions:
- "[STATUS] == 200"
- "[BODY].status == UP"
- "[RESPONSE_TIME] < 300"
```
Here's an example of what the notifications look like:
![Slack notifications](.github/assets/slack-alerts.png)
### Configuring Twilio alerts
```yaml
alerting:
twilio:
sid: ****
token: ****
from: +1-234-567-8901
to: +1-234-567-8901
sid: "..."
token: "..."
from: "+1-234-567-8901"
to: "+1-234-567-8901"
services:
- name: twinnation
interval: 30s
url: "https://twinnation.org/health"
alerts:
- type: twilio
enabled: true
description: "healthcheck failed 3 times in a row"
- type: twilio
enabled: true
threshold: 5

View file

@ -22,6 +22,7 @@ var (
// Config is the structure the YAML configuration file is decoded into.
type Config struct {
// Metrics maps to the `metrics` key: whether to expose metrics at /metrics.
Metrics bool `yaml:"metrics"`
// Debug maps to the `debug` key: whether to enable debug logs.
Debug bool `yaml:"debug"`
// Alerting maps to the `alerting` key: configuration for alerting (Slack, Twilio, custom).
Alerting *core.AlertingConfig `yaml:"alerting"`
// Services maps to the `services` key: list of services to monitor.
Services []*core.Service `yaml:"services"`
}

View file

@ -2,9 +2,11 @@ package core
import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"

	"github.com/TwinProduction/gatus/client"
)
@ -70,3 +72,64 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string)
}
return nil
}
// CreateSlackCustomAlertProvider builds the CustomAlertProvider used to POST a Slack notification for alert.
//
// slackWebHookUrl is used as the request URL. When resolved is true, the message states that the alert for
// service has been resolved (including service.NumberOfFailuresInARow) and the attachment color is green;
// otherwise the message states that the alert has been triggered and the attachment color is red.
// Every entry of result.ConditionResults is listed in the "Condition results" field, one per line, prefixed
// with an emoji that reflects whether the condition succeeded.
//
// All interpolated values are JSON-escaped, so the returned body is a valid JSON document even when the
// service name, alert description or conditions contain quotes, backslashes or line breaks.
func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider {
	var message, color string
	if resolved {
		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow)
		color = "#36A64F"
	} else {
		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name)
		color = "#DD0000"
	}
	var results strings.Builder
	for _, conditionResult := range result.ConditionResults {
		prefix := ":x:"
		if conditionResult.Success {
			prefix = ":heavy_check_mark:"
		}
		results.WriteString(prefix)
		results.WriteString(" - `")
		results.WriteString(conditionResult.Condition)
		// The line break is a real newline here; escapeJSONString turns it into the \n escape sequence.
		results.WriteString("`\n")
	}
	// NOTE: the template must remain strict JSON (no trailing commas, no raw control characters in strings).
	body := fmt.Sprintf(`{
  "text": "",
  "attachments": [
    {
      "title": ":helmet_with_white_cross: Gatus",
      "text": "%s:\n> %s",
      "short": false,
      "color": "%s",
      "fields": [
        {
          "title": "Condition results",
          "value": "%s",
          "short": false
        }
      ]
    }
  ]
}`, escapeJSONString(message), escapeJSONString(alert.Description), color, escapeJSONString(results.String()))
	return &CustomAlertProvider{
		Url:     slackWebHookUrl,
		Method:  "POST",
		Body:    body,
		Headers: map[string]string{"Content-Type": "application/json"},
	}
}

// escapeJSONString returns s escaped so that it can be placed between double quotes in a JSON document.
func escapeJSONString(s string) string {
	encoded, err := json.Marshal(s)
	if err != nil || len(encoded) < 2 {
		// Not expected for string values; fall back to the raw value rather than dropping the content.
		return s
	}
	// Strip the surrounding double quotes added by json.Marshal.
	return string(encoded[1 : len(encoded)-1])
}
// CreateTwilioCustomAlertProvider builds the CustomAlertProvider that POSTs message to the Messages endpoint
// of the Twilio account identified by provider.SID.
//
// The body is form-encoded with the To, From and Body fields, and the request carries a basic
// Authorization header derived from provider.SID and provider.Token.
func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider {
	credentials := []byte(provider.SID + ":" + provider.Token)

	form := url.Values{}
	form.Set("To", provider.To)
	form.Set("From", provider.From)
	form.Set("Body", message)

	headers := map[string]string{
		"Content-Type":  "application/x-www-form-urlencoded",
		"Authorization": "Basic " + base64.StdEncoding.EncodeToString(credentials),
	}

	return &CustomAlertProvider{
		Url:     fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID),
		Method:  "POST",
		Body:    form.Encode(),
		Headers: headers,
	}
}

View file

@ -3,7 +3,6 @@ package main
import (
"bytes"
"compress/gzip"
"encoding/json"
"github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/watchdog"
"github.com/prometheus/client_golang/prometheus/promhttp"
@ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) {
if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired {
buffer := &bytes.Buffer{}
gzipWriter := gzip.NewWriter(buffer)
serviceResults := watchdog.GetServiceResults()
data, err := json.Marshal(serviceResults)
data, err := watchdog.GetJsonEncodedServiceResults()
if err != nil {
log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error())
log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error())
writer.WriteHeader(http.StatusInternalServerError)
_, _ = writer.Write([]byte("Unable to marshall object to JSON"))
_, _ = writer.Write([]byte("Unable to marshal object to JSON"))
return
}
gzipWriter.Write(data)

View file

@ -1,25 +1,34 @@
package watchdog
import (
"encoding/base64"
"encoding/json"
"fmt"
"github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/core"
"github.com/TwinProduction/gatus/metric"
"log"
"net/url"
"sync"
"time"
)
var (
serviceResults = make(map[string][]*core.Result)
rwLock sync.RWMutex
// serviceResultsMutex is used to prevent concurrent map access
serviceResultsMutex sync.RWMutex
// monitoringMutex is used to prevent multiple services from being evaluated at the same time.
// Without this, conditions using response time may become inaccurate.
monitoringMutex sync.Mutex
)
// GetServiceResults returns a list of the last 20 results for each services
func GetServiceResults() *map[string][]*core.Result {
return &serviceResults
// GetJsonEncodedServiceResults returns the JSON encoding (as produced by json.Marshal) of the results
// stored for each service.
//
// The encoding happens here, while the read lock on serviceResultsMutex is held, so that the map is never
// read while another goroutine is writing to it.
func GetJsonEncodedServiceResults() ([]byte, error) {
	serviceResultsMutex.RLock()
	defer serviceResultsMutex.RUnlock()
	return json.Marshal(serviceResults)
}
// Monitor loops over each service and starts a goroutine to monitor each service separately
@ -33,33 +42,39 @@ func Monitor(cfg *config.Config) {
// monitor monitors a single service in a loop
func monitor(service *core.Service) {
cfg := config.Get()
for {
// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which
// could cause performance issues and return inaccurate results
rwLock.Lock()
log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name)
monitoringMutex.Lock()
if cfg.Debug {
log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name)
}
result := service.EvaluateConditions()
metric.PublishMetricsForService(service, result)
serviceResultsMutex.Lock()
serviceResults[service.Name] = append(serviceResults[service.Name], result)
if len(serviceResults[service.Name]) > 20 {
serviceResults[service.Name] = serviceResults[service.Name][1:]
}
rwLock.Unlock()
serviceResultsMutex.Unlock()
var extra string
if !result.Success {
extra = fmt.Sprintf("responseBody=%s", result.Body)
}
log.Printf(
"[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s",
"[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s",
service.Name,
result.Success,
len(result.Errors),
result.Duration.Round(time.Millisecond),
extra,
)
handleAlerting(service, result)
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name)
if cfg.Debug {
log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
}
monitoringMutex.Unlock()
time.Sleep(service.Interval)
}
}
@ -72,10 +87,43 @@ func handleAlerting(service *core.Service, result *core.Result) {
if result.Success {
if service.NumberOfFailuresInARow > 0 {
for _, alert := range service.Alerts {
if !alert.Enabled || !alert.SendOnResolved || alert.Threshold < service.NumberOfFailuresInARow {
if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow {
continue
}
// TODO
var alertProvider *core.CustomAlertProvider
if alert.Type == core.SlackAlert {
if len(cfg.Alerting.Slack) > 0 {
log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been resolved", alert.Description)
alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
} else {
log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
}
} else if alert.Type == core.TwilioAlert {
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description))
} else {
log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio isn't configured properly'")
}
} else if alert.Type == core.CustomAlert {
if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alert.Description)
alertProvider = &core.CustomAlertProvider{
Url: cfg.Alerting.Custom.Url,
Method: cfg.Alerting.Custom.Method,
Body: cfg.Alerting.Custom.Body,
Headers: cfg.Alerting.Custom.Headers,
}
} else {
log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured")
}
}
if alertProvider != nil {
err := alertProvider.Send(service.Name, alert.Description)
if err != nil {
log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error())
}
}
}
}
service.NumberOfFailuresInARow = 0
@ -90,33 +138,16 @@ func handleAlerting(service *core.Service, result *core.Result) {
if alert.Type == core.SlackAlert {
if len(cfg.Alerting.Slack) > 0 {
log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alert.Description)
alertProvider = &core.CustomAlertProvider{
Url: cfg.Alerting.Slack,
Method: "POST",
Body: fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alert.Description),
Headers: map[string]string{"Content-Type": "application/json"},
}
alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
} else {
log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
}
} else if alert.Type == core.TwilioAlert {
if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
alertProvider = &core.CustomAlertProvider{
Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID),
Method: "POST",
Body: url.Values{
"To": {cfg.Alerting.Twilio.To},
"From": {cfg.Alerting.Twilio.From},
"Body": {fmt.Sprintf("%s - %s", service.Name, alert.Description)},
}.Encode(),
Headers: map[string]string{
"Content-Type": "application/x-www-form-urlencoded",
"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))),
},
}
alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description))
} else {
log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing")
log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
}
} else if alert.Type == core.CustomAlert {
if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {