Support sending notifications when alert is resolved
Add debug parameter for those wishing to filter some noise from the logs
This commit is contained in:
parent 8a0a2ef51f
commit 139e186ac2

6 changed files with 175 additions and 76 deletions
BIN .github/assets/slack-alerts.png (vendored, new file, 34 KiB; binary file not shown)
README.md (78 changed lines)
@@ -67,38 +67,40 @@ This example would look like this:
 
 ![Simple example](.github/assets/example.png)
 
-Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`)
+Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`)
 
 
 ### Configuration
 
-| Parameter | Description | Default |
-| --------------------------------- | --------------------------------------------------------------- | -------------- |
-| `metrics` | Whether to expose metrics at /metrics | `false` |
-| `services` | List of services to monitor | Required `[]` |
-| `services[].name` | Name of the service. Can be anything. | Required `""` |
-| `services[].url` | URL to send the request to | Required `""` |
-| `services[].conditions` | Conditions used to determine the health of the service | `[]` |
-| `services[].interval` | Duration to wait between every status check | `60s` |
-| `services[].method` | Request method | `GET` |
-| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
-| `services[].body` | Request body | `""` |
-| `services[].headers` | Request headers | `{}` |
-| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
-| `services[].alerts[].enabled` | Whether to enable the alert | `false` |
-| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
-| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
-| `alerting` | Configuration for alerting | `{}` |
-| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
-| `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
-| `alerting.twilio.sid` | Twilio account SID | Required `""` |
-| `alerting.twilio.token` | Twilio auth token | Required `""` |
-| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
-| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` |
-| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
-| `alerting.custom.url` | Custom alerting request url | `""` |
-| `alerting.custom.body` | Custom alerting request body. | `""` |
-| `alerting.custom.headers` | Custom alerting request headers | `{}` |
+| Parameter | Description | Default |
+| -------------------------------------- | --------------------------------------------------------------- | -------------- |
+| `debug` | Whether to enable debug logs | `false` |
+| `metrics` | Whether to expose metrics at /metrics | `false` |
+| `services` | List of services to monitor | Required `[]` |
+| `services[].name` | Name of the service. Can be anything. | Required `""` |
+| `services[].url` | URL to send the request to | Required `""` |
+| `services[].conditions` | Conditions used to determine the health of the service | `[]` |
+| `services[].interval` | Duration to wait between every status check | `60s` |
+| `services[].method` | Request method | `GET` |
+| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
+| `services[].body` | Request body | `""` |
+| `services[].headers` | Request headers | `{}` |
+| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
+| `services[].alerts[].enabled` | Whether to enable the alert | `false` |
+| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
+| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
+| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` |
+| `alerting` | Configuration for alerting | `{}` |
+| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
+| `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
+| `alerting.twilio.sid` | Twilio account SID | Required `""` |
+| `alerting.twilio.token` | Twilio auth token | Required `""` |
+| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
+| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` |
+| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
+| `alerting.custom.url` | Custom alerting request url | `""` |
+| `alerting.custom.body` | Custom alerting request body. | `""` |
+| `alerting.custom.headers` | Custom alerting request headers | `{}` |
 
 
 ### Conditions
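Taken together, the two parameters this commit introduces, top-level `debug` and per-alert `send-on-resolved`, would be enabled like this. This is a hypothetical minimal configuration assembled purely from the table above for illustration; it is not part of the diff, and the webhook URL is a placeholder:

```yaml
debug: true                     # new in this commit: verbose watchdog logging
alerting:
  slack: "https://hooks.slack.com/services/..."
services:
  - name: twinnation
    url: "https://twinnation.org/health"
    interval: 30s
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: slack
        enabled: true
        threshold: 3
        description: "healthcheck failed 3 times in a row"
        send-on-resolved: true  # new in this commit: also notify once the service recovers
```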
@@ -121,7 +123,7 @@ Here are some examples of conditions you can use:
 
 ## Docker
 
-Building the Docker image is done as following:
+Building the Docker image is done as follows:
 
 ```
 docker build . -t gatus
@@ -194,33 +196,37 @@ services:
       - type: slack
         enabled: true
         description: "healthcheck failed 3 times in a row"
+        send-on-resolved: true
       - type: slack
        enabled: true
         threshold: 5
         description: "healthcheck failed 5 times in a row"
+        send-on-resolved: true
     conditions:
       - "[STATUS] == 200"
       - "[BODY].status == UP"
       - "[RESPONSE_TIME] < 300"
 ```
 
+Here's an example of what the notifications look like:
+
+![Slack notifications](.github/assets/slack-alerts.png)
 
 
 ### Configuring Twilio alerts
 
 ```yaml
 alerting:
   twilio:
-    sid: ****
-    token: ****
-    from: +1-234-567-8901
-    to: +1-234-567-8901
+    sid: "..."
+    token: "..."
+    from: "+1-234-567-8901"
+    to: "+1-234-567-8901"
 services:
   - name: twinnation
     interval: 30s
     url: "https://twinnation.org/health"
     alerts:
       - type: twilio
         enabled: true
         description: "healthcheck failed 3 times in a row"
       - type: twilio
         enabled: true
         threshold: 5
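The configuration table also documents a `custom` alert type (`alerting.custom.*`), which the README hunks in this commit do not exercise. The following is a hypothetical sketch of what such a configuration could look like, built only from the parameters listed in the table; the URL, body and headers are placeholders:

```yaml
alerting:
  custom:
    url: "https://example.com/hooks/alert"
    body: '{"service": "twinnation", "alert": "healthcheck failed"}'
    headers:
      Content-Type: application/json
services:
  - name: twinnation
    url: "https://twinnation.org/health"
    interval: 30s
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: custom
        enabled: true
        threshold: 3
        description: "healthcheck failed 3 times in a row"
```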
@@ -22,6 +22,7 @@ var (
 
 type Config struct {
 	Metrics  bool                 `yaml:"metrics"`
+	Debug    bool                 `yaml:"debug"`
 	Alerting *core.AlertingConfig `yaml:"alerting"`
 	Services []*core.Service      `yaml:"services"`
 }
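The new `Debug` field is picked up through the struct's `yaml` tag like any other top-level option. Below is a minimal, self-contained sketch of how such a struct unmarshals, assuming a YAML library such as `gopkg.in/yaml.v2` and trimming the struct to its two boolean fields:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Config is a trimmed-down copy of the struct above for illustration;
// Alerting and Services are omitted.
type Config struct {
	Metrics bool `yaml:"metrics"`
	Debug   bool `yaml:"debug"`
}

func main() {
	raw := []byte("metrics: true\ndebug: true\n")
	var cfg Config
	if err := yaml.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("metrics=%v debug=%v\n", cfg.Metrics, cfg.Debug) // metrics=true debug=true
}
```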
@@ -2,9 +2,11 @@ package core
 
 import (
 	"bytes"
+	"encoding/base64"
 	"fmt"
 	"github.com/TwinProduction/gatus/client"
 	"net/http"
+	"net/url"
 	"strings"
 )
 
@@ -70,3 +72,64 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string)
 	}
 	return nil
 }
+
+func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider {
+	var message string
+	var color string
+	if resolved {
+		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow)
+		color = "#36A64F"
+	} else {
+		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name)
+		color = "#DD0000"
+	}
+	var results string
+	for _, conditionResult := range result.ConditionResults {
+		var prefix string
+		if conditionResult.Success {
+			prefix = ":heavy_check_mark:"
+		} else {
+			prefix = ":x:"
+		}
+		results += fmt.Sprintf("%s - `%s`\n", prefix, conditionResult.Condition)
+	}
+	return &CustomAlertProvider{
+		Url:    slackWebHookUrl,
+		Method: "POST",
+		Body: fmt.Sprintf(`{
+  "text": "",
+  "attachments": [
+    {
+      "title": ":helmet_with_white_cross: Gatus",
+      "text": "%s:\n> %s",
+      "short": false,
+      "color": "%s",
+      "fields": [
+        {
+          "title": "Condition results",
+          "value": "%s",
+          "short": false
+        }
+      ]
+    },
+  ]
+}`, message, alert.Description, color, results),
+		Headers: map[string]string{"Content-Type": "application/json"},
+	}
+}
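Before the Twilio counterpart below, here is a hypothetical usage sketch of the new Slack factory, mirroring the way the watchdog package calls it later in this commit. The struct field names (`Name`, `NumberOfFailuresInARow`, `Type`, `Enabled`, `Threshold`, `Description`, `SendOnResolved`, `Success`) are taken from references in this diff; the webhook URL and values are placeholders:

```go
package main

import (
	"log"

	"github.com/TwinProduction/gatus/core"
)

func main() {
	// Hypothetical service and alert state: the check failed 3 times, then recovered.
	service := &core.Service{Name: "twinnation", NumberOfFailuresInARow: 3}
	alert := &core.Alert{
		Type:           core.SlackAlert,
		Enabled:        true,
		Threshold:      3,
		Description:    "healthcheck failed 3 times in a row",
		SendOnResolved: true,
	}
	result := &core.Result{Success: true}

	// resolved=true produces the green "has been resolved" message instead of the red trigger message.
	provider := core.CreateSlackCustomAlertProvider("https://hooks.slack.com/services/...", service, alert, result, true)
	if err := provider.Send(service.Name, alert.Description); err != nil {
		log.Printf("failed to send resolved notification: %s", err.Error())
	}
}
```

The same hunk continues with the Twilio factory: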
+
+func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider {
+	return &CustomAlertProvider{
+		Url:    fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID),
+		Method: "POST",
+		Body: url.Values{
+			"To":   {provider.To},
+			"From": {provider.From},
+			"Body": {message},
+		}.Encode(),
+		Headers: map[string]string{
+			"Content-Type":  "application/x-www-form-urlencoded",
+			"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", provider.SID, provider.Token)))),
+		},
+	}
+}
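To make the Twilio factory's output concrete, the snippet below reproduces the same form encoding and Basic-auth header construction with standalone standard-library calls; the SID, token and phone numbers are placeholders:

```go
package main

import (
	"encoding/base64"
	"fmt"
	"net/url"
)

func main() {
	sid, token := "AC00000000000000000000000000000000", "secret-token"

	// url.Values.Encode() form-encodes the three fields, exactly as the provider's Body does.
	body := url.Values{
		"To":   {"+1-234-567-8901"},
		"From": {"+1-234-567-8901"},
		"Body": {"twinnation - healthcheck failed 3 times in a row"},
	}.Encode()

	// The Authorization header is HTTP Basic auth over "SID:token".
	auth := base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", sid, token)))

	fmt.Printf("POST https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json\n", sid)
	fmt.Println("Content-Type: application/x-www-form-urlencoded")
	fmt.Println("Authorization: Basic " + auth)
	fmt.Println(body)
}
```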
main.go (8 changed lines)
@@ -3,7 +3,6 @@ package main
 import (
 	"bytes"
 	"compress/gzip"
-	"encoding/json"
 	"github.com/TwinProduction/gatus/config"
 	"github.com/TwinProduction/gatus/watchdog"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) {
 	if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired {
 		buffer := &bytes.Buffer{}
 		gzipWriter := gzip.NewWriter(buffer)
-		serviceResults := watchdog.GetServiceResults()
-		data, err := json.Marshal(serviceResults)
+		data, err := watchdog.GetJsonEncodedServiceResults()
 		if err != nil {
-			log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error())
+			log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error())
 			writer.WriteHeader(http.StatusInternalServerError)
-			_, _ = writer.Write([]byte("Unable to marshall object to JSON"))
+			_, _ = writer.Write([]byte("Unable to marshal object to JSON"))
 			return
 		}
 		gzipWriter.Write(data)
@@ -1,25 +1,34 @@
 package watchdog
 
 import (
-	"encoding/base64"
+	"encoding/json"
 	"fmt"
 	"github.com/TwinProduction/gatus/config"
 	"github.com/TwinProduction/gatus/core"
 	"github.com/TwinProduction/gatus/metric"
 	"log"
-	"net/url"
 	"sync"
 	"time"
 )
 
 var (
 	serviceResults = make(map[string][]*core.Result)
-	rwLock         sync.RWMutex
+
+	// serviceResultsMutex is used to prevent concurrent map access
+	serviceResultsMutex sync.RWMutex
+
+	// monitoringMutex is used to prevent multiple services from being evaluated at the same time.
+	// Without this, conditions using response time may become inaccurate.
+	monitoringMutex sync.Mutex
 )
 
-// GetServiceResults returns a list of the last 20 results for each services
-func GetServiceResults() *map[string][]*core.Result {
-	return &serviceResults
+// GetJsonEncodedServiceResults returns a list of the last 20 results for each services encoded using json.Marshal.
+// The reason why the encoding is done here is because we use a mutex to prevent concurrent map access.
+func GetJsonEncodedServiceResults() ([]byte, error) {
+	serviceResultsMutex.RLock()
+	data, err := json.Marshal(serviceResults)
+	serviceResultsMutex.RUnlock()
+	return data, err
 }
 
 // Monitor loops over each services and starts a goroutine to monitor each services separately
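The comment on `GetJsonEncodedServiceResults` is worth unpacking: the map is written by the monitoring goroutines, so marshalling it outside the mutex (as `main.go` previously did) could race with those writes. The standalone sketch below shows the same pattern with a hypothetical map and a single concurrent writer:

```go
package main

import (
	"encoding/json"
	"fmt"
	"sync"
	"time"
)

var (
	results      = make(map[string][]int)
	resultsMutex sync.RWMutex
)

// marshalResults takes the read lock before encoding, so json.Marshal never
// observes the map while a writer is mutating it.
func marshalResults() ([]byte, error) {
	resultsMutex.RLock()
	defer resultsMutex.RUnlock()
	return json.Marshal(results)
}

func main() {
	// Writer goroutine, standing in for the monitoring loops.
	go func() {
		for i := 0; ; i++ {
			resultsMutex.Lock()
			results["twinnation"] = append(results["twinnation"], i)
			resultsMutex.Unlock()
			time.Sleep(time.Millisecond)
		}
	}()
	// Reader, standing in for the HTTP handler.
	for i := 0; i < 3; i++ {
		data, err := marshalResults()
		fmt.Println(string(data), err)
		time.Sleep(2 * time.Millisecond)
	}
}
```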
@@ -33,33 +42,39 @@ func Monitor(cfg *config.Config) {
 
 // monitor monitors a single service in a loop
 func monitor(service *core.Service) {
+	cfg := config.Get()
 	for {
-		// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which
-		// could cause performance issues and return inaccurate results
-		rwLock.Lock()
-		log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name)
+		monitoringMutex.Lock()
+		if cfg.Debug {
+			log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name)
+		}
 		result := service.EvaluateConditions()
 		metric.PublishMetricsForService(service, result)
+		serviceResultsMutex.Lock()
 		serviceResults[service.Name] = append(serviceResults[service.Name], result)
 		if len(serviceResults[service.Name]) > 20 {
 			serviceResults[service.Name] = serviceResults[service.Name][1:]
 		}
-		rwLock.Unlock()
+		serviceResultsMutex.Unlock()
 		var extra string
 		if !result.Success {
 			extra = fmt.Sprintf("responseBody=%s", result.Body)
 		}
 		log.Printf(
-			"[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s",
+			"[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s",
 			service.Name,
+			result.Success,
 			len(result.Errors),
 			result.Duration.Round(time.Millisecond),
 			extra,
 		)
 
 		handleAlerting(service, result)
 
-		log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name)
+		if cfg.Debug {
+			log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
+		}
+		monitoringMutex.Unlock()
 		time.Sleep(service.Interval)
 	}
 }
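The renamed `monitoringMutex` serializes evaluations because two checks running at once can inflate each other's measured response time. A standalone sketch of the idea follows; the URLs are placeholders and the measurement is deliberately simplistic:

```go
package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

var monitoringMutex sync.Mutex

// check measures one request while holding the mutex, so concurrent checks
// cannot compete for bandwidth or CPU and skew each other's duration.
func check(name, url string) {
	monitoringMutex.Lock()
	defer monitoringMutex.Unlock()
	start := time.Now()
	resp, err := http.Get(url)
	if err == nil {
		resp.Body.Close()
	}
	fmt.Printf("%s: duration=%s err=%v\n", name, time.Since(start).Round(time.Millisecond), err)
}

func main() {
	var wg sync.WaitGroup
	for _, u := range []string{"https://example.org", "https://example.com"} {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			check(u, u)
		}(u)
	}
	wg.Wait()
}
```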
@@ -72,10 +87,43 @@ func handleAlerting(service *core.Service, result *core.Result) {
 	if result.Success {
 		if service.NumberOfFailuresInARow > 0 {
 			for _, alert := range service.Alerts {
-				if !alert.Enabled || !alert.SendOnResolved || alert.Threshold < service.NumberOfFailuresInARow {
+				if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow {
 					continue
 				}
-				// TODO
+				var alertProvider *core.CustomAlertProvider
+				if alert.Type == core.SlackAlert {
+					if len(cfg.Alerting.Slack) > 0 {
+						log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been resolved", alert.Description)
+						alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
+					} else {
+						log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
+					}
+				} else if alert.Type == core.TwilioAlert {
+					if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
+						log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
+						alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description))
+					} else {
+						log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio isn't configured properly'")
+					}
+				} else if alert.Type == core.CustomAlert {
+					if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
+						log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alert.Description)
+						alertProvider = &core.CustomAlertProvider{
+							Url:     cfg.Alerting.Custom.Url,
+							Method:  cfg.Alerting.Custom.Method,
+							Body:    cfg.Alerting.Custom.Body,
+							Headers: cfg.Alerting.Custom.Headers,
+						}
+					} else {
+						log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured")
+					}
+				}
+				if alertProvider != nil {
+					err := alertProvider.Send(service.Name, alert.Description)
+					if err != nil {
+						log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error())
+					}
+				}
 			}
 		}
 		service.NumberOfFailuresInARow = 0
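The one-character change from `<` to `>` in the skip condition is the crux of the resolved-notification logic. With `threshold: 3` and a service that failed 5 times before recovering, the old comparison skipped exactly the alerts that had fired, while the new one only skips alerts whose threshold was never reached:

```go
package main

import "fmt"

func main() {
	threshold, failuresInARow := 3, 5
	fmt.Println("old check (threshold < failures) skips:", threshold < failuresInARow) // true  -> resolved notification wrongly skipped
	fmt.Println("new check (threshold > failures) skips:", threshold > failuresInARow) // false -> resolved notification is sent
}
```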
@@ -90,33 +138,16 @@ func handleAlerting(service *core.Service, result *core.Result) {
 			if alert.Type == core.SlackAlert {
 				if len(cfg.Alerting.Slack) > 0 {
 					log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alert.Description)
-					alertProvider = &core.CustomAlertProvider{
-						Url:     cfg.Alerting.Slack,
-						Method:  "POST",
-						Body:    fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alert.Description),
-						Headers: map[string]string{"Content-Type": "application/json"},
-					}
+					alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
 				} else {
 					log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
 				}
 			} else if alert.Type == core.TwilioAlert {
 				if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
 					log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
-					alertProvider = &core.CustomAlertProvider{
-						Url:    fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID),
-						Method: "POST",
-						Body: url.Values{
-							"To":   {cfg.Alerting.Twilio.To},
-							"From": {cfg.Alerting.Twilio.From},
-							"Body": {fmt.Sprintf("%s - %s", service.Name, alert.Description)},
-						}.Encode(),
-						Headers: map[string]string{
-							"Content-Type":  "application/x-www-form-urlencoded",
-							"Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))),
-						},
-					}
+					alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("%s - %s", service.Name, alert.Description))
 				} else {
-					log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing")
+					log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
 				}
 			} else if alert.Type == core.CustomAlert {
 				if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {