2020-09-19 20:22:12 +00:00
package watchdog
2020-09-16 23:26:19 +00:00
import (
"encoding/json"
2020-11-13 20:01:21 +00:00
"log"
2020-09-16 23:26:19 +00:00
"github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/core"
)
2020-09-19 20:22:12 +00:00
// HandleAlerting takes care of alerts to resolve and alerts to trigger based on result success or failure
func HandleAlerting ( service * core . Service , result * core . Result ) {
2020-09-16 23:26:19 +00:00
cfg := config . Get ( )
if cfg . Alerting == nil {
return
}
if result . Success {
handleAlertsToResolve ( service , result , cfg )
} else {
handleAlertsToTrigger ( service , result , cfg )
}
}
func handleAlertsToTrigger ( service * core . Service , result * core . Result , cfg * config . Config ) {
service . NumberOfSuccessesInARow = 0
service . NumberOfFailuresInARow ++
for _ , alert := range service . Alerts {
// If the alert hasn't been triggered, move to the next one
2021-05-16 01:31:32 +00:00
if ! alert . IsEnabled ( ) || alert . FailureThreshold > service . NumberOfFailuresInARow {
2020-09-16 23:26:19 +00:00
continue
}
if alert . Triggered {
if cfg . Debug {
2021-05-16 01:31:32 +00:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Alert for service=%s with description='%s' has already been TRIGGERED, skipping" , service . Name , alert . GetDescription ( ) )
2020-09-16 23:26:19 +00:00
}
continue
}
2020-09-26 18:23:43 +00:00
alertProvider := config . GetAlertingProviderByAlertType ( cfg , alert . Type )
if alertProvider != nil && alertProvider . IsValid ( ) {
2021-05-16 01:31:32 +00:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Sending %s alert because alert for service=%s with description='%s' has been TRIGGERED" , alert . Type , service . Name , alert . GetDescription ( ) )
2020-09-26 18:23:43 +00:00
customAlertProvider := alertProvider . ToCustomAlertProvider ( service , alert , result , false )
2020-09-16 23:26:19 +00:00
// TODO: retry on error
var err error
2020-09-26 18:23:43 +00:00
// We need to extract the DedupKey from PagerDuty's response
2020-09-16 23:26:19 +00:00
if alert . Type == core . PagerDutyAlert {
var body [ ] byte
2021-05-16 01:31:32 +00:00
if body , err = customAlertProvider . Send ( service . Name , alert . GetDescription ( ) , false ) ; err == nil {
2020-09-16 23:26:19 +00:00
var response pagerDutyResponse
2021-01-10 04:52:11 +00:00
if err = json . Unmarshal ( body , & response ) ; err != nil {
2021-01-21 21:14:32 +00:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Ran into error unmarshaling pagerduty response: %s" , err . Error ( ) )
2020-09-16 23:26:19 +00:00
} else {
alert . ResolveKey = response . DedupKey
}
}
} else {
2020-09-26 18:23:43 +00:00
// All other alert types don't need to extract anything from the body, so we can just send the request right away
2021-05-16 01:31:32 +00:00
_ , err = customAlertProvider . Send ( service . Name , alert . GetDescription ( ) , false )
2020-09-16 23:26:19 +00:00
}
if err != nil {
2021-01-21 21:14:32 +00:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Failed to send an alert for service=%s: %s" , service . Name , err . Error ( ) )
2020-09-16 23:26:19 +00:00
} else {
alert . Triggered = true
}
2020-09-26 18:23:43 +00:00
} else {
2020-09-30 00:06:47 +00:00
log . Printf ( "[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly" , alert . Type )
2020-09-16 23:26:19 +00:00
}
}
}
func handleAlertsToResolve ( service * core . Service , result * core . Result , cfg * config . Config ) {
service . NumberOfSuccessesInARow ++
for _ , alert := range service . Alerts {
2021-05-16 01:31:32 +00:00
if ! alert . IsEnabled ( ) || ! alert . Triggered || alert . SuccessThreshold > service . NumberOfSuccessesInARow {
2020-09-16 23:26:19 +00:00
continue
}
2021-01-21 21:14:32 +00:00
// Even if the alert provider returns an error, we still set the alert's Triggered variable to false.
// Further explanation can be found on Alert's Triggered field.
2020-09-16 23:26:19 +00:00
alert . Triggered = false
2021-05-16 01:31:32 +00:00
if ! alert . IsSendingOnResolved ( ) {
2020-09-16 23:26:19 +00:00
continue
}
2020-09-26 18:23:43 +00:00
alertProvider := config . GetAlertingProviderByAlertType ( cfg , alert . Type )
if alertProvider != nil && alertProvider . IsValid ( ) {
2021-05-16 01:31:32 +00:00
log . Printf ( "[watchdog][handleAlertsToResolve] Sending %s alert because alert for service=%s with description='%s' has been RESOLVED" , alert . Type , service . Name , alert . GetDescription ( ) )
2020-09-26 18:23:43 +00:00
customAlertProvider := alertProvider . ToCustomAlertProvider ( service , alert , result , true )
2020-09-16 23:26:19 +00:00
// TODO: retry on error
2021-05-16 01:31:32 +00:00
_ , err := customAlertProvider . Send ( service . Name , alert . GetDescription ( ) , true )
2020-09-16 23:26:19 +00:00
if err != nil {
2021-01-21 21:14:32 +00:00
log . Printf ( "[watchdog][handleAlertsToResolve] Failed to send an alert for service=%s: %s" , service . Name , err . Error ( ) )
2020-09-16 23:26:19 +00:00
} else {
if alert . Type == core . PagerDutyAlert {
alert . ResolveKey = ""
}
}
2020-09-26 18:23:43 +00:00
} else {
2020-09-30 00:06:47 +00:00
log . Printf ( "[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly" , alert . Type )
2020-09-16 23:26:19 +00:00
}
}
service . NumberOfFailuresInARow = 0
}
2020-09-19 20:22:12 +00:00
// pagerDutyResponse is the JSON body decoded from the response to a PagerDuty alert request.
type pagerDutyResponse struct {
	// Status is the value of the "status" key
	Status string `json:"status"`
	// Message is the value of the "message" key
	Message string `json:"message"`
	// DedupKey is the value of the "dedup_key" key; handleAlertsToTrigger stores it as the alert's ResolveKey
	DedupKey string `json:"dedup_key"`
}