1
0
Fork 0
mirror of https://github.com/postmannen/ctrl.git synced 2025-01-05 20:09:16 +00:00
ctrl/errorkernel.go

322 lines
8.9 KiB
Go
Raw Normal View History

2021-02-08 07:45:21 +00:00
// The error kernel shall handle errors for a given process.
2021-08-16 11:01:12 +00:00
// This covers cases where the process itself was unable
2021-02-08 07:45:21 +00:00
// to handle the error on its own, and we might need to
// restart the process, or send a message back to the operator
2021-08-16 11:01:12 +00:00
// that the action which the message was supposed to trigger
// failed, or that an event was unable to be processed.
2021-02-08 07:45:21 +00:00
package steward
import (
"context"
"fmt"
"log"
2023-01-11 05:09:42 +00:00
"os"
2022-01-18 18:26:36 +00:00
"time"
2023-01-11 05:09:42 +00:00
"golang.org/x/exp/slog"
)
// errorKernel is the structure that will hold all the error
// handling values and logic.
type errorKernel struct {
2021-03-12 08:38:19 +00:00
// NOTE: The errorKernel should probably have a concept
2021-02-08 07:45:21 +00:00
// of error-state which is a map of all the processes,
// how many times a process have failed over the same
// message etc...
2021-02-24 09:58:02 +00:00
// errorCh is used to report errors from a process
2022-01-18 18:26:36 +00:00
errorCh chan errorEvent
2022-05-22 04:36:02 +00:00
// testCh is used within REQTest for receving data for tests.
testCh chan []byte
2021-08-04 06:35:35 +00:00
2023-01-11 05:09:42 +00:00
ctx context.Context
cancel context.CancelFunc
metrics *metrics
configuration *Configuration
}
// newErrorKernel will initialize and return a new error kernel
2023-01-11 05:09:42 +00:00
func newErrorKernel(ctx context.Context, m *metrics, configuration *Configuration) *errorKernel {
2021-08-04 06:35:35 +00:00
ctxC, cancel := context.WithCancel(ctx)
return &errorKernel{
2023-01-11 05:09:42 +00:00
errorCh: make(chan errorEvent, 2),
testCh: make(chan []byte),
ctx: ctxC,
cancel: cancel,
metrics: m,
configuration: configuration,
}
}
2023-01-11 05:09:42 +00:00
// logLevel names a severity for log output. The values are compared
// against the LogLevel string of the configuration.
type logLevel string

// The log levels known to the error kernel.
const (
	logError   logLevel = "error"
	logInfo    logLevel = "info"
	logWarning logLevel = "warning"
	logDebug   logLevel = "debug"
	logNone    logLevel = "none"
)
// startErrorKernel will start the error kernel and check if there
// have been reveived any errors from any of the processes, and
// handle them appropriately.
2021-03-12 08:38:19 +00:00
//
// NOTE: Since a process will be locked while waiting to send the error
// on the errorCh maybe it makes sense to have a channel inside the
// processes error handling with a select so we can send back to the
// process if it should continue or not based not based on how severe
// the error where. This should be right after sending the error
// sending in the process.
func (e *errorKernel) start(ringBufferBulkInCh chan<- []subjectAndMessage) error {
2023-01-11 05:09:42 +00:00
// Initiate the slog logger.
var replaceFunc func(groups []string, a slog.Attr) slog.Attr
if !e.configuration.LogConsoleTimestamps {
replaceFunc = func(groups []string, a slog.Attr) slog.Attr {
if a.Key == slog.TimeKey {
return slog.Attr{}
}
return a
}
}
switch {
2023-01-11 07:38:15 +00:00
case e.configuration.LogLevel == string(logError):
opts := slog.HandlerOptions{Level: slog.LevelError,
ReplaceAttr: replaceFunc}
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
2023-01-11 05:09:42 +00:00
case e.configuration.LogLevel == string(logInfo):
opts := slog.HandlerOptions{Level: slog.LevelInfo,
ReplaceAttr: replaceFunc}
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
case e.configuration.LogLevel == string(logWarning):
opts := slog.HandlerOptions{Level: slog.LevelWarn,
ReplaceAttr: replaceFunc}
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
case e.configuration.LogLevel == string(logDebug):
opts := slog.HandlerOptions{Level: slog.LevelDebug,
ReplaceAttr: replaceFunc}
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
case e.configuration.LogLevel == string(logNone):
// TODO:
default:
log.Printf("error: not valid log level: %v\n", e.configuration.LogLevel)
os.Exit(1)
}
for {
2022-01-18 18:26:36 +00:00
var errEvent errorEvent
select {
2022-01-18 18:26:36 +00:00
case errEvent = <-e.errorCh:
2021-08-04 06:35:35 +00:00
case <-e.ctx.Done():
return fmt.Errorf("info: stopping errorKernel")
}
2022-01-21 05:15:26 +00:00
sendErrorOrInfo := func(errEvent errorEvent) {
er := fmt.Sprintf("%v, node: %v, %v\n", time.Now().Format("Mon Jan _2 15:04:05 2006"), errEvent.process.node, errEvent.err)
2022-01-21 05:15:26 +00:00
m := Message{
Directory: "errorLog",
ToNode: "errorCentral",
FromNode: errEvent.process.node,
FileName: "error.log",
Data: []byte(er),
Method: REQErrorLog,
ACKTimeout: errEvent.process.configuration.ErrorMessageTimeout,
Retries: errEvent.process.configuration.ErrorMessageRetries,
}
2022-01-21 05:15:26 +00:00
sam := subjectAndMessage{
Subject: newSubject(REQErrorLog, "errorCentral"),
Message: m,
2022-01-21 05:15:26 +00:00
}
// Put the message on the channel to the ringbuffer.
ringBufferBulkInCh <- []subjectAndMessage{sam}
2023-01-11 07:38:15 +00:00
// if errEvent.process.configuration.EnableDebug {
// log.Printf("%v\n", er)
// }
switch errEvent.logLevel {
case logError:
2023-01-12 06:12:35 +00:00
slog.Error("error", fmt.Errorf("%v", er))
2023-01-11 07:38:15 +00:00
case logInfo:
2023-01-12 06:12:35 +00:00
slog.Info(er)
2023-01-11 07:38:15 +00:00
case logWarning:
2023-01-12 06:12:35 +00:00
slog.Warn(er)
2023-01-11 07:38:15 +00:00
case logDebug:
2023-01-12 06:12:35 +00:00
slog.Debug(er)
2023-01-11 07:38:15 +00:00
case logNone:
// Do nothing for type logNone errors.
}
2023-01-11 07:38:15 +00:00
2022-01-21 05:15:26 +00:00
}
2022-01-18 18:26:36 +00:00
// Check the type of the error to decide what to do.
//
// We should be able to handle each error individually and
2022-01-20 05:55:08 +00:00
// also concurrently, so each handler is started in it's
// own go routine
2022-01-18 18:26:36 +00:00
//
// Here we should check the severity of the error,
// and also possibly the the error-state of the process
2022-01-20 05:55:08 +00:00
// that fails.
2022-01-18 18:26:36 +00:00
switch errEvent.errorType {
2022-01-21 05:15:26 +00:00
case errTypeSendError:
2022-01-20 05:55:08 +00:00
// Just log the error by creating a message and send it
// to the errorCentral log server.
2022-01-18 18:26:36 +00:00
go func() {
2022-01-21 05:15:26 +00:00
sendErrorOrInfo(errEvent)
e.metrics.promErrorMessagesSentTotal.Inc()
}()
2022-01-18 18:26:36 +00:00
2022-01-21 05:15:26 +00:00
case errTypeSendInfo:
// Just log the error by creating a message and send it
// to the errorCentral log server.
2022-01-18 18:26:36 +00:00
2022-01-21 05:15:26 +00:00
go func() {
sendErrorOrInfo(errEvent)
e.metrics.promInfoMessagesSentTotal.Inc()
2022-01-18 18:26:36 +00:00
}()
case errTypeWithAction:
2022-01-18 18:26:36 +00:00
// Just print the error, and tell the process to continue. The
2022-01-20 05:55:08 +00:00
// process who sent the error should block and wait for receiving
2022-01-18 18:26:36 +00:00
// an errActionContinue message.
2022-01-20 05:55:08 +00:00
go func() {
2022-01-18 18:26:36 +00:00
log.Printf("TESTING, we received and error from the process, but we're telling the process back to continue\n")
2022-01-20 05:55:08 +00:00
// Send a message back to where the errWithAction function
// was called on the errorActionCh so the caller can decide
// what to do based on the response.
2022-01-18 18:26:36 +00:00
select {
case errEvent.errorActionCh <- errActionContinue:
case <-e.ctx.Done():
log.Printf("info: errorKernel: got ctx.Done, will stop waiting for errAction\n")
return
}
2022-01-20 05:55:08 +00:00
// We also want to log the error.
2023-01-11 07:38:15 +00:00
e.errSend(errEvent.process, errEvent.message, errEvent.err, logWarning)
2022-01-18 18:26:36 +00:00
}()
default:
2022-02-11 08:04:14 +00:00
// fmt.Printf(" * case default\n")
2022-01-18 18:26:36 +00:00
}
}
}
2021-08-04 06:35:35 +00:00
// stop cancels the context of the error kernel by calling its
// cancel function.
func (e *errorKernel) stop() {
e.cancel()
}
2023-01-11 05:09:42 +00:00
type errorEvent struct {
// The actual error
err error
// Channel for communicating the action to take back to
// to the process who triggered the error
errorActionCh chan errorAction
// Some informational text
errorType errorType
// The process structure that belongs to a given process
process process
// The message that where in progress when error occured
message Message
2023-01-11 07:38:15 +00:00
// Level, the log level of the severity
logLevel logLevel
2023-01-11 05:09:42 +00:00
}
// Error implements the error interface for errorEvent. The returned
// text holds the Go-syntax representation of the process and the
// message of the event.
func (e errorEvent) Error() string {
	const format = "worker error: proc = %#v, message = %#v"

	return fmt.Sprintf(format, e.process, e.message)
}
2022-01-21 05:15:26 +00:00
// errSend will just send an error message to the errorCentral.
2023-01-11 07:38:15 +00:00
func (e *errorKernel) errSend(proc process, msg Message, err error, logLevel logLevel) {
2022-01-19 04:04:11 +00:00
ev := errorEvent{
err: err,
2022-01-21 05:15:26 +00:00
errorType: errTypeSendError,
process: proc,
message: msg,
2023-01-11 07:38:15 +00:00
logLevel: logLevel,
2022-01-21 05:15:26 +00:00
// We don't want to create any actions when just
// sending errors.
// errorActionCh: make(chan errorAction),
}
e.errorCh <- ev
}
// infoSend will just send an info message to the errorCentral.
func (e *errorKernel) infoSend(proc process, msg Message, err error) {
ev := errorEvent{
err: err,
errorType: errTypeSendInfo,
process: proc,
message: msg,
2022-01-19 04:04:11 +00:00
// We don't want to create any actions when just
// sending errors.
// errorActionCh: make(chan errorAction),
}
e.errorCh <- ev
}
2023-01-12 11:01:01 +00:00
// logError logs err with slog at error level.
//
// An error is logged for every configured log level that includes
// errors, which is "error" and the more verbose levels "warning",
// "info" and "debug". Nothing is logged for "none" or for a level
// that is not recognized.
func (e *errorKernel) logError(err error, c *Configuration) {
	switch logLevel(c.LogLevel) {
	case logError, logWarning, logInfo, logDebug:
		slog.Error("error", err)
	}
}
// logInfo logs the text of err with slog at info level.
//
// The text is logged when the configured log level is "info" or the
// more verbose "debug". Nothing is logged for the other levels.
func (e *errorKernel) logInfo(err error, c *Configuration) {
	switch logLevel(c.LogLevel) {
	case logInfo, logDebug:
		slog.Info(err.Error())
	}
}
// logWarn logs the text of err with slog at warning level.
//
// The text is logged when the configured log level is "warning" or
// one of the more verbose levels "info" and "debug". Nothing is
// logged for the other levels.
func (e *errorKernel) logWarn(err error, c *Configuration) {
	switch logLevel(c.LogLevel) {
	case logWarning, logInfo, logDebug:
		slog.Warn(err.Error())
	}
}
func (e *errorKernel) logDebug(err error, c *Configuration) {
2023-01-12 06:44:28 +00:00
if c.LogLevel == string(logDebug) {
2023-01-12 06:12:35 +00:00
slog.Debug(err.Error())
}
}
2022-01-18 18:26:36 +00:00
// errorAction tells the process who sent an error what it shall do
// next. The sending process has to block and wait for the response
// on the errorActionCh.
type errorAction int

const (
	// errActionContinue is meant to be used when a process can
	// just continue without taking any special care.
	errActionContinue errorAction = iota

	// TODO NOT IMPLEMENTED YET:
	// errActionKill should log the error, and f.ex. stop the
	// current work, and restart from start?
	// errActionKill errorAction = iota
)
2022-01-18 18:26:36 +00:00
// errorType tells the error kernel how an errorEvent shall be handled.
type errorType int

const (
	// errTypeSendError will just send the content of the error
	// to the central error logger.
	errTypeSendError errorType = iota
	// errTypeSendInfo will just send the content of the info
	// message to the central error logger.
	errTypeSendInfo
	// errTypeWithAction is for errors where the sending process
	// gets an errorAction back on the errorActionCh of the event.
	errTypeWithAction
)