mirror of
https://github.com/postmannen/ctrl.git
synced 2024-12-14 12:37:31 +00:00
69995f76ca
updated references removed tui client removed ringbuffer persist store removed ringbuffer enabled audit logging moved audit logging to message readers disabled goreleaser update readme, cbor, zstd removed request type ping and pong update readme testing with cmd.WaitDelay for clicommand fixed readme removed ringbuffer flag default serialization set to cbor, default compression set to zstd, fixed race, removed event type ack and nack, also removed from subject. Fixed file stat error for copy log file removed remaining elements of the event type removed comments renamed toRingbufferCh to samToSendCh renamed directSAMSCh ro samSendLocalCh removed handler interface agpl3 license added license-change.md
321 lines
8.9 KiB
Go
321 lines
8.9 KiB
Go
// The error kernel shall handle errors for a given process.
|
|
// This will be cases where the process itself were unable
|
|
// to handle the error on it's own, and we might need to
|
|
// restart the process, or send a message back to the operator
|
|
// that the action which the message where supposed to trigger
|
|
// failed, or that an event where unable to be processed.
|
|
|
|
package ctrl
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"time"
|
|
|
|
"golang.org/x/exp/slog"
|
|
)
|
|
|
|
// errorKernel is the structure that will hold all the error
|
|
// handling values and logic.
|
|
type errorKernel struct {
|
|
// NOTE: The errorKernel should probably have a concept
|
|
// of error-state which is a map of all the processes,
|
|
// how many times a process have failed over the same
|
|
// message etc...
|
|
|
|
// errorCh is used to report errors from a process
|
|
errorCh chan errorEvent
|
|
// testCh is used within REQTest for receving data for tests.
|
|
testCh chan []byte
|
|
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
metrics *metrics
|
|
configuration *Configuration
|
|
}
|
|
|
|
// newErrorKernel will initialize and return a new error kernel
|
|
func newErrorKernel(ctx context.Context, m *metrics, configuration *Configuration) *errorKernel {
|
|
ctxC, cancel := context.WithCancel(ctx)
|
|
|
|
return &errorKernel{
|
|
errorCh: make(chan errorEvent, 2),
|
|
testCh: make(chan []byte),
|
|
ctx: ctxC,
|
|
cancel: cancel,
|
|
metrics: m,
|
|
configuration: configuration,
|
|
}
|
|
}
|
|
|
|
type logLevel string
|
|
|
|
const logError logLevel = "error"
|
|
const logInfo logLevel = "info"
|
|
const logWarning logLevel = "warning"
|
|
const logDebug logLevel = "debug"
|
|
const logNone logLevel = "none"
|
|
|
|
// startErrorKernel will start the error kernel and check if there
|
|
// have been reveived any errors from any of the processes, and
|
|
// handle them appropriately.
|
|
//
|
|
// NOTE: Since a process will be locked while waiting to send the error
|
|
// on the errorCh maybe it makes sense to have a channel inside the
|
|
// processes error handling with a select so we can send back to the
|
|
// process if it should continue or not based not based on how severe
|
|
// the error where. This should be right after sending the error
|
|
// sending in the process.
|
|
func (e *errorKernel) start(ringBufferBulkInCh chan<- []subjectAndMessage) error {
|
|
// Initiate the slog logger.
|
|
var replaceFunc func(groups []string, a slog.Attr) slog.Attr
|
|
if !e.configuration.LogConsoleTimestamps {
|
|
replaceFunc = func(groups []string, a slog.Attr) slog.Attr {
|
|
if a.Key == slog.TimeKey {
|
|
return slog.Attr{}
|
|
}
|
|
return a
|
|
}
|
|
}
|
|
|
|
switch {
|
|
case e.configuration.LogLevel == string(logError):
|
|
opts := slog.HandlerOptions{Level: slog.LevelError,
|
|
ReplaceAttr: replaceFunc}
|
|
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
|
|
|
|
case e.configuration.LogLevel == string(logInfo):
|
|
opts := slog.HandlerOptions{Level: slog.LevelInfo,
|
|
ReplaceAttr: replaceFunc}
|
|
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
|
|
|
|
case e.configuration.LogLevel == string(logWarning):
|
|
opts := slog.HandlerOptions{Level: slog.LevelWarn,
|
|
ReplaceAttr: replaceFunc}
|
|
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
|
|
|
|
case e.configuration.LogLevel == string(logDebug):
|
|
opts := slog.HandlerOptions{Level: slog.LevelDebug,
|
|
ReplaceAttr: replaceFunc}
|
|
slog.SetDefault(slog.New(opts.NewTextHandler(os.Stderr)))
|
|
|
|
case e.configuration.LogLevel == string(logNone):
|
|
// TODO:
|
|
default:
|
|
log.Printf("error: not valid log level: %v\n", e.configuration.LogLevel)
|
|
os.Exit(1)
|
|
}
|
|
|
|
for {
|
|
var errEvent errorEvent
|
|
select {
|
|
case errEvent = <-e.errorCh:
|
|
case <-e.ctx.Done():
|
|
return fmt.Errorf("info: stopping errorKernel")
|
|
}
|
|
|
|
sendErrorOrInfo := func(errEvent errorEvent) {
|
|
|
|
er := fmt.Sprintf("%v, node: %v, %v\n", time.Now().Format("Mon Jan _2 15:04:05 2006"), errEvent.process.node, errEvent.err)
|
|
|
|
m := Message{
|
|
Directory: "errorLog",
|
|
ToNode: "errorCentral",
|
|
FromNode: errEvent.process.node,
|
|
FileName: "error.log",
|
|
Data: []byte(er),
|
|
Method: REQErrorLog,
|
|
ACKTimeout: errEvent.process.configuration.ErrorMessageTimeout,
|
|
Retries: errEvent.process.configuration.ErrorMessageRetries,
|
|
}
|
|
|
|
sam := subjectAndMessage{
|
|
Subject: newSubject(REQErrorLog, "errorCentral"),
|
|
Message: m,
|
|
}
|
|
|
|
// Put the message on the channel to the ringbuffer.
|
|
ringBufferBulkInCh <- []subjectAndMessage{sam}
|
|
|
|
// if errEvent.process.configuration.EnableDebug {
|
|
// log.Printf("%v\n", er)
|
|
// }
|
|
|
|
switch errEvent.logLevel {
|
|
case logError:
|
|
slog.Error("error", fmt.Errorf("%v", er))
|
|
case logInfo:
|
|
slog.Info(er)
|
|
case logWarning:
|
|
slog.Warn(er)
|
|
case logDebug:
|
|
slog.Debug(er)
|
|
case logNone:
|
|
// Do nothing for type logNone errors.
|
|
}
|
|
|
|
}
|
|
|
|
// Check the type of the error to decide what to do.
|
|
//
|
|
// We should be able to handle each error individually and
|
|
// also concurrently, so each handler is started in it's
|
|
// own go routine
|
|
//
|
|
// Here we should check the severity of the error,
|
|
// and also possibly the the error-state of the process
|
|
// that fails.
|
|
switch errEvent.errorType {
|
|
|
|
case errTypeSendError:
|
|
// Just log the error by creating a message and send it
|
|
// to the errorCentral log server.
|
|
|
|
go func() {
|
|
sendErrorOrInfo(errEvent)
|
|
e.metrics.promErrorMessagesSentTotal.Inc()
|
|
}()
|
|
|
|
case errTypeSendInfo:
|
|
// Just log the error by creating a message and send it
|
|
// to the errorCentral log server.
|
|
|
|
go func() {
|
|
sendErrorOrInfo(errEvent)
|
|
e.metrics.promInfoMessagesSentTotal.Inc()
|
|
}()
|
|
|
|
case errTypeWithAction:
|
|
// Just print the error, and tell the process to continue. The
|
|
// process who sent the error should block and wait for receiving
|
|
// an errActionContinue message.
|
|
|
|
go func() {
|
|
log.Printf("TESTING, we received and error from the process, but we're telling the process back to continue\n")
|
|
|
|
// Send a message back to where the errWithAction function
|
|
// was called on the errorActionCh so the caller can decide
|
|
// what to do based on the response.
|
|
select {
|
|
case errEvent.errorActionCh <- errActionContinue:
|
|
case <-e.ctx.Done():
|
|
log.Printf("info: errorKernel: got ctx.Done, will stop waiting for errAction\n")
|
|
return
|
|
}
|
|
|
|
// We also want to log the error.
|
|
e.errSend(errEvent.process, errEvent.message, errEvent.err, logWarning)
|
|
}()
|
|
|
|
default:
|
|
// fmt.Printf(" * case default\n")
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *errorKernel) stop() {
|
|
e.cancel()
|
|
}
|
|
|
|
type errorEvent struct {
|
|
// The actual error
|
|
err error
|
|
// Channel for communicating the action to take back to
|
|
// to the process who triggered the error
|
|
errorActionCh chan errorAction
|
|
// Some informational text
|
|
errorType errorType
|
|
// The process structure that belongs to a given process
|
|
process process
|
|
// The message that where in progress when error occured
|
|
message Message
|
|
// Level, the log level of the severity
|
|
logLevel logLevel
|
|
}
|
|
|
|
func (e errorEvent) Error() string {
|
|
return fmt.Sprintf("worker error: proc = %#v, message = %#v", e.process, e.message)
|
|
}
|
|
|
|
// errSend will just send an error message to the errorCentral.
|
|
func (e *errorKernel) errSend(proc process, msg Message, err error, logLevel logLevel) {
|
|
ev := errorEvent{
|
|
err: err,
|
|
errorType: errTypeSendError,
|
|
process: proc,
|
|
message: msg,
|
|
logLevel: logLevel,
|
|
// We don't want to create any actions when just
|
|
// sending errors.
|
|
// errorActionCh: make(chan errorAction),
|
|
}
|
|
|
|
e.errorCh <- ev
|
|
}
|
|
|
|
// infoSend will just send an info message to the errorCentral.
|
|
func (e *errorKernel) infoSend(proc process, msg Message, err error) {
|
|
ev := errorEvent{
|
|
err: err,
|
|
errorType: errTypeSendInfo,
|
|
process: proc,
|
|
message: msg,
|
|
// We don't want to create any actions when just
|
|
// sending errors.
|
|
// errorActionCh: make(chan errorAction),
|
|
}
|
|
|
|
e.errorCh <- ev
|
|
}
|
|
|
|
func (e *errorKernel) logError(err error, c *Configuration) {
|
|
if c.LogLevel == string(logError) {
|
|
slog.Error("error", err)
|
|
}
|
|
}
|
|
|
|
func (e *errorKernel) logInfo(err error, c *Configuration) {
|
|
if c.LogLevel == string(logInfo) {
|
|
slog.Info(err.Error())
|
|
}
|
|
}
|
|
|
|
func (e *errorKernel) logWarn(err error, c *Configuration) {
|
|
if c.LogLevel == string(logWarning) {
|
|
slog.Warn(err.Error())
|
|
}
|
|
}
|
|
|
|
func (e *errorKernel) logDebug(err error, c *Configuration) {
|
|
if c.LogLevel == string(logDebug) {
|
|
slog.Debug(err.Error())
|
|
}
|
|
}
|
|
|
|
// errorAction is used to tell the process who sent the error
|
|
// what it shall do. The process who sends the error will
|
|
// have to block and wait for the response on the errorActionCh.
|
|
type errorAction int
|
|
|
|
const (
|
|
// errActionContinue is ment to be used when the a process
|
|
// can just continue without taking any special care.
|
|
errActionContinue errorAction = iota
|
|
// TODO NOT IMPLEMENTED YET:
|
|
// errActionKill should log the error,
|
|
// and f.ex. stop the current work, and restart from start?
|
|
// errActionKill errorAction = iota
|
|
)
|
|
|
|
// errorType
|
|
type errorType int
|
|
|
|
const (
|
|
// errSend will just send the content of the error to the
|
|
// central error logger.
|
|
errTypeSendError errorType = iota
|
|
errTypeSendInfo errorType = iota
|
|
errTypeWithAction errorType = iota
|
|
)
|