2021-02-08 07:45:21 +00:00
|
|
|
// The error kernel shall handle errors for a given process.
|
2021-08-16 11:01:12 +00:00
|
|
|
// This will be cases where the process itself was unable
|
2021-02-08 07:45:21 +00:00
|
|
|
// to handle the error on its own, and we might need to
|
|
|
|
// restart the process, or send a message back to the operator
|
2021-08-16 11:01:12 +00:00
|
|
|
// that the action which the message was supposed to trigger
|
|
|
|
// failed, or that an event was unable to be processed.
|
2021-02-08 07:45:21 +00:00
|
|
|
|
2021-02-08 05:02:54 +00:00
|
|
|
package steward
|
|
|
|
|
|
|
|
import (
|
2021-08-03 11:43:05 +00:00
|
|
|
"context"
|
2021-02-08 05:02:54 +00:00
|
|
|
"fmt"
|
|
|
|
"log"
|
2022-01-18 18:26:36 +00:00
|
|
|
"time"
|
2021-02-08 05:02:54 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// errorKernel is the structure that will hold all the error
|
|
|
|
// handling values and logic.
|
|
|
|
type errorKernel struct {
|
2021-03-12 08:38:19 +00:00
|
|
|
// NOTE: The errorKernel should probably have a concept
|
2021-02-08 07:45:21 +00:00
|
|
|
// of error-state which is a map of all the processes,
|
|
|
|
// how many times a process have failed over the same
|
|
|
|
// message etc...
|
2021-02-24 09:58:02 +00:00
|
|
|
|
|
|
|
// errorCh is used to report errors from a process
|
2022-01-18 18:26:36 +00:00
|
|
|
errorCh chan errorEvent
|
2022-05-22 04:36:02 +00:00
|
|
|
// testCh is used within REQTest for receving data for tests.
|
|
|
|
testCh chan []byte
|
2021-08-04 06:35:35 +00:00
|
|
|
|
2022-01-19 04:44:20 +00:00
|
|
|
ctx context.Context
|
|
|
|
cancel context.CancelFunc
|
|
|
|
metrics *metrics
|
2021-02-08 05:02:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// newErrorKernel will initialize and return a new error kernel
|
2022-01-19 04:44:20 +00:00
|
|
|
func newErrorKernel(ctx context.Context, m *metrics) *errorKernel {
|
2021-08-04 06:35:35 +00:00
|
|
|
ctxC, cancel := context.WithCancel(ctx)
|
|
|
|
|
2021-02-08 05:02:54 +00:00
|
|
|
return &errorKernel{
|
2022-01-18 18:26:36 +00:00
|
|
|
errorCh: make(chan errorEvent, 2),
|
2022-05-22 04:36:02 +00:00
|
|
|
testCh: make(chan []byte),
|
2021-08-04 06:35:35 +00:00
|
|
|
ctx: ctxC,
|
|
|
|
cancel: cancel,
|
2022-01-19 04:44:20 +00:00
|
|
|
metrics: m,
|
2021-02-08 05:02:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// startErrorKernel will start the error kernel and check if there
|
|
|
|
// have been reveived any errors from any of the processes, and
|
|
|
|
// handle them appropriately.
|
2021-03-12 08:38:19 +00:00
|
|
|
//
|
|
|
|
// NOTE: Since a process will be locked while waiting to send the error
|
2021-02-08 05:02:54 +00:00
|
|
|
// on the errorCh maybe it makes sense to have a channel inside the
|
|
|
|
// processes error handling with a select so we can send back to the
|
|
|
|
// process if it should continue or not based not based on how severe
|
|
|
|
// the error where. This should be right after sending the error
|
|
|
|
// sending in the process.
|
2022-01-20 07:17:37 +00:00
|
|
|
func (e *errorKernel) start(ringBufferBulkInCh chan<- []subjectAndMessage) error {
|
2021-03-12 08:38:19 +00:00
|
|
|
// NOTE: For now it will just print the error messages to the
|
2021-02-08 05:02:54 +00:00
|
|
|
// console.
|
|
|
|
|
2021-08-03 11:43:05 +00:00
|
|
|
for {
|
2022-01-18 18:26:36 +00:00
|
|
|
var errEvent errorEvent
|
2021-08-03 11:43:05 +00:00
|
|
|
select {
|
2022-01-18 18:26:36 +00:00
|
|
|
case errEvent = <-e.errorCh:
|
2021-08-04 06:35:35 +00:00
|
|
|
case <-e.ctx.Done():
|
2021-08-03 11:43:05 +00:00
|
|
|
return fmt.Errorf("info: stopping errorKernel")
|
|
|
|
}
|
2021-02-08 05:02:54 +00:00
|
|
|
|
2022-01-21 05:15:26 +00:00
|
|
|
sendErrorOrInfo := func(errEvent errorEvent) {
|
2022-06-16 21:32:44 +00:00
|
|
|
|
|
|
|
er := fmt.Sprintf("%v, node: %v, %v\n", time.Now().Format("Mon Jan _2 15:04:05 2006"), errEvent.process.node, errEvent.err)
|
2022-01-21 05:15:26 +00:00
|
|
|
|
|
|
|
sam := subjectAndMessage{
|
|
|
|
Subject: newSubject(REQErrorLog, "errorCentral"),
|
|
|
|
Message: Message{
|
|
|
|
Directory: "errorLog",
|
|
|
|
ToNode: "errorCentral",
|
|
|
|
FromNode: errEvent.process.node,
|
|
|
|
FileName: "error.log",
|
2022-01-31 07:49:46 +00:00
|
|
|
Data: []byte(er),
|
2022-01-21 05:15:26 +00:00
|
|
|
Method: REQErrorLog,
|
|
|
|
ACKTimeout: errEvent.process.configuration.ErrorMessageTimeout,
|
|
|
|
Retries: errEvent.process.configuration.ErrorMessageRetries,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put the message on the channel to the ringbuffer.
|
|
|
|
ringBufferBulkInCh <- []subjectAndMessage{sam}
|
2022-02-18 05:40:00 +00:00
|
|
|
|
|
|
|
if errEvent.process.configuration.EnableDebug {
|
|
|
|
log.Printf("%v\n", er)
|
|
|
|
}
|
2022-01-21 05:15:26 +00:00
|
|
|
}
|
|
|
|
|
2022-01-18 18:26:36 +00:00
|
|
|
// Check the type of the error to decide what to do.
|
|
|
|
//
|
2021-08-03 11:43:05 +00:00
|
|
|
// We should be able to handle each error individually and
|
2022-01-20 05:55:08 +00:00
|
|
|
// also concurrently, so each handler is started in it's
|
2021-08-03 11:43:05 +00:00
|
|
|
// own go routine
|
2022-01-18 18:26:36 +00:00
|
|
|
//
|
|
|
|
// Here we should check the severity of the error,
|
|
|
|
// and also possibly the the error-state of the process
|
2022-01-20 05:55:08 +00:00
|
|
|
// that fails.
|
2022-01-18 18:26:36 +00:00
|
|
|
switch errEvent.errorType {
|
|
|
|
|
2022-01-21 05:15:26 +00:00
|
|
|
case errTypeSendError:
|
2022-01-20 05:55:08 +00:00
|
|
|
// Just log the error by creating a message and send it
|
|
|
|
// to the errorCentral log server.
|
2022-01-20 06:19:49 +00:00
|
|
|
|
2022-01-18 18:26:36 +00:00
|
|
|
go func() {
|
2022-01-21 05:15:26 +00:00
|
|
|
sendErrorOrInfo(errEvent)
|
|
|
|
e.metrics.promErrorMessagesSentTotal.Inc()
|
|
|
|
}()
|
2022-01-18 18:26:36 +00:00
|
|
|
|
2022-01-21 05:15:26 +00:00
|
|
|
case errTypeSendInfo:
|
|
|
|
// Just log the error by creating a message and send it
|
|
|
|
// to the errorCentral log server.
|
2022-01-18 18:26:36 +00:00
|
|
|
|
2022-01-21 05:15:26 +00:00
|
|
|
go func() {
|
|
|
|
sendErrorOrInfo(errEvent)
|
|
|
|
e.metrics.promInfoMessagesSentTotal.Inc()
|
2022-01-18 18:26:36 +00:00
|
|
|
}()
|
|
|
|
|
2022-01-19 06:31:25 +00:00
|
|
|
case errTypeWithAction:
|
2022-01-18 18:26:36 +00:00
|
|
|
// Just print the error, and tell the process to continue. The
|
2022-01-20 05:55:08 +00:00
|
|
|
// process who sent the error should block and wait for receiving
|
2022-01-18 18:26:36 +00:00
|
|
|
// an errActionContinue message.
|
|
|
|
|
2022-01-20 05:55:08 +00:00
|
|
|
go func() {
|
2022-01-18 18:26:36 +00:00
|
|
|
log.Printf("TESTING, we received and error from the process, but we're telling the process back to continue\n")
|
|
|
|
|
2022-01-20 05:55:08 +00:00
|
|
|
// Send a message back to where the errWithAction function
|
|
|
|
// was called on the errorActionCh so the caller can decide
|
|
|
|
// what to do based on the response.
|
2022-01-18 18:26:36 +00:00
|
|
|
select {
|
|
|
|
case errEvent.errorActionCh <- errActionContinue:
|
|
|
|
case <-e.ctx.Done():
|
|
|
|
log.Printf("info: errorKernel: got ctx.Done, will stop waiting for errAction\n")
|
|
|
|
return
|
|
|
|
}
|
2022-01-20 05:55:08 +00:00
|
|
|
|
|
|
|
// We also want to log the error.
|
|
|
|
e.errSend(errEvent.process, errEvent.message, errEvent.err)
|
2022-01-18 18:26:36 +00:00
|
|
|
}()
|
2022-01-19 06:31:25 +00:00
|
|
|
|
|
|
|
default:
|
2022-02-11 08:04:14 +00:00
|
|
|
// fmt.Printf(" * case default\n")
|
2022-01-18 18:26:36 +00:00
|
|
|
}
|
2021-08-03 11:43:05 +00:00
|
|
|
}
|
2021-02-08 05:02:54 +00:00
|
|
|
}
|
|
|
|
|
2021-08-04 06:35:35 +00:00
|
|
|
func (e *errorKernel) stop() {
|
|
|
|
e.cancel()
|
|
|
|
}
|
|
|
|
|
2022-01-21 05:15:26 +00:00
|
|
|
// errSend will just send an error message to the errorCentral.
|
2022-01-19 04:04:11 +00:00
|
|
|
func (e *errorKernel) errSend(proc process, msg Message, err error) {
|
|
|
|
ev := errorEvent{
|
2022-01-19 06:31:25 +00:00
|
|
|
err: err,
|
2022-01-21 05:15:26 +00:00
|
|
|
errorType: errTypeSendError,
|
|
|
|
process: proc,
|
|
|
|
message: msg,
|
|
|
|
// We don't want to create any actions when just
|
|
|
|
// sending errors.
|
|
|
|
// errorActionCh: make(chan errorAction),
|
|
|
|
}
|
|
|
|
|
|
|
|
e.errorCh <- ev
|
|
|
|
}
|
|
|
|
|
|
|
|
// infoSend will just send an info message to the errorCentral.
|
|
|
|
func (e *errorKernel) infoSend(proc process, msg Message, err error) {
|
|
|
|
ev := errorEvent{
|
|
|
|
err: err,
|
|
|
|
errorType: errTypeSendInfo,
|
2022-01-19 06:31:25 +00:00
|
|
|
process: proc,
|
|
|
|
message: msg,
|
2022-01-19 04:04:11 +00:00
|
|
|
// We don't want to create any actions when just
|
|
|
|
// sending errors.
|
|
|
|
// errorActionCh: make(chan errorAction),
|
|
|
|
}
|
|
|
|
|
|
|
|
e.errorCh <- ev
|
|
|
|
}
|
|
|
|
|
2022-02-18 06:01:43 +00:00
|
|
|
func (e *errorKernel) logConsoleOnlyIfDebug(err error, c *Configuration) {
|
|
|
|
if c.EnableDebug {
|
|
|
|
log.Printf("%v\n", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-02-04 05:24:34 +00:00
|
|
|
// // TODO: Needs more work.
|
|
|
|
// //
|
|
|
|
// // errWithAction
|
|
|
|
// //
|
|
|
|
// // Will prepare an errorEvent to send to the errorKernel that
|
|
|
|
// // contains a channel of type errorAction.
|
|
|
|
// // The errorActionCh are returned from the function and are used
|
|
|
|
// // to create a channel between where this function is called and
|
|
|
|
// // the go routine started in the errorKernel. From where the
|
|
|
|
// // function was called we can read the channel for a response
|
|
|
|
// // given from the errorKernel, and then decide what to do based
|
|
|
|
// // on the errorAction value.
|
|
|
|
// func (e *errorKernel) errWithAction(proc process, msg Message, err error) chan errorAction {
|
|
|
|
// // Create the channel where to receive what action to do.
|
|
|
|
// errActionCh := make(chan errorAction)
|
2022-01-19 04:04:11 +00:00
|
|
|
//
|
2022-02-04 05:24:34 +00:00
|
|
|
// ev := errorEvent{
|
|
|
|
// err: err,
|
|
|
|
// errorType: errTypeWithAction,
|
|
|
|
// process: proc,
|
|
|
|
// message: msg,
|
|
|
|
// errorActionCh: errActionCh,
|
|
|
|
// }
|
2022-01-20 05:55:08 +00:00
|
|
|
//
|
2022-02-04 05:24:34 +00:00
|
|
|
// e.errorCh <- ev
|
|
|
|
//
|
|
|
|
// return errActionCh
|
|
|
|
// }
|
2022-01-19 04:04:11 +00:00
|
|
|
|
2022-01-18 18:26:36 +00:00
|
|
|
// errorAction tells the process that reported an error what it
// shall do next. The reporting process has to block and wait for
// the value to arrive on the errorActionCh of the event.
type errorAction int

const (
	// errActionContinue is meant to be used when a process can
	// just continue without taking any special care.
	errActionContinue errorAction = iota

	// TODO:
	// errActionKill should log the error, stop the current
	// worker process, and spawn a new one.
	// errActionKill errorAction = iota
)
|
|
|
|
|
2022-01-18 18:26:36 +00:00
|
|
|
// errorType is the kind of an errorEvent. The error kernel switches
// on it to decide how the event is handled.
type errorType int

const (
	// errTypeSendError is for errors where the content should just
	// be sent to the central error logger.
	errTypeSendError errorType = iota
	// errTypeSendInfo is for informational messages that should
	// just be sent to the central error logger.
	errTypeSendInfo
	// errTypeWithAction is for errors where the sender waits for an
	// errorAction in return.
	errTypeWithAction
)
|
|
|
|
|
|
|
|
type errorEvent struct {
|
|
|
|
// The actual error
|
|
|
|
err error
|
2021-02-08 07:45:21 +00:00
|
|
|
// Channel for communicating the action to take back to
|
|
|
|
// to the process who triggered the error
|
2021-02-08 05:02:54 +00:00
|
|
|
errorActionCh chan errorAction
|
2021-02-08 07:45:21 +00:00
|
|
|
// Some informational text
|
2022-01-18 18:26:36 +00:00
|
|
|
errorType errorType
|
2021-02-08 07:45:21 +00:00
|
|
|
// The process structure that belongs to a given process
|
|
|
|
process process
|
|
|
|
// The message that where in progress when error occured
|
|
|
|
message Message
|
2021-02-08 05:02:54 +00:00
|
|
|
}
|
|
|
|
|
2022-01-18 18:26:36 +00:00
|
|
|
func (e errorEvent) Error() string {
|
2021-02-08 05:02:54 +00:00
|
|
|
return fmt.Sprintf("worker error: proc = %#v, message = %#v", e.process, e.message)
|
|
|
|
}
|