1
0
Fork 0
mirror of https://github.com/postmannen/ctrl.git synced 2025-01-07 04:49:17 +00:00
ctrl/process.go

1027 lines
36 KiB
Go
Raw Normal View History

2021-03-03 13:14:32 +00:00
package steward
import (
"bytes"
2022-01-03 11:30:28 +00:00
"compress/gzip"
2021-04-07 16:05:07 +00:00
"context"
2022-02-03 06:04:10 +00:00
"crypto/ed25519"
2021-03-03 13:14:32 +00:00
"encoding/gob"
2022-06-17 07:54:20 +00:00
"errors"
2021-03-03 13:14:32 +00:00
"fmt"
2022-01-03 11:30:28 +00:00
"io"
2021-03-03 13:14:32 +00:00
"log"
2022-01-03 10:36:05 +00:00
"os"
"sync"
2021-03-03 13:14:32 +00:00
"time"
2021-12-29 06:28:09 +00:00
"github.com/fxamacker/cbor/v2"
"github.com/klauspost/compress/zstd"
2021-03-03 13:14:32 +00:00
"github.com/nats-io/nats.go"
"github.com/prometheus/client_golang/prometheus"
2022-06-17 07:54:20 +00:00
// "google.golang.org/protobuf/internal/errors"
2021-03-03 13:14:32 +00:00
)
// processKind is the kind of a process, either processKindSubscriber or
// processKindPublisher. It is used to decide which kind of worker to
// start for a process, and it is part of what identifies a process in
// the process map.
type processKind string

const (
	// processKindSubscriber is the kind used for processes that subscribe
	// to messages on a subject.
	processKindSubscriber processKind = "subscriber"

	// processKindPublisher is the kind used for processes that publish
	// messages on a subject.
	processKindPublisher processKind = "publisher"
)
2021-08-16 11:01:12 +00:00
// process holds all the logic to handle a message type and it's
// method, subscription/publishin messages for a subject, and more.
2021-03-03 13:14:32 +00:00
type process struct {
// isSubProcess is used to indentify subprocesses spawned by other processes.
isSubProcess bool
2022-04-01 05:09:55 +00:00
// server
server *server
// messageID
2021-03-03 13:14:32 +00:00
messageID int
// the subject used for the specific process. One process
// can contain only one sender on a message bus, hence
// also one subject
subject Subject
// Put a node here to be able know the node a process is at.
// NB: Might not be needed later on.
2021-06-29 06:21:42 +00:00
node Node
2021-03-03 13:14:32 +00:00
// The processID for the current process
processID int
2021-03-03 13:14:32 +00:00
processKind processKind
// methodsAvailable
methodsAvailable MethodsAvailable
2022-03-31 12:57:30 +00:00
// procFunc is a function that will be started when a worker process
// is started. If a procFunc is registered when creating a new process
// the procFunc will be started as a go routine when the process is started,
// and stopped when the process is stopped.
//
// A procFunc can be started both for publishing and subscriber processes.
//
// When used with a subscriber process the usecase is most likely to handle
// some kind of state needed for a request type. The handlers themselves
// can not hold state since they are only called once per message received,
// and exits when the message is handled leaving no state behind. With a procfunc
// we can have a process function running at all times tied to the process, and
// this function can be able to hold the state needed in a certain scenario.
//
// With a subscriber handler you generally take the message in the handler and
// pass it on to the procFunc by putting it on the procFuncCh<-, and the
// message can then be read from the procFuncCh inside the procFunc, and we
// can do some further work on it, for example update registry for metrics that
// is needed for that specific request type.
//
// With a publisher process you can attach a static function that will do some
// work to a request type, and publish the result.
//
// procFunc's can also be used to wrap in other types which we want to
// work with. An example can be handling of metrics which the message
// have no notion of, but a procFunc can have that wrapped in from when it was constructed.
procFunc func(ctx context.Context, procFuncCh chan Message) error
// The channel to send a messages to the procFunc go routine.
// This is typically used within the methodHandler for so we
// can pass messages between the procFunc and the handler.
procFuncCh chan Message
2021-03-08 13:09:14 +00:00
// copy of the configuration from server
configuration *Configuration
// The new messages channel copied from *Server
toRingbufferCh chan<- []subjectAndMessage
// The structure who holds all processes information
processes *processes
// nats connection
natsConn *nats.Conn
// natsSubscription returned when calling natsConn.Subscribe
2021-04-08 10:51:54 +00:00
natsSubscription *nats.Subscription
2021-04-07 16:05:07 +00:00
// context
ctx context.Context
// context cancelFunc
ctxCancel context.CancelFunc
// Process name
processName processName
2022-06-11 04:30:58 +00:00
// handler is used to directly attach a handler to a process upon
// creation of the process, like when a process is spawning a sub
// process like REQCopySrc do. If we're not spawning a sub process
// and it is a regular process the handler to use is found with the
// getHandler method
handler func(proc process, message Message, node string) ([]byte, error)
// startup holds the startup functions for starting up publisher
// or subscriber processes
2021-08-18 10:16:21 +00:00
startup *startup
// Signatures
2022-04-21 11:21:36 +00:00
nodeAuth *nodeAuth
2022-04-07 07:34:06 +00:00
// centralAuth
centralAuth *centralAuth
2022-04-01 06:43:14 +00:00
// errorKernel
errorKernel *errorKernel
2022-04-01 06:51:14 +00:00
// metrics
metrics *metrics
2021-03-03 13:14:32 +00:00
}
// prepareNewProcess will set the the provided values and the default
// values for a process.
2022-04-01 05:09:55 +00:00
func newProcess(ctx context.Context, server *server, subject Subject, processKind processKind, procFunc func() error) process {
2021-03-03 13:14:32 +00:00
// create the initial configuration for a sessions communicating with 1 host process.
server.processes.mu.Lock()
2022-04-01 05:09:55 +00:00
server.processes.lastProcessID++
pid := server.processes.lastProcessID
server.processes.mu.Unlock()
2021-03-03 13:14:32 +00:00
ctx, cancel := context.WithCancel(ctx)
2021-04-07 16:05:07 +00:00
var method Method
2021-03-03 13:14:32 +00:00
proc := process{
2022-04-01 05:09:55 +00:00
server: server,
2021-03-03 13:14:32 +00:00
messageID: 0,
subject: subject,
2022-04-01 05:09:55 +00:00
node: Node(server.configuration.NodeName),
processID: pid,
2021-03-03 13:14:32 +00:00
processKind: processKind,
methodsAvailable: method.GetMethodsAvailable(),
2022-04-01 05:09:55 +00:00
toRingbufferCh: server.toRingBufferCh,
configuration: server.configuration,
processes: server.processes,
natsConn: server.natsConn,
2021-04-07 16:05:07 +00:00
ctx: ctx,
ctxCancel: cancel,
2022-04-01 05:09:55 +00:00
startup: newStartup(server),
2022-04-21 11:21:36 +00:00
nodeAuth: server.nodeAuth,
2022-04-07 07:34:06 +00:00
centralAuth: server.centralAuth,
2022-04-01 06:43:14 +00:00
errorKernel: server.errorKernel,
2022-04-01 06:51:14 +00:00
metrics: server.metrics,
2021-03-03 13:14:32 +00:00
}
// We use the full name of the subject to identify a unique
// process. We can do that since a process can only handle
// one message queue.
if proc.processKind == processKindPublisher {
proc.processName = processNameGet(proc.subject.name(), processKindPublisher)
}
if proc.processKind == processKindSubscriber {
proc.processName = processNameGet(proc.subject.name(), processKindSubscriber)
}
2021-03-03 13:14:32 +00:00
return proc
}
// The purpose of this function is to check if we should start a
// publisher or subscriber process, where a process is a go routine
// that will handle either sending or receiving messages on one
// subject.
//
// It will give the process the next available ID, and also add the
// process to the processes map in the server structure.
2022-04-01 05:09:55 +00:00
func (p process) spawnWorker() {
2021-03-03 13:14:32 +00:00
// processName := processNameGet(p.subject.name(), p.processKind)
2021-06-08 04:02:08 +00:00
// Add prometheus metrics for the process.
2022-11-30 05:18:19 +00:00
if !p.isSubProcess {
p.metrics.promProcessesAllRunning.With(prometheus.Labels{"processName": string(p.processName)})
}
2021-03-03 13:14:32 +00:00
// Start a publisher worker, which will start a go routine (process)
// That will take care of all the messages for the subject it owns.
if p.processKind == processKindPublisher {
2021-03-25 13:30:39 +00:00
2021-03-09 06:43:55 +00:00
// If there is a procFunc for the process, start it.
if p.procFunc != nil {
// Initialize the channel for communication between the proc and
// the procFunc.
p.procFuncCh = make(chan Message)
2021-03-09 06:43:55 +00:00
// Start the procFunc in it's own anonymous func so we are able
// to get the return error.
go func() {
err := p.procFunc(p.ctx, p.procFuncCh)
2021-03-09 06:43:55 +00:00
if err != nil {
2021-09-23 06:19:53 +00:00
er := fmt.Errorf("error: spawnWorker: start procFunc failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
2021-03-09 06:43:55 +00:00
}
}()
}
2022-04-01 05:09:55 +00:00
go p.publishMessages(p.natsConn)
2021-03-03 13:14:32 +00:00
}
// Start a subscriber worker, which will start a go routine (process)
// That will take care of all the messages for the subject it owns.
if p.processKind == processKindSubscriber {
// If there is a procFunc for the process, start it.
if p.procFunc != nil {
// Initialize the channel for communication between the proc and
// the procFunc.
p.procFuncCh = make(chan Message)
2021-03-25 13:30:39 +00:00
// Start the procFunc in it's own anonymous func so we are able
// to get the return error.
go func() {
err := p.procFunc(p.ctx, p.procFuncCh)
if err != nil {
2021-09-23 06:19:53 +00:00
er := fmt.Errorf("error: spawnWorker: start procFunc failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
}
}()
}
2021-03-05 12:10:46 +00:00
2021-04-08 10:51:54 +00:00
p.natsSubscription = p.subscribeMessages()
// We also need to be able to remove all the information about this process
// when the process context is canceled.
go func() {
<-p.ctx.Done()
err := p.natsSubscription.Unsubscribe()
if err != nil {
er := fmt.Errorf("error: spawnWorker: got <-ctx.Done, but unable to unsubscribe natsSubscription failed: %v", err)
p.errorKernel.errSend(p, Message{}, er)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
p.processes.active.mu.Lock()
delete(p.processes.active.procNames, p.processName)
p.processes.active.mu.Unlock()
log.Printf("Successfully stopped process: %v\n", p.processName)
}()
2021-03-03 13:14:32 +00:00
}
2021-04-08 10:51:54 +00:00
// Add information about the new process to the started processes map.
2022-04-01 07:21:50 +00:00
p.processes.active.mu.Lock()
p.processes.active.procNames[p.processName] = p
2022-04-01 07:21:50 +00:00
p.processes.active.mu.Unlock()
2022-10-05 07:24:49 +00:00
er := fmt.Errorf("successfully started process: %v", p.processName)
2022-10-05 07:16:22 +00:00
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2021-03-03 13:14:32 +00:00
}
2022-06-17 07:54:20 +00:00
var (
	// ErrACKSubscribeRetry is returned by the delivery attempts in
	// messageDeliverNats to signal that the attempt failed, and that the
	// delivery loop should make another attempt.
	ErrACKSubscribeRetry = errors.New("steward: retrying to subscribe for ack message")
)
// messageDeliverNats will create the Nats message with headers and payload.
// It will also take care of the delivering the message that is converted to
2021-12-30 05:28:21 +00:00
// gob or cbor format as a nats.Message. It will also take care of checking
// timeouts and retries specified for the message.
2021-12-29 05:40:42 +00:00
func (p process) messageDeliverNats(natsMsgPayload []byte, natsMsgHeader nats.Header, natsConn *nats.Conn, message Message) {
2021-03-03 13:14:32 +00:00
retryAttempts := 0
if message.RetryWait <= 0 {
message.RetryWait = 0
}
// The for loop will run until the message is delivered successfully,
// or that retries are reached.
2021-03-03 13:14:32 +00:00
for {
msg := &nats.Msg{
2021-03-09 06:43:55 +00:00
Subject: string(p.subject.name()),
// Subject: fmt.Sprintf("%s.%s.%s", proc.node, "command", "CLICommandRequest"),
2021-03-03 13:14:32 +00:00
// Structure of the reply message are:
// <nodename>.<message type>.<method>.reply
2021-12-29 05:40:42 +00:00
Reply: fmt.Sprintf("%s.reply", p.subject.name()),
Data: natsMsgPayload,
Header: natsMsgHeader,
2021-12-28 11:05:09 +00:00
}
er := fmt.Errorf("info: preparing to send nats message with subject %v, id: %v", msg.Subject, message.ID)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-12-29 21:49:47 +00:00
var err error
switch {
2022-01-27 06:19:04 +00:00
// If it is a NACK message we just deliver the message and return
// here so we don't create a ACK message and then stop waiting for it.
case message.ACKTimeout < 1:
2022-12-29 21:49:47 +00:00
err = func() error {
err := natsConn.PublishMsg(msg)
if err != nil {
er := fmt.Errorf("error: nats publish for message with subject failed: %v", err)
log.Printf("%v\n", er)
return ErrACKSubscribeRetry
}
p.metrics.promNatsDeliveredTotal.Inc()
2022-12-29 21:49:47 +00:00
//err = natsConn.Flush()
//if err != nil {
// er := fmt.Errorf("error: nats publish flush failed: %v", err)
// log.Printf("%v\n", er)
// return
//}
2022-06-21 08:40:32 +00:00
2022-12-29 21:49:47 +00:00
// The remaining logic is for handling ACK messages, so we return here
// since it was a NACK message, and all or now done.
return nil
2022-12-29 21:49:47 +00:00
}()
case message.ACKTimeout >= 1:
2022-12-29 21:49:47 +00:00
// The function below will return nil if the message should not be retried.
2022-06-17 07:54:20 +00:00
//
2022-12-29 21:49:47 +00:00
// All other errors happening will return ErrACKSubscribeRetry which will lead
// to a 'continue' for the for loop when checking the error directly after this
// function is called
err = func() error {
defer func() { retryAttempts++ }()
if retryAttempts > message.Retries {
// max retries reached
er := fmt.Errorf("info: toNode: %v, fromNode: %v, subject: %v, methodArgs: %v: max retries reached, check if node is up and running and if it got a subscriber started for the given REQ type", message.ToNode, message.FromNode, msg.Subject, message.MethodArgs)
// We do not want to send errorLogs for REQErrorLog type since
// it will just cause an endless loop.
if message.Method != REQErrorLog {
p.errorKernel.infoSend(p, message, er)
}
p.metrics.promNatsMessagesFailedACKsTotal.Inc()
return nil
2022-06-18 06:12:14 +00:00
}
2021-03-03 13:14:32 +00:00
2022-12-29 21:49:47 +00:00
er := fmt.Errorf("send attempt:%v, max retries: %v, ack timeout: %v, message.ID: %v, method: %v, toNode: %v", retryAttempts, message.Retries, message.ACKTimeout, message.ID, message.Method, message.ToNode)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-17 07:54:20 +00:00
2022-12-29 21:49:47 +00:00
// The SubscribeSync used in the subscriber, will get messages that
// are sent after it started subscribing.
//
// Create a subscriber for the ACK reply message.
subReply, err := natsConn.SubscribeSync(msg.Reply)
defer func() {
err := subReply.Unsubscribe()
if err != nil {
log.Printf("error: nats SubscribeSync: failed when unsubscribing for ACK: %v\n", err)
}
}()
if err != nil {
er := fmt.Errorf("error: nats SubscribeSync failed: failed to create reply message for subject: %v, error: %v", msg.Reply, err)
// sendErrorLogMessage(p.toRingbufferCh, node(p.node), er)
log.Printf("%v, waiting equal to RetryWait %ds before retrying\n", er, message.RetryWait)
2022-06-17 07:54:20 +00:00
time.Sleep(time.Second * time.Duration(message.RetryWait))
2022-06-17 07:54:20 +00:00
2022-12-29 21:49:47 +00:00
return ErrACKSubscribeRetry
}
2021-03-03 13:14:32 +00:00
2022-12-29 21:49:47 +00:00
// Publish message
err = natsConn.PublishMsg(msg)
if err != nil {
er := fmt.Errorf("error: nats publish failed: %v", err)
// sendErrorLogMessage(p.toRingbufferCh, node(p.node), er)
log.Printf("%v, waiting equal to RetryWait of %ds before retrying\n", er, message.RetryWait)
time.Sleep(time.Second * time.Duration(message.RetryWait))
return ErrACKSubscribeRetry
}
2021-03-03 13:14:32 +00:00
2022-12-29 21:49:47 +00:00
// Wait up until ACKTimeout specified for a reply,
// continue and resend if no reply received,
// or exit if max retries for the message reached.
//
// The nats.Msg returned is discarded with '_' since
// we don't use it.
_, err = subReply.NextMsg(time.Second * time.Duration(message.ACKTimeout))
if err != nil {
2022-12-29 21:49:47 +00:00
switch {
case err == nats.ErrNoResponders || err == nats.ErrTimeout:
er := fmt.Errorf("error: ack receive failed: waiting for %v seconds before retrying: subject=%v: %v", message.RetryWait, p.subject.name(), err)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-17 07:54:20 +00:00
2022-12-29 21:49:47 +00:00
time.Sleep(time.Second * time.Duration(message.RetryWait))
p.metrics.promNatsMessagesMissedACKsTotal.Inc()
2022-06-17 07:54:20 +00:00
return ErrACKSubscribeRetry
2022-12-29 21:49:47 +00:00
case err == nats.ErrBadSubscription || err == nats.ErrConnectionClosed:
er := fmt.Errorf("error: ack receive failed: conneciton closed or bad subscription, will not retry message: subject=%v: %v", p.subject.name(), err)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-12-29 21:49:47 +00:00
return er
2022-12-29 21:49:47 +00:00
default:
er := fmt.Errorf("error: ack receive failed: the error was not defined, check if nats client have been updated with new error values, and update steward to handle the new error type: subject=%v: %v", p.subject.name(), err)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-12-29 21:49:47 +00:00
return er
}
2021-08-16 11:01:12 +00:00
2022-12-29 21:49:47 +00:00
}
return nil
}()
}
2022-06-17 07:54:20 +00:00
if err == ErrACKSubscribeRetry {
continue
}
if err != nil {
// All error printing are handled within the function that returns
// the error, so we do nothing and return.
// No more trying to deliver the message
return
}
2022-06-17 07:54:20 +00:00
// Message were delivered successfully.
2022-04-01 07:21:50 +00:00
p.metrics.promNatsDeliveredTotal.Inc()
er = fmt.Errorf("info: sent nats message with subject %v, id: %v", msg.Subject, message.ID)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2021-03-03 13:14:32 +00:00
return
}
}
2021-12-28 11:05:09 +00:00
// messageSubscriberHandler will deserialize the message when a new message is
2021-03-03 13:14:32 +00:00
// received, check the MessageType field in the message to decide what
// kind of message it is and then it will check how to handle that message type,
// and then call the correct method handler for it.
//
// This handler function should be started in it's own go routine,so
// one individual handler is started per message received so we can keep
// the state of the message being processed, and then reply back to the
// correct sending process's reply, meaning so we ACK back to the correct
// publisher.
func (p process) messageSubscriberHandler(natsConn *nats.Conn, thisNode string, msg *nats.Msg, subject string) {
2021-12-28 11:05:09 +00:00
// Variable to hold a copy of the message data, so we don't mess with
// the original data since the original is a pointer value.
msgData := make([]byte, len(msg.Data))
copy(msgData, msg.Data)
// fmt.Printf(" * DEBUG: header value on subscriberHandler: %v\n", msg.Header)
// If debugging is enabled, print the source node name of the nats messages received.
if val, ok := msg.Header["fromNode"]; ok {
er := fmt.Errorf("info: nats message received from %v, with subject %v ", val, subject)
2022-04-01 06:43:14 +00:00
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
2021-12-28 11:05:09 +00:00
// If compression is used, decompress it to get the gob data. If
// compression is not used it is the gob encoded data we already
// got in msgData so we do nothing with it.
if val, ok := msg.Header["cmp"]; ok {
switch val[0] {
case "z":
zr, err := zstd.NewReader(nil)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: zstd NewReader failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
2021-12-28 11:05:09 +00:00
return
}
msgData, err = zr.DecodeAll(msg.Data, nil)
if err != nil {
er := fmt.Errorf("error: zstd decoding failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
2022-01-03 11:30:28 +00:00
zr.Close()
2021-12-28 11:05:09 +00:00
return
}
2022-01-03 11:30:28 +00:00
zr.Close()
case "g":
r := bytes.NewReader(msgData)
gr, err := gzip.NewReader(r)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: gzip NewReader failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
2022-01-03 11:30:28 +00:00
return
}
b, err := io.ReadAll(gr)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: gzip ReadAll failed: %v", err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, Message{}, er)
2022-01-03 11:30:28 +00:00
return
}
gr.Close()
msgData = b
2021-12-28 11:05:09 +00:00
}
}
2021-03-03 13:14:32 +00:00
message := Message{}
2021-12-29 07:29:11 +00:00
// Check if serialization is specified.
2022-02-18 05:17:06 +00:00
// Will default to gob serialization if nothing or non existing value is specified.
2021-12-29 07:11:43 +00:00
if val, ok := msg.Header["serial"]; ok {
// fmt.Printf(" * DEBUG: ok = %v, map = %v, len of val = %v\n", ok, msg.Header, len(val))
switch val[0] {
case "cbor":
err := cbor.Unmarshal(msgData, &message)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: cbor decoding failed, subject: %v, header: %v, error: %v", subject, msg.Header, err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, message, er)
2021-12-29 07:11:43 +00:00
return
}
default: // Deaults to gob if no match was found.
2021-12-29 07:29:11 +00:00
r := bytes.NewReader(msgData)
gobDec := gob.NewDecoder(r)
2021-12-29 07:11:43 +00:00
err := gobDec.Decode(&message)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: gob decoding failed, subject: %v, header: %v, error: %v", subject, msg.Header, err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, message, er)
2021-12-29 07:11:43 +00:00
return
}
}
} else {
// Default to gob if serialization flag was not specified.
2021-12-29 07:29:11 +00:00
r := bytes.NewReader(msgData)
gobDec := gob.NewDecoder(r)
2021-12-29 07:11:43 +00:00
err := gobDec.Decode(&message)
if err != nil {
2022-02-18 05:17:06 +00:00
er := fmt.Errorf("error: gob decoding failed, subject: %v, header: %v, error: %v", subject, msg.Header, err)
2022-04-01 06:43:14 +00:00
p.errorKernel.errSend(p, message, er)
2021-12-29 07:11:43 +00:00
return
}
2021-03-03 13:14:32 +00:00
}
2021-08-16 11:01:12 +00:00
// Check if it is an ACK or NACK message, and do the appropriate action accordingly.
2021-12-30 05:28:21 +00:00
//
// With ACK messages Steward will keep the state of the message delivery, and try to
// resend the message if an ACK is not received within the timeout/retries specified
// in the message.
// When a process sends an ACK message, it will stop and wait for the nats-reply message
// for the time specified in the replyTimeout value. If no reply message is received
// within the given timeout the publishing process will try to resend the message for
// number of times specified in the retries field of the Steward message.
// When receiving a Steward-message with ACK enabled we send a message back the the
// node where the message originated using the msg.Reply subject field of the nats-message.
//
// With NACK messages we do not send a nats reply message, so the message will only be
// sent from the publisher once, and if it is not delivered it will not be retried.
2021-03-03 13:14:32 +00:00
switch {
2021-12-30 05:28:21 +00:00
// Check for ACK type Event.
case message.ACKTimeout >= 1:
er := fmt.Errorf("subscriberHandler: received ACK message: %v, from: %v, id:%v", message.Method, message.FromNode, message.ID)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-11 04:30:58 +00:00
// When spawning sub processes we can directly assign handlers to the process upon
// creation. We here check if a handler is already assigned, and if it is nil, we
// lookup and find the correct handler to use if available.
if p.handler == nil {
// Look up the method handler for the specified method.
mh, ok := p.methodsAvailable.CheckIfExists(message.Method)
p.handler = mh.handler
if !ok {
er := fmt.Errorf("error: subscriberHandler: no such method type: %v", p.subject.Event)
p.errorKernel.errSend(p, message, er)
}
2021-03-03 13:14:32 +00:00
}
//var err error
2021-03-03 13:14:32 +00:00
2022-06-20 09:17:23 +00:00
_ = p.callHandler(message, thisNode)
// Send a confirmation message back to the publisher to ACK that the
// message was received by the subscriber. The reply should be sent
// no matter if the handler was executed successfully or not
2022-06-20 09:17:23 +00:00
natsConn.Publish(msg.Reply, []byte{})
case message.ACKTimeout < 1:
er := fmt.Errorf("subscriberHandler: received NACK message: %v, from: %v, id:%v", message.Method, message.FromNode, message.ID)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-11 04:30:58 +00:00
// When spawning sub processes we can directly assign handlers to the process upon
// creation. We here check if a handler is already assigned, and if it is nil, we
// lookup and find the correct handler to use if available.
if p.handler == nil {
// Look up the method handler for the specified method.
mh, ok := p.methodsAvailable.CheckIfExists(message.Method)
p.handler = mh.handler
if !ok {
er := fmt.Errorf("error: subscriberHandler: no such method type: %v", p.subject.Event)
p.errorKernel.errSend(p, message, er)
}
}
// We do not send reply messages for EventNACL, so we can discard the output.
2022-06-11 04:30:58 +00:00
_ = p.callHandler(message, thisNode)
default:
er := fmt.Errorf("info: did not find that specific type of event: %#v", p.subject.Event)
p.errorKernel.infoSend(p, message, er)
}
}
// callHandler will call the handler for the Request type defined in the message.
// If checking signatures and/or acl's are enabled the signatures they will be
// verified, and if OK the handler is called.
2022-06-11 04:30:58 +00:00
func (p process) callHandler(message Message, thisNode string) []byte {
2022-06-20 09:17:23 +00:00
//out := []byte{}
// Call the handler if ACL/signature checking returns true.
// If the handler is to be called in a scheduled manner, we we take care of that too.
go func() {
switch p.verifySigOrAclFlag(message) {
case true:
executeHandler(p, message, thisNode)
case false:
// ACL/Signature checking failed.
er := fmt.Errorf("error: subscriberHandler: ACL were verified not-OK, doing nothing")
p.errorKernel.errSend(p, message, er)
log.Printf("%v\n", er)
}
}()
return []byte{}
}
// executeHandler will call the handler for the Request type defined in the message.
func executeHandler(p process, message Message, thisNode string) {
var err error
2022-06-20 09:17:23 +00:00
// Check if it is a message to run scheduled.
var interval int
var totalTime int
var runAsScheduled bool
switch {
case len(message.Schedule) < 2:
// Not at scheduled message,
case len(message.Schedule) == 2:
interval = message.Schedule[0]
totalTime = message.Schedule[1]
fallthrough
case interval > 0 && totalTime > 0:
runAsScheduled = true
}
2022-12-29 21:49:47 +00:00
if p.configuration.EnableAclCheck {
// Either ACL were verified OK, or ACL/Signature check was not enabled, so we call the handler.
er := fmt.Errorf("info: subscriberHandler: Either ACL were verified OK, or ACL/Signature check was not enabled, so we call the handler: %v", true)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
2022-06-20 09:17:23 +00:00
switch {
case !runAsScheduled:
2022-06-20 09:17:23 +00:00
go func() {
_, err = p.handler(p, message, thisNode)
if err != nil {
er := fmt.Errorf("error: subscriberHandler: handler method failed: %v", err)
p.errorKernel.errSend(p, message, er)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
}()
2022-06-20 09:17:23 +00:00
case runAsScheduled:
// Create two tickers to use for the scheduling.
intervalTicker := time.NewTicker(time.Second * time.Duration(interval))
totalTimeTicker := time.NewTicker(time.Second * time.Duration(totalTime))
2022-12-26 09:52:43 +00:00
defer intervalTicker.Stop()
defer totalTimeTicker.Stop()
// NB: Commented out this assignement of a specific message context
// to be used within handlers, since it will override the structure
// we have today. Keeping the code for a bit incase it makes sense
// to implement later.
//ctx, cancel := context.WithCancel(p.ctx)
//message.ctx = ctx
// Run the handler once, so we don't have to wait for the first ticker.
go func() {
_, err := p.handler(p, message, thisNode)
if err != nil {
er := fmt.Errorf("error: subscriberHandler: handler method failed: %v", err)
p.errorKernel.errSend(p, message, er)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
}()
for {
select {
case <-p.ctx.Done():
er := fmt.Errorf("info: subscriberHandler: proc ctx done: toNode=%v, fromNode=%v, method=%v, methodArgs=%v", message.ToNode, message.FromNode, message.Method, message.MethodArgs)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-20 09:17:23 +00:00
//cancel()
return
case <-totalTimeTicker.C:
// Total time reached. End the process.
//cancel()
er := fmt.Errorf("info: subscriberHandler: schedule totalTime done: toNode=%v, fromNode=%v, method=%v, methodArgs=%v", message.ToNode, message.FromNode, message.Method, message.MethodArgs)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
2022-06-20 09:17:23 +00:00
return
case <-intervalTicker.C:
go func() {
_, err := p.handler(p, message, thisNode)
if err != nil {
er := fmt.Errorf("error: subscriberHandler: handler method failed: %v", err)
p.errorKernel.errSend(p, message, er)
p.errorKernel.logConsoleOnlyIfDebug(er, p.configuration)
}
}()
2022-06-20 09:17:23 +00:00
}
}
}
}
// verifySigOrAclFlag will do signature and/or acl checking based on which of
// those features are enabled, and then call the handler.
// The handler will also be called if neither signature or acl checking is enabled
// since it is up to the subscriber to decide if it want to use the auth features
// or not.
func (p process) verifySigOrAclFlag(message Message) bool {
doHandler := false
switch {
// If no checking enabled we should just allow the message.
case !p.nodeAuth.configuration.EnableSignatureCheck && !p.nodeAuth.configuration.EnableAclCheck:
2022-06-20 09:17:23 +00:00
//log.Printf(" * DEBUG: verify acl/sig: no acl or signature checking at all is enabled, ALLOW the message, method=%v\n", message.Method)
doHandler = true
2021-03-03 13:14:32 +00:00
// If only sig check enabled, and sig OK, we should allow the message.
case p.nodeAuth.configuration.EnableSignatureCheck && !p.nodeAuth.configuration.EnableAclCheck:
sigOK := p.nodeAuth.verifySignature(message)
log.Printf(" * DEBUG: verify acl/sig: Only signature checking enabled, ALLOW the message if sigOK, sigOK=%v, method %v\n", sigOK, message.Method)
if sigOK {
doHandler = true
2021-03-03 13:14:32 +00:00
}
// If both sig and acl check enabled, and sig and acl OK, we should allow the message.
case p.nodeAuth.configuration.EnableSignatureCheck && p.nodeAuth.configuration.EnableAclCheck:
2022-05-27 10:17:15 +00:00
sigOK := p.nodeAuth.verifySignature(message)
aclOK := p.nodeAuth.verifyAcl(message)
log.Printf(" * DEBUG: verify acl/sig:both signature and acl checking enabled, allow the message if sigOK and aclOK, or method is not REQCliCommand, sigOK=%v, aclOK=%v, method=%v\n", sigOK, aclOK, message.Method)
2022-05-27 10:17:15 +00:00
if sigOK && aclOK {
doHandler = true
2021-03-03 13:14:32 +00:00
}
2021-08-16 11:01:12 +00:00
// none of the verification options matched, we should keep the default value
// of doHandler=false, so the handler is not done.
2021-03-03 13:14:32 +00:00
default:
log.Printf(" * DEBUG: verify acl/sig: None of the verify flags matched, not doing handler for message, method=%v\n", message.Method)
2021-03-03 13:14:32 +00:00
}
return doHandler
2021-03-03 13:14:32 +00:00
}
2021-08-16 11:01:12 +00:00
// SubscribeMessage will register the Nats callback function for the specified
// nats subject. This allows us to receive Nats messages for a given subject
// on a node.
2021-04-08 10:51:54 +00:00
func (p process) subscribeMessages() *nats.Subscription {
2021-03-03 13:14:32 +00:00
subject := string(p.subject.name())
2022-04-07 19:43:00 +00:00
// natsSubscription, err := p.natsConn.Subscribe(subject, func(msg *nats.Msg) {
natsSubscription, err := p.natsConn.QueueSubscribe(subject, subject, func(msg *nats.Msg) {
2021-04-08 10:51:54 +00:00
//_, err := p.natsConn.Subscribe(subject, func(msg *nats.Msg) {
2021-03-25 13:30:39 +00:00
2021-08-16 11:01:12 +00:00
// Start up the subscriber handler.
go p.messageSubscriberHandler(p.natsConn, p.configuration.NodeName, msg, subject)
2021-03-03 13:14:32 +00:00
})
if err != nil {
log.Printf("error: Subscribe failed: %v\n", err)
2021-04-08 10:51:54 +00:00
return nil
2021-03-03 13:14:32 +00:00
}
2021-04-08 10:51:54 +00:00
return natsSubscription
2021-03-03 13:14:32 +00:00
}
2021-03-09 06:43:55 +00:00
// publishMessages will do the publishing of messages for one single
// process. The function should be run as a goroutine, and will run
// as long as the process it belongs to is running.
2021-04-08 10:51:54 +00:00
func (p process) publishMessages(natsConn *nats.Conn) {
var once sync.Once
2022-01-03 10:36:05 +00:00
var zEnc *zstd.Encoder
// Prepare a zstd encoder if enabled. By enabling it here before
// looping over the messages to send below, we can reuse the zstd
// encoder for all messages.
switch p.configuration.Compression {
case "z": // zstd
2022-01-03 12:00:23 +00:00
// enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBestCompression))
enc, err := zstd.NewWriter(nil, zstd.WithEncoderConcurrency(1))
2022-01-03 10:36:05 +00:00
if err != nil {
log.Printf("error: zstd new encoder failed: %v\n", err)
os.Exit(1)
}
zEnc = enc
defer zEnc.Close()
}
2022-01-03 09:40:27 +00:00
// Loop and handle 1 message at a time. If some part of the code
// fails in the loop we should throw an error and use `continue`
// to jump back here to the beginning of the loop and continue
// with the next message.
// Adding a timer that will be used for when to remove the sub process
// publisher. The timer is reset each time a message is published with
// the process, so the sub process publisher will not be removed until
// it have not received any messages for the given amount of time.
ticker := time.NewTicker(time.Second * time.Duration(p.configuration.KeepPublishersAliveFor))
2022-12-26 09:52:43 +00:00
defer ticker.Stop()
2021-03-03 13:14:32 +00:00
for {
2021-04-07 16:05:07 +00:00
// Wait and read the next message on the message channel, or
// exit this function if Cancel are received via ctx.
select {
case <-ticker.C:
// We only want to remove subprocesses
if p.isSubProcess {
p.processes.active.mu.Lock()
p.ctxCancel()
delete(p.processes.active.procNames, p.processName)
p.processes.active.mu.Unlock()
er := fmt.Errorf("info: canceled publisher: %v", p.processName)
//sendErrorLogMessage(p.toRingbufferCh, Node(p.node), er)
log.Printf("%v\n", er)
return
}
case m := <-p.subject.messageCh:
ticker.Reset(time.Second * time.Duration(p.configuration.KeepPublishersAliveFor))
2022-02-03 06:04:10 +00:00
// Sign the methodArgs, and add the signature to the message.
m.ArgSignature = p.addMethodArgSignature(m)
2022-02-11 08:04:14 +00:00
// fmt.Printf(" * DEBUG: add signature, fromNode: %v, method: %v, len of signature: %v\n", m.FromNode, m.Method, len(m.ArgSignature))
2022-02-03 06:04:10 +00:00
go p.publishAMessage(m, zEnc, once, natsConn)
2021-04-07 16:05:07 +00:00
case <-p.ctx.Done():
er := fmt.Errorf("info: canceling publisher: %v", p.processName)
2021-07-02 16:32:01 +00:00
//sendErrorLogMessage(p.toRingbufferCh, Node(p.node), er)
log.Printf("%v\n", er)
2021-04-07 16:05:07 +00:00
return
}
}
}
2022-02-03 06:04:10 +00:00
func (p process) addMethodArgSignature(m Message) []byte {
2022-02-04 05:24:34 +00:00
argsString := argsToString(m.MethodArgs)
2022-04-21 11:21:36 +00:00
sign := ed25519.Sign(p.nodeAuth.SignPrivateKey, []byte(argsString))
2022-02-03 06:04:10 +00:00
return sign
}
func (p process) publishAMessage(m Message, zEnc *zstd.Encoder, once sync.Once, natsConn *nats.Conn) {
// Create the initial header, and set values below depending on the
// various configuration options chosen.
2022-02-17 07:14:56 +00:00
natsMsgHeader := make(nats.Header)
2022-02-18 04:22:53 +00:00
natsMsgHeader["fromNode"] = []string{string(p.node)}
2021-12-29 05:40:42 +00:00
// The serialized value of the nats message payload
var natsMsgPayloadSerialized []byte
// encode the message structure into gob binary format before putting
// it into a nats message.
// Prepare a gob encoder with a buffer before we start the loop
switch p.configuration.Serialization {
case "cbor":
b, err := cbor.Marshal(m)
if err != nil {
er := fmt.Errorf("error: messageDeliverNats: cbor encode message failed: %v", err)
log.Printf("%v\n", er)
return
}
2021-12-29 06:28:09 +00:00
natsMsgPayloadSerialized = b
natsMsgHeader["serial"] = []string{p.configuration.Serialization}
2021-12-29 06:28:09 +00:00
default:
var bufGob bytes.Buffer
gobEnc := gob.NewEncoder(&bufGob)
err := gobEnc.Encode(m)
if err != nil {
er := fmt.Errorf("error: messageDeliverNats: gob encode message failed: %v", err)
log.Printf("%v\n", er)
return
}
natsMsgPayloadSerialized = bufGob.Bytes()
2022-02-17 07:14:56 +00:00
natsMsgHeader["serial"] = []string{"gob"}
}
2022-01-03 11:30:28 +00:00
// Get the process name so we can look up the process in the
// processes map, and increment the message counter.
pn := processNameGet(p.subject.name(), processKindPublisher)
// NB: REMOVED: It doesn't really make sense to get the message id
// from the process. Implemented so this is picked up from the id
// used in the ringbuffer.
// m.ID = p.messageID
// The compressed value of the nats message payload. The content
// can either be compressed or in it's original form depening on
// the outcome of the switch below, and if compression were chosen
// or not.
var natsMsgPayloadCompressed []byte
// Compress the data payload if selected with configuration flag.
// The compression chosen is later set in the nats msg header when
// calling p.messageDeliverNats below.
switch p.configuration.Compression {
case "z": // zstd
natsMsgPayloadCompressed = zEnc.EncodeAll(natsMsgPayloadSerialized, nil)
natsMsgHeader["cmp"] = []string{p.configuration.Compression}
// p.zEncMutex.Lock()
// zEnc.Reset(nil)
// p.zEncMutex.Unlock()
case "g": // gzip
var buf bytes.Buffer
2022-06-18 06:12:14 +00:00
func() {
gzipW := gzip.NewWriter(&buf)
defer gzipW.Close()
defer gzipW.Flush()
_, err := gzipW.Write(natsMsgPayloadSerialized)
if err != nil {
log.Printf("error: failed to write gzip: %v\n", err)
return
}
}()
natsMsgPayloadCompressed = buf.Bytes()
natsMsgHeader["cmp"] = []string{p.configuration.Compression}
2022-02-17 07:14:56 +00:00
case "": // no compression
natsMsgPayloadCompressed = natsMsgPayloadSerialized
2022-02-17 07:14:56 +00:00
natsMsgHeader["cmp"] = []string{"none"}
2021-03-09 06:43:55 +00:00
default: // no compression
// Allways log the error to console.
er := fmt.Errorf("error: publishing: compression type not defined, setting default to no compression")
log.Printf("%v\n", er)
2021-03-03 13:14:32 +00:00
// We only wan't to send the error message to errorCentral once.
once.Do(func() {
log.Printf("%v\n", er)
})
2021-10-08 10:07:10 +00:00
2022-02-17 07:14:56 +00:00
// No compression, so we just assign the value of the serialized
// data directly to the variable used with messageDeliverNats.
natsMsgPayloadCompressed = natsMsgPayloadSerialized
2022-02-17 07:14:56 +00:00
natsMsgHeader["cmp"] = []string{"none"}
}
2021-03-12 08:38:19 +00:00
// Create the Nats message with headers and payload, and do the
// sending of the message.
p.messageDeliverNats(natsMsgPayloadCompressed, natsMsgHeader, natsConn, m)
// select {
// case m.done <- struct{}{}:
// // Signaling back to the ringbuffer that we are done with the
// // current message, and it can remove it from the ringbuffer.
// case <-p.ctx.Done():
// return
// }
// Increment the counter for the next message to be sent.
p.messageID++
{
p.processes.active.mu.Lock()
p.processes.active.procNames[pn] = p
p.processes.active.mu.Unlock()
2021-03-03 13:14:32 +00:00
}
// // Handle the error.
// //
// // NOTE: None of the processes above generate an error, so the the
// // if clause will never be triggered. But keeping it here as an example
// // for now for how to handle errors.
// if err != nil {
// // Create an error type which also creates a channel which the
// // errorKernel will send back the action about what to do.
// ep := errorEvent{
// //errorType: logOnly,
// process: p,
// message: m,
// errorActionCh: make(chan errorAction),
// }
// p.errorCh <- ep
//
// // Wait for the response action back from the error kernel, and
// // decide what to do. Should we continue, quit, or .... ?
// switch <-ep.errorActionCh {
// case errActionContinue:
// // Just log and continue
// log.Printf("The errAction was continue...so we're continuing\n")
// case errActionKill:
// log.Printf("The errAction was kill...so we're killing\n")
// // ....
// default:
// log.Printf("Info: publishMessages: The errAction was not defined, so we're doing nothing\n")
// }
// }
2021-03-03 13:14:32 +00:00
}