2021-03-03 13:14:32 +00:00
package steward
import (
"bytes"
2021-04-07 16:05:07 +00:00
"context"
2021-03-03 13:14:32 +00:00
"encoding/gob"
"fmt"
"log"
"time"
"github.com/nats-io/nats.go"
2021-04-12 13:35:20 +00:00
"github.com/prometheus/client_golang/prometheus"
2021-03-03 13:14:32 +00:00
)
// processKind is either processKindSubscriber or processKindPublisher.
// It is used to decide what kind of process to spawn, and to tell
// which kind of process is stored in the process map.
type processKind string

const (
	processKindSubscriber = processKind("subscriber")
	processKindPublisher  = processKind("publisher")
)
// process are represent the communication to one individual host
type process struct {
messageID int
// the subject used for the specific process. One process
// can contain only one sender on a message bus, hence
// also one subject
subject Subject
// Put a node here to be able know the node a process is at.
// NB: Might not be needed later on.
node node
// The processID for the current process
processID int
2021-03-12 08:38:19 +00:00
// errorCh is the same channel the errorKernel uses to
// read incomming errors. By having this channel available
// within a process we can send errors to the error kernel,
// the EK desided what to do, and sends the action about
// what to do back to the process where the error came from.
2021-03-03 13:14:32 +00:00
errorCh chan errProcess
processKind processKind
// Who are we allowed to receive from ?
allowedReceivers map [ node ] struct { }
2021-03-03 14:44:32 +00:00
// methodsAvailable
methodsAvailable MethodsAvailable
2021-03-04 15:27:55 +00:00
// Helper or service function that can do some kind of work
// for the process.
// The idea is that this can hold for example the map of the
// the hello nodes to limit shared resources in the system as
// a whole for sharing a map from the *server level.
2021-03-10 13:14:09 +00:00
procFunc procFunc
2021-03-04 15:27:55 +00:00
// The channel to send a messages to the procFunc go routine.
// This is typically used within the methodHandler.
procFuncCh chan Message
2021-03-08 13:09:14 +00:00
// copy of the configuration from server
configuration * Configuration
2021-03-09 03:55:51 +00:00
// The new messages channel copied from *Server
2021-03-29 11:36:30 +00:00
toRingbufferCh chan <- [ ] subjectAndMessage
2021-03-31 06:56:13 +00:00
// The structure who holds all processes information
processes * processes
2021-04-07 14:45:51 +00:00
// nats connection
natsConn * nats . Conn
2021-04-08 05:07:13 +00:00
// natsSubscription returned when calling natsConn.Subscribe
2021-04-08 10:51:54 +00:00
natsSubscription * nats . Subscription
2021-04-07 16:05:07 +00:00
// context
ctx context . Context
// context cancelFunc
ctxCancel context . CancelFunc
2021-04-08 05:07:13 +00:00
// Process name
processName processName
2021-04-09 09:30:40 +00:00
// startup holds the startup functions for starting up publisher
// or subscriber processes
startup startup
2021-03-03 13:14:32 +00:00
}
// prepareNewProcess will set the the provided values and the default
// values for a process.
2021-04-07 14:45:51 +00:00
func newProcess ( natsConn * nats . Conn , processes * processes , toRingbufferCh chan <- [ ] subjectAndMessage , configuration * Configuration , subject Subject , errCh chan errProcess , processKind processKind , allowedReceivers [ ] node , procFunc func ( ) error ) process {
2021-03-03 13:14:32 +00:00
// create the initial configuration for a sessions communicating with 1 host process.
2021-03-04 05:53:03 +00:00
processes . lastProcessID ++
2021-03-03 13:14:32 +00:00
// make the slice of allowedReceivers into a map value for easy lookup.
m := make ( map [ node ] struct { } )
for _ , a := range allowedReceivers {
m [ a ] = struct { } { }
}
2021-04-07 16:05:07 +00:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
2021-03-03 14:44:32 +00:00
var method Method
2021-03-03 13:14:32 +00:00
proc := process {
messageID : 0 ,
subject : subject ,
2021-04-16 10:38:48 +00:00
node : node ( configuration . NodeName ) ,
2021-03-04 05:53:03 +00:00
processID : processes . lastProcessID ,
2021-03-03 13:14:32 +00:00
errorCh : errCh ,
processKind : processKind ,
allowedReceivers : m ,
2021-03-03 14:44:32 +00:00
methodsAvailable : method . GetMethodsAvailable ( ) ,
2021-03-29 11:36:30 +00:00
toRingbufferCh : toRingbufferCh ,
2021-03-09 03:55:51 +00:00
configuration : configuration ,
2021-03-31 06:56:13 +00:00
processes : processes ,
2021-04-07 14:45:51 +00:00
natsConn : natsConn ,
2021-04-07 16:05:07 +00:00
ctx : ctx ,
ctxCancel : cancel ,
2021-03-03 13:14:32 +00:00
}
return proc
}
2021-03-10 13:14:09 +00:00
// procFunc is a helper function that does some extra work for the
// messages received by a process. It lets us ACK back to the publisher
// that a message was received, while the procFunc keeps on working.
//
// It can also be used to wrap in other types coming from the outside
// which we want to work with. An example is the handling of metrics,
// which the message has no notion of, but which a procFunc can have
// wrapped in from when it was constructed.
//
// The function is given the context of the process it belongs to.
type procFunc func(ctx context.Context) error
2021-03-10 13:14:09 +00:00
2021-03-03 13:14:32 +00:00
// The purpose of this function is to check if we should start a
// publisher or subscriber process, where a process is a go routine
// that will handle either sending or receiving messages on one
// subject.
//
// It will give the process the next available ID, and also add the
// process to the processes map in the server structure.
2021-04-07 14:45:51 +00:00
func ( p process ) spawnWorker ( procs * processes , natsConn * nats . Conn ) {
2021-03-03 13:14:32 +00:00
// We use the full name of the subject to identify a unique
// process. We can do that since a process can only handle
// one message queue.
var pn processName
if p . processKind == processKindPublisher {
pn = processNameGet ( p . subject . name ( ) , processKindPublisher )
}
if p . processKind == processKindSubscriber {
pn = processNameGet ( p . subject . name ( ) , processKindSubscriber )
}
2021-04-12 13:35:20 +00:00
processName := processNameGet ( p . subject . name ( ) , p . processKind )
2021-06-08 04:02:08 +00:00
// Add prometheus metrics for the process.
2021-04-12 13:35:20 +00:00
p . processes . promProcessesVec . With ( prometheus . Labels { "processName" : string ( processName ) } )
2021-03-03 13:14:32 +00:00
// Start a publisher worker, which will start a go routine (process)
// That will take care of all the messages for the subject it owns.
if p . processKind == processKindPublisher {
2021-03-25 13:30:39 +00:00
2021-03-09 06:43:55 +00:00
// If there is a procFunc for the process, start it.
if p . procFunc != nil {
// Start the procFunc in it's own anonymous func so we are able
// to get the return error.
go func ( ) {
2021-04-07 16:54:08 +00:00
err := p . procFunc ( p . ctx )
2021-03-09 06:43:55 +00:00
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: spawnWorker: procFunc failed: %v" , err )
2021-03-29 11:36:30 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( p . node ) , er )
2021-03-09 06:43:55 +00:00
}
} ( )
}
2021-04-08 10:51:54 +00:00
go p . publishMessages ( natsConn )
2021-03-03 13:14:32 +00:00
}
// Start a subscriber worker, which will start a go routine (process)
// That will take care of all the messages for the subject it owns.
if p . processKind == processKindSubscriber {
2021-03-04 15:27:55 +00:00
// If there is a procFunc for the process, start it.
if p . procFunc != nil {
2021-03-25 13:30:39 +00:00
2021-03-04 15:27:55 +00:00
// Start the procFunc in it's own anonymous func so we are able
// to get the return error.
go func ( ) {
2021-04-07 16:54:08 +00:00
err := p . procFunc ( p . ctx )
2021-03-04 15:27:55 +00:00
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: spawnWorker: procFunc failed: %v" , err )
2021-03-29 11:36:30 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( p . node ) , er )
2021-03-04 15:27:55 +00:00
}
} ( )
}
2021-03-05 12:10:46 +00:00
2021-04-08 10:51:54 +00:00
p . natsSubscription = p . subscribeMessages ( )
2021-03-03 13:14:32 +00:00
}
2021-04-08 10:51:54 +00:00
p . processName = pn
// Add information about the new process to the started processes map.
procs . mu . Lock ( )
procs . active [ pn ] = p
procs . mu . Unlock ( )
2021-03-03 13:14:32 +00:00
}
// messageDeliverNats will take care of the delivering the message
// as converted to gob format as a nats.Message. It will also take
// care of checking timeouts and retries specified for the message.
2021-03-09 06:43:55 +00:00
func ( p process ) messageDeliverNats ( natsConn * nats . Conn , message Message ) {
2021-03-03 13:14:32 +00:00
retryAttempts := 0
2021-04-16 11:18:10 +00:00
const publishTimer time . Duration = 5
const subscribeSyncTimer time . Duration = 5
2021-03-03 13:14:32 +00:00
for {
dataPayload , err := gobEncodeMessage ( message )
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: createDataPayload: %v" , err )
2021-03-29 11:36:30 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( p . node ) , er )
2021-03-12 08:38:19 +00:00
continue
2021-03-03 13:14:32 +00:00
}
msg := & nats . Msg {
2021-03-09 06:43:55 +00:00
Subject : string ( p . subject . name ( ) ) ,
2021-04-04 05:33:18 +00:00
// Subject: fmt.Sprintf("%s.%s.%s", proc.node, "command", "CLICommandRequest"),
2021-03-03 13:14:32 +00:00
// Structure of the reply message are:
2021-06-08 02:45:01 +00:00
// <nodename>.<message type>.<method>.reply
Reply : fmt . Sprintf ( "%s.reply" , p . subject . name ( ) ) ,
2021-03-03 13:14:32 +00:00
Data : dataPayload ,
}
// The SubscribeSync used in the subscriber, will get messages that
2021-04-16 11:18:10 +00:00
// are sent after it started subscribing.
2021-03-03 13:14:32 +00:00
//
// Create a subscriber for the reply message.
2021-03-09 06:43:55 +00:00
subReply , err := natsConn . SubscribeSync ( msg . Reply )
2021-03-03 13:14:32 +00:00
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: nc.SubscribeSync failed: failed to create reply message: %v" , err )
2021-04-16 11:18:10 +00:00
// sendErrorLogMessage(p.toRingbufferCh, node(p.node), er)
2021-04-16 11:43:58 +00:00
log . Printf ( "%v, waiting %ds before retrying\n" , er , subscribeSyncTimer )
2021-04-16 11:18:10 +00:00
time . Sleep ( time . Second * subscribeSyncTimer )
2021-03-03 13:14:32 +00:00
continue
}
// Publish message
2021-03-09 06:43:55 +00:00
err = natsConn . PublishMsg ( msg )
2021-03-03 13:14:32 +00:00
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: publish failed: %v" , err )
2021-04-16 11:18:10 +00:00
// sendErrorLogMessage(p.toRingbufferCh, node(p.node), er)
2021-04-16 11:43:58 +00:00
log . Printf ( "%v, waiting %ds before retrying\n" , er , publishTimer )
2021-04-16 11:18:10 +00:00
time . Sleep ( time . Second * publishTimer )
2021-03-03 13:14:32 +00:00
continue
}
// If the message is an ACK type of message we must check that a
// reply, and if it is not we don't wait here at all.
2021-03-25 13:30:39 +00:00
// fmt.Printf("info: messageDeliverNats: preparing to send message: %v\n", message)
2021-03-09 06:43:55 +00:00
if p . subject . CommandOrEvent == CommandACK || p . subject . CommandOrEvent == EventACK {
2021-04-15 08:33:44 +00:00
// Wait up until ACKTimeout specified for a reply,
2021-03-03 13:14:32 +00:00
// continue and resend if noo reply received,
// or exit if max retries for the message reached.
2021-04-15 08:33:44 +00:00
msgReply , err := subReply . NextMsg ( time . Second * time . Duration ( message . ACKTimeout ) )
2021-03-03 13:14:32 +00:00
if err != nil {
2021-03-26 06:55:42 +00:00
er := fmt . Errorf ( "error: subReply.NextMsg failed for node=%v, subject=%v: %v" , p . node , p . subject . name ( ) , err )
2021-04-16 10:38:48 +00:00
// sendErrorLogMessage(p.toRingbufferCh, p.node, er)
log . Printf ( " ** %v\n" , er )
2021-03-03 13:14:32 +00:00
// did not receive a reply, decide what to do..
retryAttempts ++
2021-04-16 11:43:58 +00:00
log . Printf ( "Retry attempts:%v, retries: %v, ACKTimeout: %v\n" , retryAttempts , message . Retries , message . ACKTimeout )
2021-03-03 13:14:32 +00:00
switch {
case message . Retries == 0 :
// 0 indicates unlimited retries
continue
case retryAttempts >= message . Retries :
// max retries reached
2021-06-08 01:11:22 +00:00
er := fmt . Errorf ( "info: toNode: %v, fromNode: %v, method: %v: max retries reached, check if node is up and running and if it got a subscriber for the given REQ type" , message . ToNode , message . FromNode , message . Method )
2021-04-16 10:38:48 +00:00
sendErrorLogMessage ( p . toRingbufferCh , p . node , er )
2021-03-03 13:14:32 +00:00
return
2021-03-26 06:55:42 +00:00
2021-03-03 13:14:32 +00:00
default :
// none of the above matched, so we've not reached max retries yet
continue
}
}
2021-03-11 05:34:36 +00:00
log . Printf ( "<--- publisher: received ACK from:%v, for: %v, data: %s\n" , message . ToNode , message . Method , msgReply . Data )
2021-03-03 13:14:32 +00:00
}
return
}
}
// subscriberHandler will deserialize the message when a new message is
// received, check the MessageType field in the message to decide what
// kind of message it is and then it will check how to handle that message type,
// and then call the correct method handler for it.
//
// This handler function should be started in it's own go routine,so
// one individual handler is started per message received so we can keep
// the state of the message being processed, and then reply back to the
// correct sending process's reply, meaning so we ACK back to the correct
// publisher.
2021-04-07 14:45:51 +00:00
func ( p process ) subscriberHandler ( natsConn * nats . Conn , thisNode string , msg * nats . Msg ) {
2021-03-03 13:14:32 +00:00
message := Message { }
// Create a buffer to decode the gob encoded binary data back
// to it's original structure.
buf := bytes . NewBuffer ( msg . Data )
gobDec := gob . NewDecoder ( buf )
err := gobDec . Decode ( & message )
if err != nil {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: gob decoding failed: %v" , err )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
switch {
case p . subject . CommandOrEvent == CommandACK || p . subject . CommandOrEvent == EventACK :
2021-03-09 06:43:55 +00:00
mh , ok := p . methodsAvailable . CheckIfExists ( message . Method )
2021-03-03 13:14:32 +00:00
if ! ok {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: subscriberHandler: method type not available: %v" , p . subject . CommandOrEvent )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
out := [ ] byte ( "not allowed from " + message . FromNode )
2021-03-12 08:38:19 +00:00
//var err error
2021-03-03 13:14:32 +00:00
// Check if we are allowed to receive from that host
_ , arOK1 := p . allowedReceivers [ message . FromNode ]
_ , arOK2 := p . allowedReceivers [ "*" ]
if arOK1 || arOK2 {
// Start the method handler for that specific subject type.
// The handler started here is what actually doing the action
// that executed a CLI command, or writes to a log file on
// the node who received the message.
2021-03-09 06:43:55 +00:00
out , err = mh . handler ( p , message , thisNode )
2021-03-03 13:14:32 +00:00
if err != nil {
2021-04-13 11:25:44 +00:00
er := fmt . Errorf ( "error: subscriberHandler: handler method failed: %v" , err )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
} else {
2021-03-25 12:39:59 +00:00
er := fmt . Errorf ( "info: we don't allow receiving from: %v, %v" , message . FromNode , p . subject )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
// Send a confirmation message back to the publisher
natsConn . Publish ( msg . Reply , out )
case p . subject . CommandOrEvent == CommandNACK || p . subject . CommandOrEvent == EventNACK :
2021-03-03 14:44:32 +00:00
mf , ok := p . methodsAvailable . CheckIfExists ( message . Method )
2021-03-03 13:14:32 +00:00
if ! ok {
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "error: subscriberHandler: method type not available: %v" , p . subject . CommandOrEvent )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
2021-03-25 12:39:59 +00:00
// Check if we are allowed to receive from that host
_ , arOK1 := p . allowedReceivers [ message . FromNode ]
_ , arOK2 := p . allowedReceivers [ "*" ]
2021-03-03 13:14:32 +00:00
2021-03-25 12:39:59 +00:00
if arOK1 || arOK2 {
2021-03-25 13:30:39 +00:00
2021-03-25 12:39:59 +00:00
// Start the method handler for that specific subject type.
// The handler started here is what actually doing the action
// that executed a CLI command, or writes to a log file on
// the node who received the message.
//
// since we don't send a reply for a NACK message, we don't care about the
// out return when calling mf.handler
_ , err := mf . handler ( p , message , thisNode )
if err != nil {
2021-04-13 11:25:44 +00:00
er := fmt . Errorf ( "error: subscriberHandler: handler method failed: %v" , err )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-25 12:39:59 +00:00
}
} else {
er := fmt . Errorf ( "info: we don't allow receiving from: %v, %v" , message . FromNode , p . subject )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-03 13:14:32 +00:00
}
2021-03-25 12:39:59 +00:00
// ---
2021-03-03 13:14:32 +00:00
default :
2021-03-12 08:38:19 +00:00
er := fmt . Errorf ( "info: did not find that specific type of command: %#v" , p . subject . CommandOrEvent )
2021-04-07 14:45:51 +00:00
sendErrorLogMessage ( p . toRingbufferCh , node ( thisNode ) , er )
2021-03-12 08:38:19 +00:00
2021-03-03 13:14:32 +00:00
}
}
// Subscribe will start up a Go routine under the hood calling the
// callback function specified when a new message is received.
2021-04-08 10:51:54 +00:00
func ( p process ) subscribeMessages ( ) * nats . Subscription {
2021-03-03 13:14:32 +00:00
subject := string ( p . subject . name ( ) )
2021-04-08 10:51:54 +00:00
natsSubscription , err := p . natsConn . Subscribe ( subject , func ( msg * nats . Msg ) {
//_, err := p.natsConn.Subscribe(subject, func(msg *nats.Msg) {
2021-03-25 13:30:39 +00:00
2021-03-03 13:14:32 +00:00
// We start one handler per message received by using go routines here.
// This is for being able to reply back the current publisher who sent
// the message.
2021-04-07 14:45:51 +00:00
go p . subscriberHandler ( p . natsConn , p . configuration . NodeName , msg )
2021-03-03 13:14:32 +00:00
} )
if err != nil {
log . Printf ( "error: Subscribe failed: %v\n" , err )
2021-04-08 10:51:54 +00:00
return nil
2021-03-03 13:14:32 +00:00
}
2021-04-08 10:51:54 +00:00
return natsSubscription
2021-03-03 13:14:32 +00:00
}
2021-03-09 06:43:55 +00:00
// publishMessages will do the publishing of messages for one single
// process.
2021-04-08 10:51:54 +00:00
func ( p process ) publishMessages ( natsConn * nats . Conn ) {
2021-03-03 13:14:32 +00:00
for {
2021-03-12 08:38:19 +00:00
var err error
2021-04-07 16:05:07 +00:00
var m Message
// Wait and read the next message on the message channel, or
// exit this function if Cancel are received via ctx.
select {
case m = <- p . subject . messageCh :
case <- p . ctx . Done ( ) :
er := fmt . Errorf ( "info: canceling publisher: %v" , p . subject . name ( ) )
sendErrorLogMessage ( p . toRingbufferCh , node ( p . node ) , er )
return
}
2021-03-09 06:43:55 +00:00
// Get the process name so we can look up the process in the
// processes map, and increment the message counter.
2021-03-03 13:14:32 +00:00
pn := processNameGet ( p . subject . name ( ) , processKindPublisher )
2021-04-08 10:51:54 +00:00
m . ID = p . messageID
2021-03-09 06:43:55 +00:00
p . messageDeliverNats ( natsConn , m )
// Signaling back to the ringbuffer that we are done with the
// current message, and it can remove it from the ringbuffer.
2021-03-03 13:14:32 +00:00
m . done <- struct { } { }
// Increment the counter for the next message to be sent.
p . messageID ++
2021-04-08 10:51:54 +00:00
p . processes . mu . Lock ( )
p . processes . active [ pn ] = p
p . processes . mu . Unlock ( )
2021-03-12 08:38:19 +00:00
// Handle the error.
2021-03-04 15:27:55 +00:00
//
2021-03-12 08:38:19 +00:00
// NOTE: None of the processes above generate an error, so the the
// if clause will never be triggered. But keeping it here as an example
// for now for how to handle errors.
if err != nil {
// Create an error type which also creates a channel which the
// errorKernel will send back the action about what to do.
ep := errProcess {
infoText : "process failed" ,
process : p ,
message : m ,
errorActionCh : make ( chan errorAction ) ,
}
p . errorCh <- ep
// Wait for the response action back from the error kernel, and
// decide what to do. Should we continue, quit, or .... ?
switch <- ep . errorActionCh {
case errActionContinue :
// Just log and continue
log . Printf ( "The errAction was continue...so we're continuing\n" )
case errActionKill :
log . Printf ( "The errAction was kill...so we're killing\n" )
// ....
default :
log . Printf ( "Info: publishMessages: The errAction was not defined, so we're doing nothing\n" )
}
}
2021-03-03 13:14:32 +00:00
}
}