// Notes:
package steward
import (
	"context"
	"fmt"
	"log"
	"net"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/nats-io/nats.go"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)
// processName is the name of a process. Values are built by processNameGet
// as "<subjectName>_<processKind>" and are used as the key of the
// processes.active map.
type processName string
2021-03-09 03:55:51 +00:00
// Will return a process name made up of subjectName+processKind
2021-02-26 06:55:28 +00:00
func processNameGet ( sn subjectName , pk processKind ) processName {
pn := fmt . Sprintf ( "%s_%s" , sn , pk )
return processName ( pn )
}
2021-03-03 14:44:32 +00:00
// processes holds all the information about running processes
type processes struct {
// The active spawned processes
2021-06-08 11:56:31 +00:00
active map [ processName ] map [ int ] process
2021-03-03 14:44:32 +00:00
// mutex to lock the map
2021-03-09 06:43:55 +00:00
mu sync . RWMutex
2021-03-03 14:44:32 +00:00
// The last processID created
lastProcessID int
2021-04-12 08:51:26 +00:00
//
promTotalProcesses prometheus . Gauge
2021-04-12 13:35:20 +00:00
//
promProcessesVec * prometheus . GaugeVec
2021-03-03 14:44:32 +00:00
}
2021-08-09 03:50:08 +00:00
// newProcesses will prepare and return a *processes which
// is map containing all the currently running processes.
2021-04-12 08:51:26 +00:00
func newProcesses ( promRegistry * prometheus . Registry ) * processes {
2021-03-03 14:44:32 +00:00
p := processes {
2021-06-08 11:56:31 +00:00
active : make ( map [ processName ] map [ int ] process ) ,
2021-03-03 14:44:32 +00:00
}
2021-04-12 08:51:26 +00:00
p . promTotalProcesses = promauto . NewGauge ( prometheus . GaugeOpts {
Name : "total_running_processes" ,
Help : "The current number of total running processes" ,
} )
2021-04-12 13:35:20 +00:00
p . promProcessesVec = promauto . NewGaugeVec ( prometheus . GaugeOpts {
Name : "running_process" ,
Help : "Name of the running process" ,
} , [ ] string { "processName" } ,
)
2021-03-03 14:44:32 +00:00
return & p
}
2021-01-28 10:17:54 +00:00
// server is the structure that will hold the state about spawned
// processes on a local instance.
2021-01-27 13:02:57 +00:00
type server struct {
2021-07-02 09:26:52 +00:00
// The main background context
ctx context . Context
// The CancelFunc for the main context
ctxCancelFunc context . CancelFunc
2021-03-01 19:49:43 +00:00
// Configuration options used for running the server
configuration * Configuration
// The nats connection to the broker
2021-01-27 13:02:57 +00:00
natsConn * nats . Conn
2021-07-02 06:38:44 +00:00
// net listener for communicating via the steward socket
StewardSockListener net . Listener
// net listener for the communication with Stew
StewSockListener net . Listener
2021-03-03 14:44:32 +00:00
// processes holds all the information about running processes
processes * processes
2021-02-04 12:26:10 +00:00
// The name of the node
nodeName string
2021-02-24 09:58:02 +00:00
// Mutex for locking when writing to the process map
2021-03-29 11:36:30 +00:00
toRingbufferCh chan [ ] subjectAndMessage
2021-02-24 09:58:02 +00:00
// errorKernel is doing all the error handling like what to do if
// an error occurs.
2021-02-05 12:56:42 +00:00
errorKernel * errorKernel
2021-02-18 11:29:14 +00:00
// metric exporter
metrics * metrics
2021-01-27 13:02:57 +00:00
}
2021-01-28 10:17:54 +00:00
// newServer will prepare and return a server type
2021-03-01 19:49:43 +00:00
func NewServer ( c * Configuration ) ( * server , error ) {
2021-07-02 09:26:52 +00:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
2021-04-19 19:06:37 +00:00
var opt nats . Option
if c . RootCAPath != "" {
opt = nats . RootCAs ( c . RootCAPath )
}
2021-05-20 10:27:25 +00:00
if c . NkeySeedFile != "" {
var err error
// fh, err := os.Open(c.NkeySeedFile)
// if err != nil {
// return nil, fmt.Errorf("error: failed to open nkey seed file: %v\n", err)
// }
// b, err := io.ReadAll(fh)
// if err != nil {
// return nil, fmt.Errorf("error: failed to read nkey seed file: %v\n", err)
// }
opt , err = nats . NkeyOptionFromSeed ( c . NkeySeedFile )
if err != nil {
2021-07-02 09:26:52 +00:00
cancel ( )
2021-05-20 10:27:25 +00:00
return nil , fmt . Errorf ( "error: failed to read nkey seed file: %v" , err )
}
}
2021-05-25 08:23:27 +00:00
// Connect to the nats server, and retry until succesful.
var conn * nats . Conn
const connRetryWait = 5
for {
var err error
// Setting MaxReconnects to -1 which equals unlimited.
conn , err = nats . Connect ( c . BrokerAddress , opt , nats . MaxReconnects ( - 1 ) )
// Nats use string types for errors, so we need to check the content of the error.
// If no servers where available, we loop and retry until succesful.
if err != nil {
if strings . Contains ( err . Error ( ) , "no servers available" ) {
log . Printf ( "error: could not connect, waiting 5 seconds, and retrying: %v\n" , err )
time . Sleep ( time . Duration ( time . Second * connRetryWait ) )
continue
}
er := fmt . Errorf ( "error: nats.Connect failed: %v" , err )
2021-07-02 09:26:52 +00:00
cancel ( )
2021-05-25 08:23:27 +00:00
return nil , er
}
2021-02-01 10:13:38 +00:00
2021-05-25 08:23:27 +00:00
break
}
2021-07-02 06:38:44 +00:00
// Prepare the connection to the Steward socket file
2021-05-12 07:50:03 +00:00
// Check if socket folder exists, if not create it
if _ , err := os . Stat ( c . SocketFolder ) ; os . IsNotExist ( err ) {
err := os . MkdirAll ( c . SocketFolder , 0700 )
if err != nil {
2021-07-02 09:26:52 +00:00
cancel ( )
2021-05-20 10:27:25 +00:00
return nil , fmt . Errorf ( "error: failed to create socket folder directory %v: %v" , c . SocketFolder , err )
2021-05-12 07:50:03 +00:00
}
}
socketFilepath := filepath . Join ( c . SocketFolder , "steward.sock" )
if _ , err := os . Stat ( socketFilepath ) ; ! os . IsNotExist ( err ) {
err = os . Remove ( socketFilepath )
if err != nil {
er := fmt . Errorf ( "error: could not delete sock file: %v" , err )
2021-07-02 09:26:52 +00:00
cancel ( )
2021-05-12 07:50:03 +00:00
return nil , er
}
2021-03-30 08:37:16 +00:00
}
2021-05-12 07:50:03 +00:00
nl , err := net . Listen ( "unix" , socketFilepath )
2021-03-30 08:37:16 +00:00
if err != nil {
2021-04-16 11:18:10 +00:00
er := fmt . Errorf ( "error: failed to open socket: %v" , err )
2021-07-02 09:26:52 +00:00
cancel ( )
2021-04-16 11:18:10 +00:00
return nil , er
2021-03-30 08:37:16 +00:00
}
2021-07-02 06:38:44 +00:00
// ---
// Prepare the connection to the Stew socket file
// Check if socket folder exists, if not create it
if _ , err := os . Stat ( c . SocketFolder ) ; os . IsNotExist ( err ) {
err := os . MkdirAll ( c . SocketFolder , 0700 )
if err != nil {
2021-07-02 09:26:52 +00:00
cancel ( )
2021-07-02 06:38:44 +00:00
return nil , fmt . Errorf ( "error: failed to create socket folder directory %v: %v" , c . SocketFolder , err )
}
}
stewSocketFilepath := filepath . Join ( c . SocketFolder , "stew.sock" )
if _ , err := os . Stat ( stewSocketFilepath ) ; ! os . IsNotExist ( err ) {
err = os . Remove ( stewSocketFilepath )
if err != nil {
er := fmt . Errorf ( "error: could not delete stew.sock file: %v" , err )
2021-07-02 09:26:52 +00:00
cancel ( )
2021-07-02 06:38:44 +00:00
return nil , er
}
}
stewNL , err := net . Listen ( "unix" , stewSocketFilepath )
if err != nil {
er := fmt . Errorf ( "error: failed to open stew socket: %v" , err )
2021-07-02 09:26:52 +00:00
cancel ( )
2021-07-02 06:38:44 +00:00
return nil , er
}
// ---
2021-03-31 06:56:13 +00:00
metrics := newMetrics ( c . PromHostAndPort )
2021-02-01 12:41:04 +00:00
s := & server {
2021-07-02 09:26:52 +00:00
ctx : ctx ,
ctxCancelFunc : cancel ,
2021-07-02 06:38:44 +00:00
configuration : c ,
nodeName : c . NodeName ,
natsConn : conn ,
StewardSockListener : nl ,
StewSockListener : stewNL ,
processes : newProcesses ( metrics . promRegistry ) ,
toRingbufferCh : make ( chan [ ] subjectAndMessage ) ,
metrics : metrics ,
2021-02-01 12:41:04 +00:00
}
2021-01-29 05:09:48 +00:00
2021-03-02 12:46:02 +00:00
// Create the default data folder for where subscribers should
2021-03-25 11:50:58 +00:00
// write it's data, check if data folder exist, and create it if needed.
2021-03-02 12:46:02 +00:00
if _ , err := os . Stat ( c . SubscribersDataFolder ) ; os . IsNotExist ( err ) {
if c . SubscribersDataFolder == "" {
return nil , fmt . Errorf ( "error: subscribersDataFolder value is empty, you need to provide the config or the flag value at startup %v: %v" , c . SubscribersDataFolder , err )
}
err := os . Mkdir ( c . SubscribersDataFolder , 0700 )
if err != nil {
2021-05-20 10:27:25 +00:00
return nil , fmt . Errorf ( "error: failed to create data folder directory %v: %v" , c . SubscribersDataFolder , err )
2021-03-02 12:46:02 +00:00
}
log . Printf ( "info: Creating subscribers data folder at %v\n" , c . SubscribersDataFolder )
}
2021-02-05 06:25:12 +00:00
return s , nil
}
2021-02-24 09:58:02 +00:00
// Start will spawn up all the predefined subscriber processes.
2021-02-10 06:25:44 +00:00
// Spawning of publisher processes is done on the fly by checking
2021-02-24 09:58:02 +00:00
// if there is publisher process for a given message subject, and
// not exist it will spawn one.
2021-02-10 04:11:48 +00:00
func ( s * server ) Start ( ) {
2021-08-03 13:05:34 +00:00
// Stop main server context last when exits.
2021-08-03 11:43:05 +00:00
defer func ( ) {
s . ctxCancelFunc ( )
log . Printf ( "info: stopping the main server context with ctxCancelFunc()\n" )
} ( )
2021-08-04 08:37:24 +00:00
2021-02-19 10:07:09 +00:00
// Start the error kernel that will do all the error handling
2021-08-03 11:57:29 +00:00
// that is not done within a process.
2021-08-04 08:37:24 +00:00
s . errorKernel = newErrorKernel ( s . ctx )
2021-08-03 11:43:05 +00:00
2021-08-04 08:37:24 +00:00
go func ( ) {
err := s . errorKernel . start ( s . toRingbufferCh )
if err != nil {
log . Printf ( "%v\n" , err )
}
} ( )
defer s . errorKernel . stop ( )
2021-02-19 10:07:09 +00:00
2021-02-18 11:29:14 +00:00
// Start collecting the metrics
2021-08-03 11:57:29 +00:00
go func ( ) {
2021-08-04 08:37:24 +00:00
err := s . metrics . start ( )
2021-08-03 11:57:29 +00:00
if err != nil {
log . Printf ( "%v\n" , err )
os . Exit ( 1 )
}
} ( )
2021-02-18 11:29:14 +00:00
2021-03-29 11:36:30 +00:00
// Start the checking the input socket for new messages from operator.
go s . readSocket ( s . toRingbufferCh )
2021-02-26 08:02:53 +00:00
2021-07-02 06:38:44 +00:00
// Delete the socket file when the program exits.
defer func ( ) {
socketFilepath := filepath . Join ( s . configuration . SocketFolder , "steward.sock" )
if _ , err := os . Stat ( socketFilepath ) ; ! os . IsNotExist ( err ) {
err = os . Remove ( socketFilepath )
if err != nil {
er := fmt . Errorf ( "error: could not delete sock file: %v" , err )
log . Printf ( "%v\n" , er )
}
}
} ( )
2021-04-09 09:30:40 +00:00
// Start up the predefined subscribers. Since all the logic to handle
// processes are tied to the process struct, we need to create an
// initial process to start the rest.
2021-08-03 11:43:05 +00:00
{
ctx , cancel := context . WithCancel ( s . ctx )
sub := newSubject ( REQInitial , s . nodeName )
p := newProcess ( ctx , s . natsConn , s . processes , s . toRingbufferCh , s . configuration , sub , s . errorKernel . errorCh , "" , [ ] Node { } , nil )
p . ProcessesStart ( ctx )
time . Sleep ( time . Second * 1 )
s . processes . printProcessesMap ( )
2021-02-18 13:27:53 +00:00
2021-08-03 11:43:05 +00:00
// Start the processing of new messages from an input channel.
s . routeMessagesToProcess ( "./incomingBuffer.db" , s . toRingbufferCh )
2021-02-10 06:25:44 +00:00
2021-08-03 11:43:05 +00:00
defer cancel ( )
}
2021-02-05 09:47:07 +00:00
2021-07-02 06:38:44 +00:00
// Set up channel on which to send signal notifications.
// We must use a buffered channel or risk missing the signal
// if we're not ready to receive when the signal is sent.
sigCh := make ( chan os . Signal , 1 )
signal . Notify ( sigCh , os . Interrupt )
2021-08-03 13:05:34 +00:00
//Block and wait for CTRL+C
2021-07-02 06:38:44 +00:00
sig := <- sigCh
fmt . Printf ( "Got exit signal, terminating all processes, %v\n" , sig )
2021-07-02 11:26:30 +00:00
// Adding a safety function here so we can make sure that all processes
2021-08-03 13:05:34 +00:00
// are stopped after a given time if the context cancelation hangs.
2021-08-03 10:30:18 +00:00
go func ( ) {
2021-08-03 11:57:29 +00:00
time . Sleep ( time . Second * 10 )
2021-07-02 11:26:30 +00:00
log . Printf ( "error: doing a non graceful shutdown of all processes..\n" )
os . Exit ( 1 )
} ( )
2021-02-05 09:47:07 +00:00
}
2021-03-31 06:56:13 +00:00
func ( p * processes ) printProcessesMap ( ) {
2021-02-10 06:25:44 +00:00
fmt . Println ( "--------------------------------------------------------------------------------------------" )
2021-04-16 11:43:58 +00:00
log . Printf ( "*** Output of processes map :\n" )
2021-03-31 06:56:13 +00:00
p . mu . Lock ( )
2021-06-08 11:56:31 +00:00
for _ , vSub := range p . active {
for _ , vID := range vSub {
log . Printf ( "* proc - : %v, id: %v, name: %v, allowed from: %v\n" , vID . processKind , vID . processID , vID . subject . name ( ) , vID . allowedReceivers )
}
2021-02-10 06:25:44 +00:00
}
2021-03-31 06:56:13 +00:00
p . mu . Unlock ( )
2021-02-19 15:58:16 +00:00
2021-04-12 08:51:26 +00:00
p . promTotalProcesses . Set ( float64 ( len ( p . active ) ) )
2021-02-19 15:58:16 +00:00
2021-02-10 06:25:44 +00:00
fmt . Println ( "--------------------------------------------------------------------------------------------" )
}
2021-02-25 10:08:05 +00:00
// sendErrorMessage will put the error message directly on the channel that is
// read by the nats publishing functions.
2021-06-29 06:21:42 +00:00
func sendErrorLogMessage ( newMessagesCh chan <- [ ] subjectAndMessage , FromNode Node , theError error ) {
2021-03-26 04:13:51 +00:00
// NB: Adding log statement here for more visuality during development.
log . Printf ( "%v\n" , theError )
2021-02-25 10:08:05 +00:00
sam := createErrorMsgContent ( FromNode , theError )
newMessagesCh <- [ ] subjectAndMessage { sam }
}
2021-02-24 14:43:31 +00:00
2021-02-25 10:08:05 +00:00
// createErrorMsgContent will prepare a subject and message with the content
// of the error
2021-06-29 06:21:42 +00:00
func createErrorMsgContent ( FromNode Node , theError error ) subjectAndMessage {
2021-04-06 07:06:26 +00:00
// Add time stamp
er := fmt . Sprintf ( "%v, %v\n" , time . Now ( ) . UTC ( ) , theError . Error ( ) )
2021-02-24 14:43:31 +00:00
sam := subjectAndMessage {
2021-04-06 05:56:49 +00:00
Subject : newSubject ( REQErrorLog , "errorCentral" ) ,
2021-02-24 14:43:31 +00:00
Message : Message {
2021-04-09 09:30:40 +00:00
Directory : "errorLog" ,
ToNode : "errorCentral" ,
FromNode : FromNode ,
FileExtension : ".log" ,
Data : [ ] string { er } ,
Method : REQErrorLog ,
2021-02-24 14:43:31 +00:00
} ,
}
2021-02-25 10:08:05 +00:00
return sam
2021-02-24 14:43:31 +00:00
}
2021-03-09 06:43:55 +00:00
2021-07-05 05:43:33 +00:00
// Contains the sam value as it is used in the state DB, and also a
// delivered function to be called when this message is picked up, so
// we can control if messages gets stale at some point.
2021-07-02 17:09:42 +00:00
type samDBValueAndDelivered struct {
2021-07-02 16:32:01 +00:00
samDBValue samDBValue
2021-07-05 05:43:33 +00:00
delivered func ( )
2021-07-02 16:32:01 +00:00
}
2021-03-10 06:11:14 +00:00
// routeMessagesToProcess takes a database name and an input channel as
2021-03-09 06:43:55 +00:00
// it's input arguments.
// The database will be used as the persistent store for the work queue
// which is implemented as a ring buffer.
// The input channel are where we read new messages to publish.
// Incomming messages will be routed to the correct subject process, where
// the handling of each nats subject is handled within it's own separate
// worker process.
// It will also handle the process of spawning more worker processes
// for publisher subjects if it does not exist.
2021-03-10 06:11:14 +00:00
func ( s * server ) routeMessagesToProcess ( dbFileName string , newSAM chan [ ] subjectAndMessage ) {
2021-03-09 06:43:55 +00:00
// Prepare and start a new ring buffer
const bufferSize int = 1000
2021-06-29 06:21:42 +00:00
rb := newringBuffer ( * s . configuration , bufferSize , dbFileName , Node ( s . nodeName ) , s . toRingbufferCh )
2021-03-09 06:43:55 +00:00
inCh := make ( chan subjectAndMessage )
2021-07-02 17:09:42 +00:00
ringBufferOutCh := make ( chan samDBValueAndDelivered )
2021-03-09 06:43:55 +00:00
// start the ringbuffer.
rb . start ( inCh , ringBufferOutCh , s . configuration . DefaultMessageTimeout , s . configuration . DefaultMessageRetries )
// Start reading new fresh messages received on the incomming message
// pipe/file requested, and fill them into the buffer.
go func ( ) {
for samSlice := range newSAM {
for _ , sam := range samSlice {
inCh <- sam
}
}
close ( inCh )
} ( )
// Process the messages that are in the ring buffer. Check and
// send if there are a specific subject for it, and if no subject
// exist throw an error.
var coe CommandOrEvent
coeAvailable := coe . GetCommandOrEventAvailable ( )
var method Method
methodsAvailable := method . GetMethodsAvailable ( )
go func ( ) {
for samTmp := range ringBufferOutCh {
2021-07-05 05:43:33 +00:00
samTmp . delivered ( )
2021-07-02 17:09:42 +00:00
sam := samTmp . samDBValue . Data
2021-03-09 06:43:55 +00:00
// Check if the format of the message is correct.
if _ , ok := methodsAvailable . CheckIfExists ( sam . Message . Method ) ; ! ok {
2021-03-12 11:08:11 +00:00
er := fmt . Errorf ( "error: routeMessagesToProcess: the method do not exist, message dropped: %v" , sam . Message . Method )
2021-06-29 06:21:42 +00:00
sendErrorLogMessage ( s . toRingbufferCh , Node ( s . nodeName ) , er )
2021-03-09 06:43:55 +00:00
continue
}
if ! coeAvailable . CheckIfExists ( sam . Subject . CommandOrEvent , sam . Subject ) {
2021-03-12 11:08:11 +00:00
er := fmt . Errorf ( "error: routeMessagesToProcess: the command or event do not exist, message dropped: %v" , sam . Message . Method )
2021-06-29 06:21:42 +00:00
sendErrorLogMessage ( s . toRingbufferCh , Node ( s . nodeName ) , er )
2021-03-12 11:08:11 +00:00
2021-03-09 06:43:55 +00:00
continue
}
redo :
// Adding a label here so we are able to redo the sending
// of the last message if a process with specified subject
// is not present. The process will then be created, and
// the code will loop back to the redo: label.
m := sam . Message
subjName := sam . Subject . name ( )
// DEBUG: fmt.Printf("** handleNewOperatorMessages: message: %v, ** subject: %#v\n", m, sam.Subject)
pn := processNameGet ( subjName , processKindPublisher )
2021-03-09 10:58:50 +00:00
2021-06-08 11:56:31 +00:00
// Check if there is a map of type map[int]process registered
// for the processName, and if it exists then return it.
2021-03-09 10:58:50 +00:00
s . processes . mu . Lock ( )
2021-06-08 11:56:31 +00:00
existingProcIDMap , ok := s . processes . active [ pn ]
2021-03-09 10:58:50 +00:00
s . processes . mu . Unlock ( )
2021-03-09 06:43:55 +00:00
2021-06-08 11:56:31 +00:00
// If found a map above, range it, and are there already a process
// for that subject, put the message on that processes incomming
// message channel.
2021-03-09 06:43:55 +00:00
if ok {
2021-06-08 11:56:31 +00:00
s . processes . mu . Lock ( )
for _ , existingProc := range existingProcIDMap {
log . Printf ( "info: processNewMessages: found the specific subject: %v\n" , subjName )
existingProc . subject . messageCh <- m
}
s . processes . mu . Unlock ( )
2021-03-09 06:43:55 +00:00
// If no process to handle the specific subject exist,
// the we create and spawn one.
} else {
// If a publisher process do not exist for the given subject, create it, and
// by using the goto at the end redo the process for this specific message.
log . Printf ( "info: processNewMessages: did not find that specific subject, starting new process for subject: %v\n" , subjName )
2021-04-03 05:33:03 +00:00
sub := newSubject ( sam . Subject . Method , sam . Subject . ToNode )
2021-07-02 09:26:52 +00:00
proc := newProcess ( s . ctx , s . natsConn , s . processes , s . toRingbufferCh , s . configuration , sub , s . errorKernel . errorCh , processKindPublisher , nil , nil )
2021-03-09 06:43:55 +00:00
// fmt.Printf("*** %#v\n", proc)
2021-04-07 14:45:51 +00:00
proc . spawnWorker ( s . processes , s . natsConn )
2021-03-09 06:43:55 +00:00
// Now when the process is spawned we jump back to the redo: label,
// and send the message to that new process.
goto redo
}
}
} ( )
}