1
0
Fork 0
mirror of https://github.com/postmannen/ctrl.git synced 2024-12-14 12:37:31 +00:00
ctrl/server.go

466 lines
15 KiB
Go

// Notes:
package steward
import (
"context"
"fmt"
"log"
"net"
"os"
"os/signal"
"path/filepath"
"strings"
"sync"
"time"
"github.com/nats-io/nats.go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
type processName string
// Will return a process name made up of subjectName+processKind
func processNameGet(sn subjectName, pk processKind) processName {
pn := fmt.Sprintf("%s_%s", sn, pk)
return processName(pn)
}
// processes holds all the information about running processes
type processes struct {
// The active spawned processes
active map[processName]map[int]process
// mutex to lock the map
mu sync.RWMutex
// The last processID created
lastProcessID int
//
promTotalProcesses prometheus.Gauge
//
promProcessesVec *prometheus.GaugeVec
}
// newProcesses will prepare and return a *processes
func newProcesses(promRegistry *prometheus.Registry) *processes {
p := processes{
active: make(map[processName]map[int]process),
}
p.promTotalProcesses = promauto.NewGauge(prometheus.GaugeOpts{
Name: "total_running_processes",
Help: "The current number of total running processes",
})
p.promProcessesVec = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "running_process",
Help: "Name of the running process",
}, []string{"processName"},
)
return &p
}
// server is the structure that will hold the state about spawned
// processes on a local instance.
type server struct {
// The main background context
ctx context.Context
// The CancelFunc for the main context
ctxCancelFunc context.CancelFunc
// Configuration options used for running the server
configuration *Configuration
// The nats connection to the broker
natsConn *nats.Conn
// net listener for communicating via the steward socket
StewardSockListener net.Listener
// net listener for the communication with Stew
StewSockListener net.Listener
// processes holds all the information about running processes
processes *processes
// The name of the node
nodeName string
// Mutex for locking when writing to the process map
toRingbufferCh chan []subjectAndMessage
// errorKernel is doing all the error handling like what to do if
// an error occurs.
errorKernel *errorKernel
// metric exporter
metrics *metrics
}
// newServer will prepare and return a server type
func NewServer(c *Configuration) (*server, error) {
ctx, cancel := context.WithCancel(context.Background())
var opt nats.Option
if c.RootCAPath != "" {
opt = nats.RootCAs(c.RootCAPath)
}
if c.NkeySeedFile != "" {
var err error
// fh, err := os.Open(c.NkeySeedFile)
// if err != nil {
// return nil, fmt.Errorf("error: failed to open nkey seed file: %v\n", err)
// }
// b, err := io.ReadAll(fh)
// if err != nil {
// return nil, fmt.Errorf("error: failed to read nkey seed file: %v\n", err)
// }
opt, err = nats.NkeyOptionFromSeed(c.NkeySeedFile)
if err != nil {
cancel()
return nil, fmt.Errorf("error: failed to read nkey seed file: %v", err)
}
}
// Connect to the nats server, and retry until succesful.
var conn *nats.Conn
const connRetryWait = 5
for {
var err error
// Setting MaxReconnects to -1 which equals unlimited.
conn, err = nats.Connect(c.BrokerAddress, opt, nats.MaxReconnects(-1))
// Nats use string types for errors, so we need to check the content of the error.
// If no servers where available, we loop and retry until succesful.
if err != nil {
if strings.Contains(err.Error(), "no servers available") {
log.Printf("error: could not connect, waiting 5 seconds, and retrying: %v\n", err)
time.Sleep(time.Duration(time.Second * connRetryWait))
continue
}
er := fmt.Errorf("error: nats.Connect failed: %v", err)
cancel()
return nil, er
}
break
}
// Prepare the connection to the Steward socket file
// Check if socket folder exists, if not create it
if _, err := os.Stat(c.SocketFolder); os.IsNotExist(err) {
err := os.MkdirAll(c.SocketFolder, 0700)
if err != nil {
cancel()
return nil, fmt.Errorf("error: failed to create socket folder directory %v: %v", c.SocketFolder, err)
}
}
socketFilepath := filepath.Join(c.SocketFolder, "steward.sock")
if _, err := os.Stat(socketFilepath); !os.IsNotExist(err) {
err = os.Remove(socketFilepath)
if err != nil {
er := fmt.Errorf("error: could not delete sock file: %v", err)
cancel()
return nil, er
}
}
nl, err := net.Listen("unix", socketFilepath)
if err != nil {
er := fmt.Errorf("error: failed to open socket: %v", err)
cancel()
return nil, er
}
// ---
// Prepare the connection to the Stew socket file
// Check if socket folder exists, if not create it
if _, err := os.Stat(c.SocketFolder); os.IsNotExist(err) {
err := os.MkdirAll(c.SocketFolder, 0700)
if err != nil {
cancel()
return nil, fmt.Errorf("error: failed to create socket folder directory %v: %v", c.SocketFolder, err)
}
}
stewSocketFilepath := filepath.Join(c.SocketFolder, "stew.sock")
if _, err := os.Stat(stewSocketFilepath); !os.IsNotExist(err) {
err = os.Remove(stewSocketFilepath)
if err != nil {
er := fmt.Errorf("error: could not delete stew.sock file: %v", err)
cancel()
return nil, er
}
}
stewNL, err := net.Listen("unix", stewSocketFilepath)
if err != nil {
er := fmt.Errorf("error: failed to open stew socket: %v", err)
cancel()
return nil, er
}
// ---
metrics := newMetrics(c.PromHostAndPort)
s := &server{
ctx: ctx,
ctxCancelFunc: cancel,
configuration: c,
nodeName: c.NodeName,
natsConn: conn,
StewardSockListener: nl,
StewSockListener: stewNL,
processes: newProcesses(metrics.promRegistry),
toRingbufferCh: make(chan []subjectAndMessage),
metrics: metrics,
}
// Create the default data folder for where subscribers should
// write it's data, check if data folder exist, and create it if needed.
if _, err := os.Stat(c.SubscribersDataFolder); os.IsNotExist(err) {
if c.SubscribersDataFolder == "" {
return nil, fmt.Errorf("error: subscribersDataFolder value is empty, you need to provide the config or the flag value at startup %v: %v", c.SubscribersDataFolder, err)
}
err := os.Mkdir(c.SubscribersDataFolder, 0700)
if err != nil {
return nil, fmt.Errorf("error: failed to create data folder directory %v: %v", c.SubscribersDataFolder, err)
}
log.Printf("info: Creating subscribers data folder at %v\n", c.SubscribersDataFolder)
}
return s, nil
}
// Start will spawn up all the predefined subscriber processes.
// Spawning of publisher processes is done on the fly by checking
// if there is publisher process for a given message subject, and
// not exist it will spawn one.
func (s *server) Start() {
// Start the error kernel that will do all the error handling
// not done within a process.
s.errorKernel = newErrorKernel()
s.errorKernel.startErrorKernel(s.toRingbufferCh)
// Start collecting the metrics
go s.startMetrics()
// Start the checking the input socket for new messages from operator.
go s.readSocket(s.toRingbufferCh)
// Delete the socket file when the program exits.
defer func() {
socketFilepath := filepath.Join(s.configuration.SocketFolder, "steward.sock")
if _, err := os.Stat(socketFilepath); !os.IsNotExist(err) {
err = os.Remove(socketFilepath)
if err != nil {
er := fmt.Errorf("error: could not delete sock file: %v", err)
log.Printf("%v\n", er)
}
}
}()
// Start up the predefined subscribers. Since all the logic to handle
// processes are tied to the process struct, we need to create an
// initial process to start the rest.
sub := newSubject(REQInitial, s.nodeName)
p := newProcess(s.ctx, s.natsConn, s.processes, s.toRingbufferCh, s.configuration, sub, s.errorKernel.errorCh, "", []Node{}, nil)
p.ProcessesStart(s.ctx)
time.Sleep(time.Second * 1)
s.processes.printProcessesMap()
// Start the processing of new messages from an input channel.
s.routeMessagesToProcess("./incomingBuffer.db", s.toRingbufferCh)
// Set up channel on which to send signal notifications.
// We must use a buffered channel or risk missing the signal
// if we're not ready to receive when the signal is sent.
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, os.Interrupt)
//Block until we receive a signal
sig := <-sigCh
fmt.Printf("Got exit signal, terminating all processes, %v\n", sig)
// Adding a safety function here so we can make sure that all processes
// are stopped after a given time if the context cancelation below hangs.
defer func() {
time.Sleep(time.Second * 0)
log.Printf("error: doing a non graceful shutdown of all processes..\n")
os.Exit(1)
}()
// TODO: The cancelation of all gracefully do not work as expected since it
// seems to hang on terminating somewhere. Meanwhile adding a sleep here
// to be sure that the defered exit above are run before this cancelFunc.
//
// NOTE: The system are built to handle non graceful shutdowns since it keeps
// the state if the messages received have been processed or not, so it is not
// deeply needed to implement this.
// But still.. Need to look into this.
time.Sleep(time.Second * 10)
s.ctxCancelFunc()
fmt.Printf(" *** Done: ctxCancelFunc()\n")
}
func (p *processes) printProcessesMap() {
fmt.Println("--------------------------------------------------------------------------------------------")
log.Printf("*** Output of processes map :\n")
p.mu.Lock()
for _, vSub := range p.active {
for _, vID := range vSub {
log.Printf("* proc - : %v, id: %v, name: %v, allowed from: %v\n", vID.processKind, vID.processID, vID.subject.name(), vID.allowedReceivers)
}
}
p.mu.Unlock()
p.promTotalProcesses.Set(float64(len(p.active)))
fmt.Println("--------------------------------------------------------------------------------------------")
}
// sendErrorMessage will put the error message directly on the channel that is
// read by the nats publishing functions.
func sendErrorLogMessage(newMessagesCh chan<- []subjectAndMessage, FromNode Node, theError error) {
// NB: Adding log statement here for more visuality during development.
log.Printf("%v\n", theError)
sam := createErrorMsgContent(FromNode, theError)
newMessagesCh <- []subjectAndMessage{sam}
}
// createErrorMsgContent will prepare a subject and message with the content
// of the error
func createErrorMsgContent(FromNode Node, theError error) subjectAndMessage {
// Add time stamp
er := fmt.Sprintf("%v, %v\n", time.Now().UTC(), theError.Error())
sam := subjectAndMessage{
Subject: newSubject(REQErrorLog, "errorCentral"),
Message: Message{
Directory: "errorLog",
ToNode: "errorCentral",
FromNode: FromNode,
FileExtension: ".log",
Data: []string{er},
Method: REQErrorLog,
},
}
return sam
}
type samDBValueAndDelivered struct {
samDBValue samDBValue
delivered chan struct{}
}
// routeMessagesToProcess takes a database name and an input channel as
// it's input arguments.
// The database will be used as the persistent store for the work queue
// which is implemented as a ring buffer.
// The input channel are where we read new messages to publish.
// Incomming messages will be routed to the correct subject process, where
// the handling of each nats subject is handled within it's own separate
// worker process.
// It will also handle the process of spawning more worker processes
// for publisher subjects if it does not exist.
func (s *server) routeMessagesToProcess(dbFileName string, newSAM chan []subjectAndMessage) {
// Prepare and start a new ring buffer
const bufferSize int = 1000
rb := newringBuffer(*s.configuration, bufferSize, dbFileName, Node(s.nodeName), s.toRingbufferCh)
inCh := make(chan subjectAndMessage)
ringBufferOutCh := make(chan samDBValueAndDelivered)
// start the ringbuffer.
rb.start(inCh, ringBufferOutCh, s.configuration.DefaultMessageTimeout, s.configuration.DefaultMessageRetries)
// Start reading new fresh messages received on the incomming message
// pipe/file requested, and fill them into the buffer.
go func() {
for samSlice := range newSAM {
for _, sam := range samSlice {
inCh <- sam
}
}
close(inCh)
}()
// Process the messages that are in the ring buffer. Check and
// send if there are a specific subject for it, and if no subject
// exist throw an error.
var coe CommandOrEvent
coeAvailable := coe.GetCommandOrEventAvailable()
var method Method
methodsAvailable := method.GetMethodsAvailable()
go func() {
for samTmp := range ringBufferOutCh {
samTmp.delivered <- struct{}{}
sam := samTmp.samDBValue.Data
// Check if the format of the message is correct.
if _, ok := methodsAvailable.CheckIfExists(sam.Message.Method); !ok {
er := fmt.Errorf("error: routeMessagesToProcess: the method do not exist, message dropped: %v", sam.Message.Method)
sendErrorLogMessage(s.toRingbufferCh, Node(s.nodeName), er)
continue
}
if !coeAvailable.CheckIfExists(sam.Subject.CommandOrEvent, sam.Subject) {
er := fmt.Errorf("error: routeMessagesToProcess: the command or event do not exist, message dropped: %v", sam.Message.Method)
sendErrorLogMessage(s.toRingbufferCh, Node(s.nodeName), er)
continue
}
redo:
// Adding a label here so we are able to redo the sending
// of the last message if a process with specified subject
// is not present. The process will then be created, and
// the code will loop back to the redo: label.
m := sam.Message
subjName := sam.Subject.name()
// DEBUG: fmt.Printf("** handleNewOperatorMessages: message: %v, ** subject: %#v\n", m, sam.Subject)
pn := processNameGet(subjName, processKindPublisher)
// Check if there is a map of type map[int]process registered
// for the processName, and if it exists then return it.
s.processes.mu.Lock()
existingProcIDMap, ok := s.processes.active[pn]
s.processes.mu.Unlock()
// If found a map above, range it, and are there already a process
// for that subject, put the message on that processes incomming
// message channel.
if ok {
s.processes.mu.Lock()
for _, existingProc := range existingProcIDMap {
log.Printf("info: processNewMessages: found the specific subject: %v\n", subjName)
existingProc.subject.messageCh <- m
}
s.processes.mu.Unlock()
// If no process to handle the specific subject exist,
// the we create and spawn one.
} else {
// If a publisher process do not exist for the given subject, create it, and
// by using the goto at the end redo the process for this specific message.
log.Printf("info: processNewMessages: did not find that specific subject, starting new process for subject: %v\n", subjName)
sub := newSubject(sam.Subject.Method, sam.Subject.ToNode)
proc := newProcess(s.ctx, s.natsConn, s.processes, s.toRingbufferCh, s.configuration, sub, s.errorKernel.errorCh, processKindPublisher, nil, nil)
// fmt.Printf("*** %#v\n", proc)
proc.spawnWorker(s.processes, s.natsConn)
// Now when the process is spawned we jump back to the redo: label,
// and send the message to that new process.
goto redo
}
}
}()
}