mirror of https://github.com/dragonflydb/dragonfly.git
synced 2024-12-14 11:58:02 +00:00

feat(replica): atomicity for multi shard commands (#598)

parent e6721d8160
commit aacab66434
4 changed files with 67 additions and 34 deletions
JournalExecutor (implementation):

@@ -9,20 +9,42 @@
 namespace dfly {
 
-JournalExecutor::JournalExecutor(Service* service) : service_{service} {
+JournalExecutor::JournalExecutor(Service* service)
+    : service_{service}, conn_context_{&null_sink_, nullptr} {
+  conn_context_.is_replicating = true;
+  conn_context_.journal_emulated = true;
 }
 
+void JournalExecutor::Execute(std::vector<journal::ParsedEntry>& entries) {
+  DCHECK_GT(entries.size(), 1U);
+  conn_context_.conn_state.db_index = entries.front().dbid;
+
+  std::string multi_cmd = {"MULTI"};
+  auto ms = MutableSlice{&multi_cmd[0], multi_cmd.size()};
+  auto span = CmdArgList{&ms, 1};
+  service_->DispatchCommand(span, &conn_context_);
+
+  for (auto& entry : entries) {
+    if (entry.payload) {
+      DCHECK_EQ(entry.dbid, conn_context_.conn_state.db_index);
+      span = CmdArgList{entry.payload->data(), entry.payload->size()};
+
+      service_->DispatchCommand(span, &conn_context_);
+    }
+  }
+
+  std::string exec_cmd = {"EXEC"};
+  ms = {&exec_cmd[0], exec_cmd.size()};
+  span = {&ms, 1};
+  service_->DispatchCommand(span, &conn_context_);
+}
+
 void JournalExecutor::Execute(journal::ParsedEntry& entry) {
-  if (entry.payload) {
-    io::NullSink null_sink;
-    ConnectionContext conn_context{&null_sink, nullptr};
-    conn_context.is_replicating = true;
-    conn_context.journal_emulated = true;
-    conn_context.conn_state.db_index = entry.dbid;
-
+  conn_context_.conn_state.db_index = entry.dbid;
+  if (entry.payload) {  // TODO - when this is false?
     auto span = CmdArgList{entry.payload->data(), entry.payload->size()};
 
-    service_->DispatchCommand(span, &conn_context);
+    service_->DispatchCommand(span, &conn_context_);
   }
 }
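The new vector overload above makes a multi shard transaction atomic on the replica: it replays all of the transaction's journal entries between a MULTI and an EXEC on one reused emulated connection, so a client querying the replica sees either none or all of the writes. A minimal stand-alone sketch of that batching idea, with a generic dispatch callback standing in for Service::DispatchCommand (ExecuteAtomically, Cmd, and Dispatch are illustrative names, not Dragonfly APIs):

#include <functional>
#include <iostream>
#include <string>
#include <vector>

using Cmd = std::vector<std::string>;
using Dispatch = std::function<void(const Cmd&)>;

// Replays a batch of journaled commands inside MULTI/EXEC so observers
// never see a partially applied transaction.
void ExecuteAtomically(const std::vector<Cmd>& entries, const Dispatch& dispatch) {
  dispatch({"MULTI"});              // open the transaction block
  for (const Cmd& entry : entries) {
    dispatch(entry);                // queue each journaled command
  }
  dispatch({"EXEC"});               // apply everything at once
}

int main() {
  ExecuteAtomically({{"SET", "k1", "v1"}, {"SET", "k2", "v2"}}, [](const Cmd& cmd) {
    for (const std::string& arg : cmd) std::cout << arg << ' ';
    std::cout << '\n';
  });
}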
JournalExecutor (header):

@@ -14,10 +14,13 @@ class Service;
 class JournalExecutor {
  public:
   JournalExecutor(Service* service);
+  void Execute(std::vector<journal::ParsedEntry>& entries);
   void Execute(journal::ParsedEntry& entry);
 
  private:
   Service* service_;
+  ConnectionContext conn_context_;
+  io::NullSink null_sink_;
 };
 
 }  // namespace dfly
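One subtlety in the header above: conn_context_ is declared (and therefore constructed) before null_sink_, yet its initializer takes &null_sink_. That is well defined as long as the context constructor only stores the pointer, because the member's storage already has a valid address before construction. A small sketch of the pattern, with every name here hypothetical:

#include <cstdio>

struct Sink {
  void Write(const char* s) {
    std::printf("%s", s);
  }
};

struct Context {
  Sink* sink;
  explicit Context(Sink* s) : sink(s) {  // only stores the pointer, never dereferences it here
  }
};

class Executor {
 public:
  Executor() : ctx_{&sink_} {  // sink_ is constructed after ctx_, but its address is already valid
  }
  void Run() {
    ctx_.sink->Write("ok\n");  // by now sink_ is fully constructed
  }

 private:
  Context ctx_;  // declared first, so initialized first
  Sink sink_;    // declared second, mirroring null_sink_ in the header above
};

int main() {
  Executor executor;
  executor.Run();
}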
Replica (implementation):

@@ -759,13 +759,13 @@ void Replica::StableSyncDflyFb(Context* cntx) {
       cntx->ReportError(res.error(), "Journal format error");
       return;
     }
-    ExecuteEntry(&executor, res.value());
+    ExecuteEntry(&executor, std::move(res.value()));
     last_io_time_ = sock_->proactor()->GetMonotonicTimeNs();
   }
   return;
 }
 
-void Replica::ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry) {
+void Replica::ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry&& entry) {
   if (entry.shard_cnt <= 1) {  // not multi shard cmd
     executor->Execute(entry);
     return;
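The signature change to journal::ParsedEntry&& lets ExecuteEntry hand the entry's payload buffers to the shared entries_vec without a deep copy, while trivial members such as txid stay readable after the move. A hedged sketch of just that mechanic (Entry and Collect are stand-ins, not Dragonfly types):

#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>
#include <utility>
#include <vector>

struct Entry {
  uint64_t txid;
  std::optional<std::vector<std::string>> payload;
};

std::vector<Entry> entries_vec;  // shared per-transaction storage, as in TxExecutionSync

void Collect(Entry&& entry) {
  entries_vec.push_back(std::move(entry));  // steals the payload buffers, no deep copy
}

int main() {
  Entry e{42, std::vector<std::string>{"SET", "k", "v"}};
  Collect(std::move(e));
  // txid is a trivial member, so it remains readable after the move; this is why
  // logging entry.txid after push_back(std::move(entry)) is still well defined.
  std::printf("txid after move: %llu\n", static_cast<unsigned long long>(e.txid));
}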
@@ -773,46 +773,53 @@ void Replica::ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry) {
 
   // Multi shard command flow:
   // step 1: Fiber wait until all the fibers that should execute this tranaction got
-  // to the journal entry of the transaction.
-  // step 2: execute the command (All fibers)
-  // step 3: Fiber wait until all fibers finished the execution
-  // By step 1 we enforce that replica will execute multi shard commands that finished on master
-  // By step 3 we ensures the correctness of flushall/flushdb commands
-
-  // TODO: this implemantaion does not support atomicity in replica
-  // Although multi shard transaction happen in step 2 very close to each other,
-  // user can query replica between executions.
-  // To support atomicity we should have one fiber in step 2 which will excute all the entries of
-  // the transaction together. In case of global comand such as flushdb the command can be executed
-  // by only one fiber.
+  // to the journal entry of the transaction. This step enforces that replica will execute multi
+  // shard commands that finished on master.
+  // step 2: Execute the commands from one fiber. This step ensures atomicity of replica.
+  // step 3: Fiber wait until all fibers finished the execution. This step ensures atomicity of
+  // operations on replica.
 
   // TODO: support error handler in this flow
 
   // Only the first fiber to reach the transaction will create data for transaction in map
   multi_shard_exe_->map_mu.lock();
   auto [it, was_insert] = multi_shard_exe_->tx_sync_execution.emplace(entry.txid, entry.shard_cnt);
-  VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt
-          << " was_insert: " << was_insert;
+
+  TxId txid = entry.txid;
+  // entries_vec will store all entries of trasaction and will be executed by the fiber that
+  // inserted the txid to map. In case of global command the inserting fiber will executed his
+  // entry.
+  bool global_cmd = (entry.payload.value().size() == 1);
+  if (!global_cmd) {
+    it->second.entries_vec.push_back(std::move(entry));
+  }
+  auto& tx_sync = it->second;
 
   // Note: we must release the mutex befor calling wait on barrier
   multi_shard_exe_->map_mu.unlock();
 
+  VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt
+          << " was_insert: " << was_insert;
+
   // step 1
-  it->second.barrier.wait();
+  tx_sync.barrier.wait();
   // step 2
-  executor->Execute(entry);
+  if (was_insert) {
+    if (global_cmd) {
+      executor->Execute(entry);
+    } else {
+      executor->Execute(tx_sync.entries_vec);
+    }
+  }
   // step 3
-  it->second.barrier.wait();
+  tx_sync.barrier.wait();
 
   // Note: erase from map can be done only after all fibers returned from wait.
   // The last fiber which will decrease the counter to 0 will be the one to erase the data from map
-  auto val = it->second.counter.fetch_sub(1, std::memory_order_relaxed);
-  VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt
-          << " counter: " << val;
+  auto val = tx_sync.counter.fetch_sub(1, std::memory_order_relaxed);
+  VLOG(2) << "txid: " << txid << " counter: " << val;
   if (val == 1) {
     std::lock_guard lg{multi_shard_exe_->map_mu};
-    multi_shard_exe_->tx_sync_execution.erase(entry.txid);
+    multi_shard_exe_->tx_sync_execution.erase(txid);
   }
 }
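The flow above is a two-phase barrier: every shard fiber arrives at the transaction (step 1), exactly one of them replays all collected entries (step 2), nobody proceeds until that replay finished (step 3), and the last fiber to decrement the counter erases the per-transaction bookkeeping. A minimal stand-alone sketch of the same synchronization using std::thread and C++20 std::barrier in place of fibers and the fiber barrier; TxSync loosely mirrors TxExecutionSync, and picking thread 0 as the executor stands in for the was_insert check, so everything here is illustrative:

#include <atomic>
#include <barrier>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

struct TxSync {
  std::barrier<> barrier;
  std::atomic_uint32_t counter;
  explicit TxSync(uint32_t n) : barrier(n), counter(n) {
  }
};

int main() {
  constexpr uint32_t kShards = 3;
  TxSync sync{kShards};

  std::vector<std::thread> shards;
  for (uint32_t i = 0; i < kShards; ++i) {
    shards.emplace_back([&sync, i] {
      sync.barrier.arrive_and_wait();  // step 1: wait until every shard reached the tx
      if (i == 0) {                    // step 2: one executor replays all entries (was_insert analog)
        std::puts("executing all entries of the transaction");
      }
      sync.barrier.arrive_and_wait();  // step 3: no shard proceeds before the replay finished
      if (sync.counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
        std::puts("last fiber erases the tx bookkeeping");  // erase(txid) analog
      }
    });
  }
  for (auto& t : shards) t.join();
}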
Replica (header):

@@ -58,6 +58,7 @@ class Replica {
     std::atomic_uint32_t counter;
     TxExecutionSync(uint32_t counter) : barrier(counter), counter(counter) {
     }
+    std::vector<journal::ParsedEntry> entries_vec;
   };
 
   std::unordered_map<TxId, TxExecutionSync> tx_sync_execution;
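entries_vec lives inside the per-transaction TxExecutionSync, which the first fiber to arrive creates via emplace; later fibers find the existing entry and get was_insert == false. A toy sketch of that first-arriver idiom (TxState and the values are illustrative):

#include <cstdint>
#include <cstdio>
#include <unordered_map>

using TxId = uint64_t;

struct TxState {
  uint32_t shard_cnt;
  TxState(uint32_t n) : shard_cnt(n) {
  }
};

int main() {
  std::unordered_map<TxId, TxState> tx_map;

  // The first fiber to reach txid 7 inserts the shared state...
  auto [it, was_insert] = tx_map.emplace(TxId{7}, 2u);
  // ...later fibers attach to the existing entry and get was_insert == false.
  auto [it2, was_insert2] = tx_map.emplace(TxId{7}, 2u);

  std::printf("first=%d second=%d size=%zu\n", int(was_insert), int(was_insert2), tx_map.size());
}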
@@ -141,7 +142,7 @@ class Replica {
   // Send command, update last_io_time, return error.
   std::error_code SendCommand(std::string_view command, facade::ReqSerializer* serializer);
 
-  void ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry);
+  void ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry&& entry);
 
  public: /* Utility */
   struct Info {