
fix(replication): fix cancel replication race (#2196)

The bug: one connection calls replica Start while a second connection calls replica Stop. In this flow Stop first resets the state mask (state_mask_.store(0)); Start then sets the state mask (state_mask_.store(R_ENABLED)), continues to greet the master and creates the main replication fiber; only afterwards does Stop run cntx_.Cancel(), and that cancellation is later reset inside the main replication fiber. As a result the main replication fiber never cancels, and the connection calling Stop deadlocks waiting to join it.
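To make the interleaving concrete, here is a minimal, self-contained C++ sketch of the race (hypothetical, simplified names; not the actual Dragonfly classes or fiber machinery), replaying the losing order of operations step by step:

// Simplified model of the Start/Stop race described above.
// state_mask and cancelled stand in for state_mask_ and cntx_.
#include <atomic>
#include <cstdint>
#include <iostream>

constexpr uint32_t R_ENABLED = 1;

std::atomic<uint32_t> state_mask{0};
std::atomic<bool> cancelled{false};  // stands in for cntx_ being cancelled

int main() {
  state_mask.store(0);          // Stop(): clears R_ENABLED first.
  state_mask.store(R_ENABLED);  // Start(): re-enables, greets the master,
                                // and spawns the main replication fiber.
  cancelled.store(true);        // Stop(): cancels the context...
  cancelled.store(false);       // ...but the freshly spawned fiber resets it.

  // The fiber now observes "enabled and not cancelled", so it never exits,
  // and the caller of Stop() blocks forever joining it.
  std::cout << "enabled=" << (state_mask.load() & R_ENABLED)
            << ", cancelled=" << cancelled.load() << "\n";
  return 0;
}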

The fix: run cntx_.Cancel() and state_mask_.store(0) on the replica thread.
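A minimal sketch of the idea behind the fix, under the assumptions stated above: both the cancel and the mask reset are executed on the thread that owns the replica, so they are serialized with any Start() running there and cannot be undone by a fiber spawned afterwards. OwnerThread below is a hypothetical stand-in for the proactor's Await; it is not the helio API.

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>

// Toy single-threaded executor: runs submitted closures one at a time on its
// own thread, which models "the replica thread" from the commit message.
class OwnerThread {
 public:
  OwnerThread() : worker_([this] { Run(); }) {}
  ~OwnerThread() {
    Await([this] { done_ = true; });
    worker_.join();
  }

  // Run fn on the owner thread and block the caller until it completes.
  void Await(std::function<void()> fn) {
    std::packaged_task<void()> task(std::move(fn));
    auto done = task.get_future();
    {
      std::lock_guard<std::mutex> lk(mu_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
    done.get();
  }

 private:
  void Run() {
    while (true) {
      std::packaged_task<void()> task;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return !tasks_.empty(); });
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
      if (done_) return;
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::packaged_task<void()>> tasks_;
  bool done_ = false;
  std::thread worker_;
};

int main() {
  std::atomic<uint32_t> state_mask{1};  // R_ENABLED
  std::atomic<bool> cancelled{false};
  OwnerThread replica_thread;

  // Stop(): both steps now run on the replica's own thread, in order,
  // serialized with anything else (e.g. Start()) that thread executes.
  replica_thread.Await([&] {
    cancelled.store(true);
    state_mask.store(0);
  });
  return 0;
}

In the actual commit this role is played by the proactor captured in the constructor via ProactorBase::me(); Stop() simply wraps the two statements in proactor_->Await(), as the diff below shows.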

Signed-off-by: adi_holden <adi@dragonflydb.io>
adiholden 2023-11-21 12:47:53 +02:00 committed by GitHub
parent 03a6b508e8
commit c10dac4db2
2 changed files with 7 additions and 2 deletions


@@ -67,6 +67,7 @@ vector<vector<unsigned>> Partition(unsigned num_flows) {
 Replica::Replica(string host, uint16_t port, Service* se, std::string_view id)
     : ProtocolClient(std::move(host), port), service_(*se), id_{id} {
+  proactor_ = ProactorBase::me();
 }
 
 Replica::~Replica() {
@@ -133,8 +134,11 @@ void Replica::EnableReplication(ConnectionContext* cntx) {
 void Replica::Stop() {
   VLOG(1) << "Stopping replication";
   // Stops the loop in MainReplicationFb.
-  state_mask_.store(0);  // Specifically ~R_ENABLED.
-  cntx_.Cancel();        // Context is fully resposible for cleanup.
+  proactor_->Await([this] {
+    cntx_.Cancel();        // Context is fully resposible for cleanup.
+    state_mask_.store(0);  // Specifically ~R_ENABLED.
+  });
 
   waker_.notifyAll();


@@ -156,6 +156,7 @@ class Replica : ProtocolClient {
   std::string GetSyncId() const;
 
  private:
+  util::fb2::ProactorBase* proactor_ = nullptr;
   Service& service_;
   MasterContext master_context_;