1
0
Fork 0
mirror of https://github.com/element-hq/synapse.git synced 2025-03-09 09:26:50 +00:00
synapse/tests/storage/test_state_deletion.py
Erik Johnston aa6e5c2ecb
Add locking to more safely delete state groups: Part 1 (#18107)
Currently we don't really have anything that stops us from deleting
state groups when an in-flight event references it. This is a fairly
rare race currently, but we want to be able to more aggressively delete
state groups so it is important to address this to ensure that the
database remains valid.

This implements the locking, but doesn't actually use it.

See the class docstring of the new data store for an explanation for how
this works.

---------

Co-authored-by: Devon Hudson <devon.dmytro@gmail.com>
2025-02-03 17:29:15 +00:00

411 lines
14 KiB
Python

#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright (C) 2025 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
import logging
from twisted.test.proto_helpers import MemoryReactor
from synapse.rest import admin
from synapse.rest.client import login, room
from synapse.server import HomeServer
from synapse.util import Clock
from tests.test_utils.event_injection import create_event
from tests.unittest import HomeserverTestCase
logger = logging.getLogger(__name__)
class StateDeletionStoreTestCase(HomeserverTestCase):
"""Tests for the StateDeletionStore."""
servlets = [
admin.register_servlets,
room.register_servlets,
login.register_servlets,
]
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main
self.state_store = hs.get_datastores().state
self.state_deletion_store = hs.get_datastores().state_deletion
self.user_id = self.register_user("test", "password")
tok = self.login("test", "password")
self.room_id = self.helper.create_room_as(self.user_id, tok=tok)
def check_if_can_be_deleted(self, state_group: int) -> bool:
"""Check if the state group is pending deletion."""
state_group_to_sequence_number = self.get_success(
self.state_deletion_store.get_pending_deletions([state_group])
)
can_be_deleted = self.get_success(
self.state_deletion_store.db_pool.runInteraction(
"test_existing_pending_deletion_is_cleared",
self.state_deletion_store.get_state_groups_ready_for_potential_deletion_txn,
state_group_to_sequence_number,
)
)
return state_group in can_be_deleted
def test_no_deletion(self) -> None:
"""Test that calling persisting_state_group_references is fine if
nothing is pending deletion"""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
sender=self.user_id,
)
)
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
self.get_success(ctx_mgr.__aexit__(None, None, None))
def test_no_deletion_error(self) -> None:
"""Test that calling persisting_state_group_references is fine if
nothing is pending deletion, but an error occurs."""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
sender=self.user_id,
)
)
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
self.get_success(ctx_mgr.__aexit__(Exception, Exception("test"), None))
def test_existing_pending_deletion_is_cleared(self) -> None:
"""Test that the pending deletion flag gets cleared when the state group
gets persisted."""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
# Mark a state group that we're referencing as pending deletion.
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group]
)
)
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
self.get_success(ctx_mgr.__aexit__(None, None, None))
# The pending deletion flag should be cleared
pending_deletion = self.get_success(
self.state_deletion_store.db_pool.simple_select_one_onecol(
table="state_groups_pending_deletion",
keyvalues={"state_group": context.state_group},
retcol="1",
allow_none=True,
desc="test_existing_pending_deletion_is_cleared",
)
)
self.assertIsNone(pending_deletion)
def test_pending_deletion_is_cleared_during_persist(self) -> None:
"""Test that the pending deletion flag is cleared when a state group
gets marked for deletion during persistence"""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
# Mark the state group that we're referencing as pending deletion,
# *after* we have started persisting.
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group]
)
)
self.get_success(ctx_mgr.__aexit__(None, None, None))
# The pending deletion flag should be cleared
pending_deletion = self.get_success(
self.state_deletion_store.db_pool.simple_select_one_onecol(
table="state_groups_pending_deletion",
keyvalues={"state_group": context.state_group},
retcol="1",
allow_none=True,
desc="test_existing_pending_deletion_is_cleared",
)
)
self.assertIsNone(pending_deletion)
def test_deletion_check(self) -> None:
"""Test that the `get_state_groups_that_can_be_purged_txn` check is
correct during different points of the lifecycle of persisting an
event."""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group]
)
)
# We shouldn't be able to delete the state group as not enough time as passed
can_be_deleted = self.check_if_can_be_deleted(context.state_group)
self.assertFalse(can_be_deleted)
# After enough time we can delete the state group
self.reactor.advance(
1 + self.state_deletion_store.DELAY_BEFORE_DELETION_MS / 1000
)
can_be_deleted = self.check_if_can_be_deleted(context.state_group)
self.assertTrue(can_be_deleted)
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
# But once we start persisting we can't delete the state group
can_be_deleted = self.check_if_can_be_deleted(context.state_group)
self.assertFalse(can_be_deleted)
self.get_success(ctx_mgr.__aexit__(None, None, None))
# The pending deletion flag should remain cleared after persistence has
# finished.
can_be_deleted = self.check_if_can_be_deleted(context.state_group)
self.assertFalse(can_be_deleted)
def test_deletion_error_during_persistence(self) -> None:
"""Test that state groups remain marked as pending deletion if persisting
the event fails."""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
# Mark a state group that we're referencing as pending deletion.
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group]
)
)
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
self.get_success(ctx_mgr.__aexit__(Exception, Exception("test"), None))
# We should be able to delete the state group after a certain amount of
# time
self.reactor.advance(
1 + self.state_deletion_store.DELAY_BEFORE_DELETION_MS / 1000
)
can_be_deleted = self.check_if_can_be_deleted(context.state_group)
self.assertTrue(can_be_deleted)
def test_race_between_check_and_insert(self) -> None:
"""Check that we correctly handle the race where we go to delete a
state group, check that it is unreferenced, and then it becomes
referenced just before we delete it."""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
# Mark a state group that we're referencing as pending deletion.
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group]
)
)
# Advance time enough so we can delete the state group
self.reactor.advance(
1 + self.state_deletion_store.DELAY_BEFORE_DELETION_MS / 1000
)
# Check that we'd be able to delete this state group.
state_group_to_sequence_number = self.get_success(
self.state_deletion_store.get_pending_deletions([context.state_group])
)
can_be_deleted = self.get_success(
self.state_deletion_store.db_pool.runInteraction(
"test_existing_pending_deletion_is_cleared",
self.state_deletion_store.get_state_groups_ready_for_potential_deletion_txn,
state_group_to_sequence_number,
)
)
self.assertCountEqual(can_be_deleted, [context.state_group])
# ... in the real world we'd check that the state group isn't referenced here ...
# Now we persist the event to reference the state group, *after* we
# check that the state group wasn't referenced
ctx_mgr = self.state_deletion_store.persisting_state_group_references(
[(event, context)]
)
self.get_success(ctx_mgr.__aenter__())
self.get_success(ctx_mgr.__aexit__(Exception, Exception("test"), None))
# We simulate a pause (required to hit the race)
self.reactor.advance(
1 + self.state_deletion_store.DELAY_BEFORE_DELETION_MS / 1000
)
# We should no longer be able to delete the state group, without having
# to recheck if its referenced.
can_be_deleted = self.get_success(
self.state_deletion_store.db_pool.runInteraction(
"test_existing_pending_deletion_is_cleared",
self.state_deletion_store.get_state_groups_ready_for_potential_deletion_txn,
state_group_to_sequence_number,
)
)
self.assertCountEqual(can_be_deleted, [])
def test_remove_ancestors_from_can_delete(self) -> None:
"""Test that if a state group is not ready to be deleted, we also don't
delete anything that is refernced by it"""
event, context = self.get_success(
create_event(
self.hs,
room_id=self.room_id,
type="m.test",
state_key="",
sender=self.user_id,
)
)
assert context.state_group is not None
# Create a new state group that refernces the one from the event
new_state_group = self.get_success(
self.state_store.store_state_group(
event.event_id,
event.room_id,
prev_group=context.state_group,
delta_ids={},
current_state_ids=None,
)
)
# Mark them both as pending deletion
self.get_success(
self.state_deletion_store.mark_state_groups_as_pending_deletion(
[context.state_group, new_state_group]
)
)
# Advance time enough so we can delete the state group so they're both
# ready for deletion.
self.reactor.advance(
1 + self.state_deletion_store.DELAY_BEFORE_DELETION_MS / 1000
)
# We can now delete both state groups
self.assertTrue(self.check_if_can_be_deleted(context.state_group))
self.assertTrue(self.check_if_can_be_deleted(new_state_group))
# Use the new_state_group to bump its deletion time
self.get_success(
self.state_store.store_state_group(
event.event_id,
event.room_id,
prev_group=new_state_group,
delta_ids={},
current_state_ids=None,
)
)
# We should now not be able to delete either of the state groups.
state_group_to_sequence_number = self.get_success(
self.state_deletion_store.get_pending_deletions(
[context.state_group, new_state_group]
)
)
# We shouldn't be able to delete the state group as not enough time has passed
can_be_deleted = self.get_success(
self.state_deletion_store.db_pool.runInteraction(
"test_existing_pending_deletion_is_cleared",
self.state_deletion_store.get_state_groups_ready_for_potential_deletion_txn,
state_group_to_sequence_number,
)
)
self.assertCountEqual(can_be_deleted, [])