Stop getting missing prev_events after we already know their signature is invalid (#13816)

While https://github.com/matrix-org/synapse/pull/13635 stops us from doing the slow thing after we've already done it once, this PR stops us from doing one of the slow things in the first place.

Related to
 - https://github.com/matrix-org/synapse/issues/13622
    - https://github.com/matrix-org/synapse/pull/13635
 - https://github.com/matrix-org/synapse/issues/13676

Part of https://github.com/matrix-org/synapse/issues/13356

Follow-up to https://github.com/matrix-org/synapse/pull/13815 which tracks event signature failures.

With this PR, we avoid the call to the costly `_get_state_ids_after_missing_prev_event` because the signature failure will count as an attempt before and we filter events based on the backoff before calling `_get_state_ids_after_missing_prev_event` now.

For example, this will save us 156s out of the 185s total that this `matrix.org` `/messages` request. If you want to see the full Jaeger trace of this, you can drag and drop this `trace.json` into your own Jaeger, https://gist.github.com/MadLittleMods/4b12d0d0afe88c2f65ffcc907306b761

To explain this exact scenario around `/messages` -> backfill, we call `/backfill` and first check the signatures of the 100 events. We see bad signature for `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` and `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw` (both member events). Then we process the 98 events remaining that have valid signatures but one of the events references `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` as a `prev_event`. So we have to do the whole `_get_state_ids_after_missing_prev_event` rigmarole which pulls in those same events which fail again because the signatures are still invalid.

 - `backfill`
    - `outgoing-federation-request` `/backfill`
    - `_check_sigs_and_hash_and_fetch`
       - `_check_sigs_and_hash_and_fetch_one` for each event received over backfill
          -  `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` fails with `Signature on retrieved event was invalid.`: `unable to verify signature for sender domain xxx: 401: Failed to find any key to satisfy: _FetchKeyRequest(...)`
          -  `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw` fails with `Signature on retrieved event was invalid.`: `unable to verify signature for sender domain xxx: 401: Failed to find any key to satisfy: _FetchKeyRequest(...)`
   - `_process_pulled_events`
      - `_process_pulled_event` for each validated event
         -  Event `$Q0iMdqtz3IJYfZQU2Xk2WjB5NDF8Gg8cFSYYyKQgKJ0` references `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` as a `prev_event` which is missing so we try to get it
            - `_get_state_ids_after_missing_prev_event`
               - `outgoing-federation-request` `/state_ids`
               -  `get_pdu` for `$luA4l7QHhf_jadH3mI-AyFqho0U2Q-IXXUbGSMq6h6M` which fails the signature check again
               -  `get_pdu` for `$zuOn2Rd2vsC7SUia3Hp3r6JSkSFKcc5j3QTTqW_0jDw` which fails the signature check
This commit is contained in:
Eric Eastwood 2022-10-15 00:36:49 -05:00 committed by GitHub
parent 616dcc1d18
commit 40bb37eb27
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 386 additions and 2 deletions

View file

@ -0,0 +1 @@
Stop fetching missing `prev_events` after we already know their signature is invalid.

View file

@ -640,6 +640,27 @@ class FederationError(RuntimeError):
} }
class FederationPullAttemptBackoffError(RuntimeError):
"""
Raised to indicate that we are are deliberately not attempting to pull the given
event over federation because we've already done so recently and are backing off.
Attributes:
event_id: The event_id which we are refusing to pull
message: A custom error message that gives more context
"""
def __init__(self, event_ids: List[str], message: Optional[str]):
self.event_ids = event_ids
if message:
error_message = message
else:
error_message = f"Not attempting to pull event_ids={self.event_ids} because we already tried to pull them recently (backing off)."
super().__init__(error_message)
class HttpResponseException(CodeMessageException): class HttpResponseException(CodeMessageException):
""" """
Represents an HTTP-level failure of an outbound request Represents an HTTP-level failure of an outbound request

View file

@ -45,6 +45,7 @@ from synapse.api.errors import (
Codes, Codes,
FederationDeniedError, FederationDeniedError,
FederationError, FederationError,
FederationPullAttemptBackoffError,
HttpResponseException, HttpResponseException,
LimitExceededError, LimitExceededError,
NotFoundError, NotFoundError,
@ -1720,7 +1721,22 @@ class FederationHandler:
destination, event destination, event
) )
break break
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_sync_partial_state_room: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e: except FederationError as e:
# TODO: We should `record_event_failed_pull_attempt` here,
# see https://github.com/matrix-org/synapse/issues/13700
if attempt == len(destinations) - 1: if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up. # We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do # TODO(faster_joins) giving up isn't the right thing to do

View file

@ -44,6 +44,7 @@ from synapse.api.errors import (
AuthError, AuthError,
Codes, Codes,
FederationError, FederationError,
FederationPullAttemptBackoffError,
HttpResponseException, HttpResponseException,
RequestSendFailed, RequestSendFailed,
SynapseError, SynapseError,
@ -567,6 +568,9 @@ class FederationEventHandler:
event: partial-state event to be de-partial-stated event: partial-state event to be de-partial-stated
Raises: Raises:
FederationPullAttemptBackoffError if we are are deliberately not attempting
to pull the given event over federation because we've already done so
recently and are backing off.
FederationError if we fail to request state from the remote server. FederationError if we fail to request state from the remote server.
""" """
logger.info("Updating state for %s", event.event_id) logger.info("Updating state for %s", event.event_id)
@ -901,6 +905,18 @@ class FederationEventHandler:
context, context,
backfilled=backfilled, backfilled=backfilled,
) )
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_process_pulled_event: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e: except FederationError as e:
await self._store.record_event_failed_pull_attempt( await self._store.record_event_failed_pull_attempt(
event.room_id, event_id, str(e) event.room_id, event_id, str(e)
@ -947,6 +963,9 @@ class FederationEventHandler:
The event context. The event context.
Raises: Raises:
FederationPullAttemptBackoffError if we are are deliberately not attempting
to pull the given event over federation because we've already done so
recently and are backing off.
FederationError if we fail to get the state from the remote server after any FederationError if we fail to get the state from the remote server after any
missing `prev_event`s. missing `prev_event`s.
""" """
@ -957,6 +976,18 @@ class FederationEventHandler:
seen = await self._store.have_events_in_timeline(prevs) seen = await self._store.have_events_in_timeline(prevs)
missing_prevs = prevs - seen missing_prevs = prevs - seen
# If we've already recently attempted to pull this missing event, don't
# try it again so soon. Since we have to fetch all of the prev_events, we can
# bail early here if we find any to ignore.
prevs_to_ignore = await self._store.get_event_ids_to_not_pull_from_backoff(
room_id, missing_prevs
)
if len(prevs_to_ignore) > 0:
raise FederationPullAttemptBackoffError(
event_ids=prevs_to_ignore,
message=f"While computing context for event={event_id}, not attempting to pull missing prev_event={prevs_to_ignore[0]} because we already tried to pull recently (backing off).",
)
if not missing_prevs: if not missing_prevs:
return await self._state_handler.compute_event_context(event) return await self._state_handler.compute_event_context(event)

View file

@ -1501,6 +1501,12 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
event_id: The event that failed to be fetched or processed event_id: The event that failed to be fetched or processed
cause: The error message or reason that we failed to pull the event cause: The error message or reason that we failed to pull the event
""" """
logger.debug(
"record_event_failed_pull_attempt room_id=%s, event_id=%s, cause=%s",
room_id,
event_id,
cause,
)
await self.db_pool.runInteraction( await self.db_pool.runInteraction(
"record_event_failed_pull_attempt", "record_event_failed_pull_attempt",
self._record_event_failed_pull_attempt_upsert_txn, self._record_event_failed_pull_attempt_upsert_txn,
@ -1530,6 +1536,54 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause)) txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))
@trace
async def get_event_ids_to_not_pull_from_backoff(
self,
room_id: str,
event_ids: Collection[str],
) -> List[str]:
"""
Filter down the events to ones that we've failed to pull before recently. Uses
exponential backoff.
Args:
room_id: The room that the events belong to
event_ids: A list of events to filter down
Returns:
List of event_ids that should not be attempted to be pulled
"""
event_failed_pull_attempts = await self.db_pool.simple_select_many_batch(
table="event_failed_pull_attempts",
column="event_id",
iterable=event_ids,
keyvalues={},
retcols=(
"event_id",
"last_attempt_ts",
"num_attempts",
),
desc="get_event_ids_to_not_pull_from_backoff",
)
current_time = self._clock.time_msec()
return [
event_failed_pull_attempt["event_id"]
for event_failed_pull_attempt in event_failed_pull_attempts
# Exponential back-off (up to the upper bound) so we don't try to
# pull the same event over and over. ex. 2hr, 4hr, 8hr, 16hr, etc.
if current_time
< event_failed_pull_attempt["last_attempt_ts"]
+ (
2
** min(
event_failed_pull_attempt["num_attempts"],
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
)
)
* BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS
]
async def get_missing_events( async def get_missing_events(
self, self,
room_id: str, room_id: str,

View file

@ -14,7 +14,7 @@
from typing import Optional from typing import Optional
from unittest import mock from unittest import mock
from synapse.api.errors import AuthError from synapse.api.errors import AuthError, StoreError
from synapse.api.room_versions import RoomVersion from synapse.api.room_versions import RoomVersion
from synapse.event_auth import ( from synapse.event_auth import (
check_state_dependent_auth_rules, check_state_dependent_auth_rules,
@ -43,7 +43,7 @@ class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):
def make_homeserver(self, reactor, clock): def make_homeserver(self, reactor, clock):
# mock out the federation transport client # mock out the federation transport client
self.mock_federation_transport_client = mock.Mock( self.mock_federation_transport_client = mock.Mock(
spec=["get_room_state_ids", "get_room_state", "get_event"] spec=["get_room_state_ids", "get_room_state", "get_event", "backfill"]
) )
return super().setup_test_homeserver( return super().setup_test_homeserver(
federation_transport_client=self.mock_federation_transport_client federation_transport_client=self.mock_federation_transport_client
@ -459,6 +459,203 @@ class FederationEventHandlerTests(unittest.FederatingHomeserverTestCase):
) )
self.assertIsNotNone(persisted, "pulled event was not persisted at all") self.assertIsNotNone(persisted, "pulled event was not persisted at all")
def test_backfill_signature_failure_does_not_fetch_same_prev_event_later(
self,
) -> None:
"""
Test to make sure we backoff and don't try to fetch a missing prev_event when we
already know it has a invalid signature from checking the signatures of all of
the events in the backfill response.
"""
OTHER_USER = f"@user:{self.OTHER_SERVER_NAME}"
main_store = self.hs.get_datastores().main
# Create the room
user_id = self.register_user("kermit", "test")
tok = self.login("kermit", "test")
room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
room_version = self.get_success(main_store.get_room_version(room_id))
# Allow the remote user to send state events
self.helper.send_state(
room_id,
"m.room.power_levels",
{"events_default": 0, "state_default": 0},
tok=tok,
)
# Add the remote user to the room
member_event = self.get_success(
event_injection.inject_member_event(self.hs, room_id, OTHER_USER, "join")
)
initial_state_map = self.get_success(
main_store.get_partial_current_state_ids(room_id)
)
auth_event_ids = [
initial_state_map[("m.room.create", "")],
initial_state_map[("m.room.power_levels", "")],
member_event.event_id,
]
# We purposely don't run `add_hashes_and_signatures_from_other_server`
# over this because we want the signature check to fail.
pulled_event_without_signatures = make_event_from_dict(
{
"type": "test_regular_type",
"room_id": room_id,
"sender": OTHER_USER,
"prev_events": [member_event.event_id],
"auth_events": auth_event_ids,
"origin_server_ts": 1,
"depth": 12,
"content": {"body": "pulled_event_without_signatures"},
},
room_version,
)
# Create a regular event that should pass except for the
# `pulled_event_without_signatures` in the `prev_event`.
pulled_event = make_event_from_dict(
self.add_hashes_and_signatures_from_other_server(
{
"type": "test_regular_type",
"room_id": room_id,
"sender": OTHER_USER,
"prev_events": [
member_event.event_id,
pulled_event_without_signatures.event_id,
],
"auth_events": auth_event_ids,
"origin_server_ts": 1,
"depth": 12,
"content": {"body": "pulled_event"},
}
),
room_version,
)
# We expect an outbound request to /backfill, so stub that out
self.mock_federation_transport_client.backfill.return_value = make_awaitable(
{
"origin": self.OTHER_SERVER_NAME,
"origin_server_ts": 123,
"pdus": [
# This is one of the important aspects of this test: we include
# `pulled_event_without_signatures` so it fails the signature check
# when we filter down the backfill response down to events which
# have valid signatures in
# `_check_sigs_and_hash_for_pulled_events_and_fetch`
pulled_event_without_signatures.get_pdu_json(),
# Then later when we process this valid signature event, when we
# fetch the missing `prev_event`s, we want to make sure that we
# backoff and don't try and fetch `pulled_event_without_signatures`
# again since we know it just had an invalid signature.
pulled_event.get_pdu_json(),
],
}
)
# Keep track of the count and make sure we don't make any of these requests
event_endpoint_requested_count = 0
room_state_ids_endpoint_requested_count = 0
room_state_endpoint_requested_count = 0
async def get_event(
destination: str, event_id: str, timeout: Optional[int] = None
) -> None:
nonlocal event_endpoint_requested_count
event_endpoint_requested_count += 1
async def get_room_state_ids(
destination: str, room_id: str, event_id: str
) -> None:
nonlocal room_state_ids_endpoint_requested_count
room_state_ids_endpoint_requested_count += 1
async def get_room_state(
room_version: RoomVersion, destination: str, room_id: str, event_id: str
) -> None:
nonlocal room_state_endpoint_requested_count
room_state_endpoint_requested_count += 1
# We don't expect an outbound request to `/event`, `/state_ids`, or `/state` in
# the happy path but if the logic is sneaking around what we expect, stub that
# out so we can detect that failure
self.mock_federation_transport_client.get_event.side_effect = get_event
self.mock_federation_transport_client.get_room_state_ids.side_effect = (
get_room_state_ids
)
self.mock_federation_transport_client.get_room_state.side_effect = (
get_room_state
)
# The function under test: try to backfill and process the pulled event
with LoggingContext("test"):
self.get_success(
self.hs.get_federation_event_handler().backfill(
self.OTHER_SERVER_NAME,
room_id,
limit=1,
extremities=["$some_extremity"],
)
)
if event_endpoint_requested_count > 0:
self.fail(
"We don't expect an outbound request to /event in the happy path but if "
"the logic is sneaking around what we expect, make sure to fail the test. "
"We don't expect it because the signature failure should cause us to backoff "
"and not asking about pulled_event_without_signatures="
f"{pulled_event_without_signatures.event_id} again"
)
if room_state_ids_endpoint_requested_count > 0:
self.fail(
"We don't expect an outbound request to /state_ids in the happy path but if "
"the logic is sneaking around what we expect, make sure to fail the test. "
"We don't expect it because the signature failure should cause us to backoff "
"and not asking about pulled_event_without_signatures="
f"{pulled_event_without_signatures.event_id} again"
)
if room_state_endpoint_requested_count > 0:
self.fail(
"We don't expect an outbound request to /state in the happy path but if "
"the logic is sneaking around what we expect, make sure to fail the test. "
"We don't expect it because the signature failure should cause us to backoff "
"and not asking about pulled_event_without_signatures="
f"{pulled_event_without_signatures.event_id} again"
)
# Make sure we only recorded a single failure which corresponds to the signature
# failure initially in `_check_sigs_and_hash_for_pulled_events_and_fetch` before
# we process all of the pulled events.
backfill_num_attempts_for_event_without_signatures = self.get_success(
main_store.db_pool.simple_select_one_onecol(
table="event_failed_pull_attempts",
keyvalues={"event_id": pulled_event_without_signatures.event_id},
retcol="num_attempts",
)
)
self.assertEqual(backfill_num_attempts_for_event_without_signatures, 1)
# And make sure we didn't record a failure for the event that has the missing
# prev_event because we don't want to cause a cascade of failures. Not being
# able to fetch the `prev_events` just means we won't be able to de-outlier the
# pulled event. But we can still use an `outlier` in the state/auth chain for
# another event. So we shouldn't stop a downstream event from trying to pull it.
self.get_failure(
main_store.db_pool.simple_select_one_onecol(
table="event_failed_pull_attempts",
keyvalues={"event_id": pulled_event.event_id},
retcol="num_attempts",
),
# StoreError: 404: No row found
StoreError,
)
def test_process_pulled_event_with_rejected_missing_state(self) -> None: def test_process_pulled_event_with_rejected_missing_state(self) -> None:
"""Ensure that we correctly handle pulled events with missing state containing a """Ensure that we correctly handle pulled events with missing state containing a
rejected state event rejected state event

View file

@ -27,6 +27,8 @@ from synapse.api.room_versions import (
RoomVersion, RoomVersion,
) )
from synapse.events import _EventInternalMetadata from synapse.events import _EventInternalMetadata
from synapse.rest import admin
from synapse.rest.client import login, room
from synapse.server import HomeServer from synapse.server import HomeServer
from synapse.storage.database import LoggingTransaction from synapse.storage.database import LoggingTransaction
from synapse.types import JsonDict from synapse.types import JsonDict
@ -43,6 +45,12 @@ class _BackfillSetupInfo:
class EventFederationWorkerStoreTestCase(tests.unittest.HomeserverTestCase): class EventFederationWorkerStoreTestCase(tests.unittest.HomeserverTestCase):
servlets = [
admin.register_servlets,
room.register_servlets,
login.register_servlets,
]
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main self.store = hs.get_datastores().main
@ -1122,6 +1130,62 @@ class EventFederationWorkerStoreTestCase(tests.unittest.HomeserverTestCase):
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points] backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
self.assertEqual(backfill_event_ids, ["insertion_eventA"]) self.assertEqual(backfill_event_ids, ["insertion_eventA"])
def test_get_event_ids_to_not_pull_from_backoff(
self,
):
"""
Test to make sure only event IDs we should backoff from are returned.
"""
# Create the room
user_id = self.register_user("alice", "test")
tok = self.login("alice", "test")
room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
self.get_success(
self.store.record_event_failed_pull_attempt(
room_id, "$failed_event_id", "fake cause"
)
)
event_ids_to_backoff = self.get_success(
self.store.get_event_ids_to_not_pull_from_backoff(
room_id=room_id, event_ids=["$failed_event_id", "$normal_event_id"]
)
)
self.assertEqual(event_ids_to_backoff, ["$failed_event_id"])
def test_get_event_ids_to_not_pull_from_backoff_retry_after_backoff_duration(
self,
):
"""
Test to make sure no event IDs are returned after the backoff duration has
elapsed.
"""
# Create the room
user_id = self.register_user("alice", "test")
tok = self.login("alice", "test")
room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)
self.get_success(
self.store.record_event_failed_pull_attempt(
room_id, "$failed_event_id", "fake cause"
)
)
# Now advance time by 2 hours so we wait long enough for the single failed
# attempt (2^1 hours).
self.reactor.advance(datetime.timedelta(hours=2).total_seconds())
event_ids_to_backoff = self.get_success(
self.store.get_event_ids_to_not_pull_from_backoff(
room_id=room_id, event_ids=["$failed_event_id", "$normal_event_id"]
)
)
# Since this function only returns events we should backoff from, time has
# elapsed past the backoff range so there is no events to backoff from.
self.assertEqual(event_ids_to_backoff, [])
@attr.s @attr.s
class FakeEvent: class FakeEvent: