From cc32634cd04ce7cba897d7d84e3ffff525c639ea Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 11 Jul 2022 15:27:49 -0500 Subject: [PATCH] Fix event_id always being marked as failed to fetch when fetching state for it Discovered while working on https://github.com/matrix-org/synapse/pull/13205 --- synapse/handlers/federation_event.py | 45 +++++++++++++++---- .../storage/databases/main/events_worker.py | 6 ++- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index c74117c19aae..ac2e7456de9b 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -939,7 +939,8 @@ async def _get_state_ids_after_missing_prev_event( ) logger.debug( - "state_ids returned %i state events, %i auth events", + "_get_state_ids_after_missing_prev_event(event_id=%s): state_ids returned %i state events, %i auth events", + event_id, len(state_event_ids), len(auth_event_ids), ) @@ -947,12 +948,17 @@ async def _get_state_ids_after_missing_prev_event( # Start by checking events we already have in the DB desired_events = set(state_event_ids) desired_events.add(event_id) - logger.debug("Fetching %i events from cache/store", len(desired_events)) + logger.debug( + "_get_state_ids_after_missing_prev_event(event_id=%s): Fetching %i events from cache/store", + event_id, + len(desired_events), + ) have_events = await self._store.have_seen_events(room_id, desired_events) missing_desired_events = desired_events - have_events logger.debug( - "We are missing %i events (got %i)", + "_get_state_ids_after_missing_prev_event(event_id=%s): We are missing %i events (got %i)", + event_id, len(missing_desired_events), len(have_events), ) @@ -969,7 +975,11 @@ async def _get_state_ids_after_missing_prev_event( missing_auth_events.difference_update( await self._store.have_seen_events(room_id, missing_auth_events) ) - logger.debug("We are also missing %i auth events", len(missing_auth_events)) + logger.debug( + "_get_state_ids_after_missing_prev_event(event_id=%s): We are also missing %i auth events", + event_id, + len(missing_auth_events), + ) missing_events = missing_desired_events | missing_auth_events @@ -983,10 +993,17 @@ async def _get_state_ids_after_missing_prev_event( # TODO: might it be better to have an API which lets us do an aggregate event # request if (len(missing_events) * 10) >= len(auth_event_ids) + len(state_event_ids): - logger.debug("Requesting complete state from remote") + logger.debug( + "_get_state_ids_after_missing_prev_event(event_id=%s): Requesting complete state from remote", + event_id, + ) await self._get_state_and_persist(destination, room_id, event_id) else: - logger.debug("Fetching %i events from remote", len(missing_events)) + logger.debug( + "_get_state_ids_after_missing_prev_event(event_id=%s): Fetching %i events from remote", + event_id, + len(missing_events), + ) await self._get_events_and_persist( destination=destination, room_id=room_id, event_ids=missing_events ) @@ -1005,8 +1022,9 @@ async def _get_state_ids_after_missing_prev_event( # This can happen if a remote server claims that the state or # auth_events at an event in room A are actually events in room B logger.warning( - "Remote server %s claims event %s in room %s is an auth/state " + "_get_state_ids_after_missing_prev_event(event_id=%s): Remote server %s claims event %s in room %s is an auth/state " "event in room %s", + event_id, destination, state_event_id, metadata.room_id, @@ -1016,7 +1034,9 @@ async def _get_state_ids_after_missing_prev_event( if metadata.state_key is None: logger.warning( - "Remote server gave us non-state event in state: %s", state_event_id + "_get_state_ids_after_missing_prev_event(event_id=%s): Remote server gave us non-state event in state: %s", + event_id, + state_event_id, ) continue @@ -1036,9 +1056,16 @@ async def _get_state_ids_after_missing_prev_event( # XXX: this doesn't sound right? it means that we'll end up with incomplete # state. failed_to_fetch = desired_events - event_metadata.keys() + + # The event_id is part of the `desired_events` but isn't fetched as part + # of the `event_metadata` so we remove it here separately if we did find it. + have_event_id = await self._store.have_seen_event(room_id, event_id) + if have_event_id: + failed_to_fetch = failed_to_fetch - {event_id} + if failed_to_fetch: logger.warning( - "Failed to fetch missing state events for %s %s", + "_get_state_ids_after_missing_prev_event(event_id=%s): Failed to fetch missing state events %s", event_id, failed_to_fetch, ) diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index b99b10778490..4d916b240efe 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -1370,7 +1370,8 @@ async def have_seen_events( return results - @cachedList(cached_method_name="have_seen_event", list_name="keys") + # TODO: The cache is giving us stale results for the `failed_to_fetch` stuff. + # @cachedList(cached_method_name="have_seen_event", list_name="keys") async def _have_seen_events_dict( self, keys: Collection[Tuple[str, str]] ) -> Dict[Tuple[str, str], bool]: @@ -1410,7 +1411,8 @@ def have_seen_events_txn(txn: LoggingTransaction) -> None: await self.db_pool.runInteraction("have_seen_events", have_seen_events_txn) return results - @cached(max_entries=100000, tree=True) + # TODO: The cache is giving us stale results for the `failed_to_fetch` stuff. + # @cached(max_entries=100000, tree=True) async def have_seen_event(self, room_id: str, event_id: str) -> bool: res = await self._have_seen_events_dict(((room_id, event_id),)) return res[(room_id, event_id)]