This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix bug in device list caching when remote users leave rooms #13749

Merged: 8 commits on Sep 14, 2022
1 change: 1 addition & 0 deletions changelog.d/13749.bugfix
@@ -0,0 +1 @@
Fix a long standing bug where device lists would remain cached when remote users left and rejoined the last room shared with the local homeserver.
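
In outline, the fix moves the staleness check out of the `user_left_room` distributor hook (removed below) and into event persistence: whenever membership state changes, remote users who may have left are re-checked, and their cached device lists are dropped if no shared rooms remain. A minimal sketch of that idea, not Synapse's actual code, assuming only the two storage methods referenced in this PR:

from typing import Collection


async def check_potentially_left_users(
    store, potentially_left_users: Collection[str]
) -> None:
    """Sketch: mark device lists of remote users we no longer share any room with as stale."""
    still_shared = await store.get_users_server_still_shares_room_with(
        potentially_left_users
    )
    for user_id in set(potentially_left_users) - still_shared:
        # No shared rooms remain, so we will stop receiving device-list updates
        # for this user; their cached device list must be treated as stale.
        await store.mark_remote_user_device_list_as_unsubscribed(user_id)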
11 changes: 0 additions & 11 deletions synapse/handlers/device.py
@@ -45,7 +45,6 @@
JsonDict,
StreamKeyType,
StreamToken,
UserID,
get_domain_from_id,
get_verify_key_from_cross_signing_key,
)
@@ -324,8 +323,6 @@ def __init__(self, hs: "HomeServer"):
self.device_list_updater.incoming_device_list_update,
)

hs.get_distributor().observe("user_left_room", self.user_left_room)

# Whether `_handle_new_device_update_async` is currently processing.
self._handle_new_device_update_is_processing = False

@@ -569,14 +566,6 @@ async def notify_user_signature_update(
StreamKeyType.DEVICE_LIST, position, users=[from_user_id]
)

async def user_left_room(self, user: UserID, room_id: str) -> None:
user_id = user.to_string()
room_ids = await self.store.get_rooms_for_user(user_id)
if not room_ids:
# We no longer share rooms with this user, so we'll no longer
# receive device updates. Mark this in DB.
await self.store.mark_remote_user_device_list_as_unsubscribed(user_id)

async def store_dehydrated_device(
self,
user_id: str,
27 changes: 27 additions & 0 deletions synapse/handlers/e2e_keys.py
@@ -175,6 +175,33 @@ async def query_devices(
user_ids_not_in_cache,
remote_results,
) = await self.store.get_user_devices_from_cache(query_list)

# Check that the homeserver still shares a room with all cached users.
# Note that this check may be slightly racy when a remote user leaves a
# room after we have fetched their cached device list. In the worst case
# we will do extra federation queries for devices that we had cached.
cached_users = set(remote_results.keys())
valid_cached_users = (
await self.store.get_users_server_still_shares_room_with(
remote_results.keys()
)
)
invalid_cached_users = cached_users - valid_cached_users
if invalid_cached_users:
# Fix up results. If we get here there may be bugs in device list
# tracking.
# TODO: is this actually useful? it doesn't fix the case where a
# remote user rejoins a room and we query their device list
# after.
user_ids_not_in_cache.update(invalid_cached_users)
for invalid_user_id in invalid_cached_users:
remote_results.pop(invalid_user_id)
logger.error(
"Devices for %s were cached, but the server no longer shares "
"any rooms with them. The cached device lists are stale.",
invalid_cached_users,
)

Contributor Author:

I'm not fully convinced this check is useful, because it'll only catch /keys/query requests done while we no longer share a room with a remote user, which I wouldn't expect to happen often, or at all.

It's also slightly racy and can trigger spuriously when a remote user leaves a room while we're in the middle of handling the request.

Do we want to keep it or remove it?

Member:

I think it's valid for clients to do a /keys/query for users they don't share a room with, e.g. because they're about to start a DM or something?

Contributor Author:

That sounds plausible!
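
For illustration of that scenario, a minimal client-side sketch of a /keys/query request for a remote user we share no room with, assuming the `requests` library; the homeserver URL, access token and user ID are placeholders:

import requests

# Hypothetical example: query device keys for a user before starting a DM.
resp = requests.post(
    "https://example.org/_matrix/client/v3/keys/query",
    headers={"Authorization": "Bearer <access_token>"},
    json={"device_keys": {"@alice:remote.example": []}},  # [] means "all devices"
    timeout=10,
)
resp.raise_for_status()
print(resp.json().get("device_keys", {}))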

for user_id, devices in remote_results.items():
user_devices = results.setdefault(user_id, {})
for device_id, device in devices.items():
20 changes: 17 additions & 3 deletions synapse/storage/controllers/persist_events.py
@@ -598,9 +598,9 @@ async def _persist_event_batch(
# room
state_delta_for_room: Dict[str, DeltaState] = {}

# Set of remote users which were in rooms the server has left. We
# should check if we still share any rooms and if not we mark their
# device lists as stale.
# Set of remote users which were in rooms the server has left or who may
# have left rooms the server is in. We should check if we still share any
# rooms and if not we mark their device lists as stale.
potentially_left_users: Set[str] = set()

if not backfilled:
@@ -725,6 +725,20 @@
current_state = {}
delta.no_longer_in_room = True

# Add all remote users that might have left rooms.
potentially_left_users.update(
user_id
for event_type, user_id in delta.to_delete
if event_type == EventTypes.Member
and not self.is_mine_id(user_id)
Contributor Author:

Needs #13746, otherwise we can raise errors when there are malformed user IDs in the room state.

)
potentially_left_users.update(
user_id
for event_type, user_id in delta.to_insert.keys()
if event_type == EventTypes.Member
and not self.is_mine_id(user_id)
)

state_delta_for_room[room_id] = delta

await self.persist_events_store._persist_events_and_state_updates(
8 changes: 7 additions & 1 deletion tests/handlers/test_e2e_keys.py
@@ -891,14 +891,20 @@ def test_query_all_devices_caches_result(self, device_ids: Iterable[str]) -> None
new_callable=mock.MagicMock,
return_value=make_awaitable(["some_room_id"]),
)
mock_get_users = mock.patch.object(
self.store,
"get_users_server_still_shares_room_with",
new_callable=mock.MagicMock,
return_value=make_awaitable({remote_user_id}),
)
mock_request = mock.patch.object(
self.hs.get_federation_client(),
"query_user_devices",
new_callable=mock.MagicMock,
return_value=make_awaitable(response_body),
)

with mock_get_rooms, mock_request as mocked_federation_request:
with mock_get_rooms, mock_get_users, mock_request as mocked_federation_request:
# Make the first query and sanity check it succeeds.
response_1 = self.get_success(
e2e_handler.query_devices(