Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix background updates failing to add unique indexes on receipts #14453

Merged
merged 7 commits into from
Nov 16, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/14453.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in Synapse 1.70.0 where the background updates to add non-thread unique indexes on receipts could fail when upgrading from 1.67.0 or earlier.
171 changes: 147 additions & 24 deletions synapse/storage/databases/main/receipts.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,24 +113,6 @@ def __init__(
prefilled_cache=receipts_stream_prefill,
)

self.db_pool.updates.register_background_index_update(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why move these to the non-worker store?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved them to the *BackgroundUpdateStore. I thought that's where we usually put the background updates?
Is there a motivation for having these on the worker store that I've completely missed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no motivation besides it being consistent with other examples I saw. 🤷

"receipts_linearized_unique_index",
index_name="receipts_linearized_unique_index",
table="receipts_linearized",
columns=["room_id", "receipt_type", "user_id"],
where_clause="thread_id IS NULL",
unique=True,
)

self.db_pool.updates.register_background_index_update(
"receipts_graph_unique_index",
index_name="receipts_graph_unique_index",
table="receipts_graph",
columns=["room_id", "receipt_type", "user_id"],
where_clause="thread_id IS NULL",
unique=True,
)

def get_max_receipt_stream_id(self) -> int:
    """Return the current token reported by the receipts stream ID generator."""
    current_token = self._receipts_id_gen.get_current_token()
    return current_token
Expand Down Expand Up @@ -702,9 +684,6 @@ def _insert_linearized_receipt_txn(
"data": json_encoder.encode(data),
},
where_clause=where_clause,
# receipts_linearized has a unique constraint on
# (user_id, room_id, receipt_type), so no need to lock
lock=False,
Comment on lines -705 to -707
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To double check: is the table correctly deemed safe to upsert into when the relevant background updates have run? (Wasn't sure how the second commit would affect this, if at all)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, once the unique index has been added by the background update, we will be able to rely on native upserts again (and the value of lock won't matter).

)

return rx_ts
Expand Down Expand Up @@ -862,14 +841,13 @@ def _insert_graph_receipt_txn(
"data": json_encoder.encode(data),
},
where_clause=where_clause,
# receipts_graph has a unique constraint on
# (user_id, room_id, receipt_type), so no need to lock
lock=False,
)


class ReceiptsBackgroundUpdateStore(SQLBaseStore):
POPULATE_RECEIPT_EVENT_STREAM_ORDERING = "populate_event_stream_ordering"
RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME = "receipts_linearized_unique_index"
RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME = "receipts_graph_unique_index"

def __init__(
self,
Expand All @@ -883,6 +861,14 @@ def __init__(
self.POPULATE_RECEIPT_EVENT_STREAM_ORDERING,
self._populate_receipt_event_stream_ordering,
)
self.db_pool.updates.register_background_update_handler(
self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME,
self._background_receipts_linearized_unique_index,
)
self.db_pool.updates.register_background_update_handler(
self.RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME,
self._background_receipts_graph_unique_index,
)

async def _populate_receipt_event_stream_ordering(
self, progress: JsonDict, batch_size: int
Expand Down Expand Up @@ -938,6 +924,143 @@ def _populate_receipt_event_stream_ordering_txn(

return batch_size

async def _create_receipts_index(self, index_name: str, table: str) -> None:
    """Adds a unique index on `(room_id, receipt_type, user_id)` to the given
    receipts table, for non-thread receipts (rows where `thread_id IS NULL`).

    The index creation fails if the table still contains duplicate non-thread
    receipts, so callers must remove those first.

    Args:
        index_name: Name of the index to create. It is interpolated directly
            into the SQL, so it must be a trusted identifier.
        table: Name of the receipts table to index. It is interpolated
            directly into the SQL, so it must be a trusted identifier.
    """

    def _create_index(conn: LoggingDatabaseConnection) -> None:
        conn.rollback()

        is_postgres = isinstance(self.database_engine, PostgresEngine)

        # we have to set autocommit, because postgres refuses to
        # CREATE INDEX CONCURRENTLY without it.
        if is_postgres:
            conn.set_session(autocommit=True)

        try:
            c = conn.cursor()

            if is_postgres:
                # A `CREATE INDEX CONCURRENTLY` that failed or was interrupted
                # (for example because of a uniqueness violation) leaves an
                # invalid index with the same name behind. Drop any such
                # leftover, otherwise the statement below would fail because
                # the index name is already taken.
                c.execute(f"DROP INDEX IF EXISTS {index_name}")
                concurrently = "CONCURRENTLY"
            else:
                concurrently = ""

            # Now that the duplicates are gone, we can create the index.
            sql = f"""
                CREATE UNIQUE INDEX {concurrently} {index_name}
                ON {table}(room_id, receipt_type, user_id)
                WHERE thread_id IS NULL
            """
            c.execute(sql)
        finally:
            # Always restore the session's autocommit setting, even if the
            # index creation raised.
            if is_postgres:
                conn.set_session(autocommit=False)

    await self.db_pool.runWithConnection(_create_index)

async def _background_receipts_linearized_unique_index(
    self, progress: dict, batch_size: int
) -> int:
    """Removes duplicate receipts and adds a unique index on
    `(room_id, receipt_type, user_id)` to `receipts_linearized`, for non-thread
    receipts.

    The whole update runs in a single call: duplicates are removed, the index
    is created, and `_end_background_update` is called for this update.

    Args:
        progress: Unused.
        batch_size: Unused.

    Returns:
        Always 1.
    """

    def _remove_duplicate_receipts_txn(txn: LoggingTransaction) -> None:
        # Name of the engine's physical row identifier. We need it to tell
        # apart duplicate rows that are identical in every column we can
        # filter on, including `stream_id`.
        if isinstance(self.database_engine, PostgresEngine):
            row_id_name = "ctid"
        else:
            row_id_name = "rowid"

        # Identify any duplicate receipts arising from
        # https://github.com/matrix-org/synapse/issues/14406.
        # We expect the following query to use the per-thread receipt index and take
        # less than a minute.
        sql = """
            SELECT MAX(stream_id), room_id, receipt_type, user_id
            FROM receipts_linearized
            WHERE thread_id IS NULL
            GROUP BY room_id, receipt_type, user_id
            HAVING COUNT(*) > 1
        """
        txn.execute(sql)
        duplicate_keys = cast(List[Tuple[int, str, str, str]], list(txn))

        # Picks one row carrying the highest `stream_id`; this is the receipt
        # we keep.
        select_keeper_sql = f"""
            SELECT {row_id_name}
            FROM receipts_linearized
            WHERE
                room_id = ? AND
                receipt_type = ? AND
                user_id = ? AND
                thread_id IS NULL AND
                stream_id = ?
            LIMIT 1
        """
        # Removes every other non-thread receipt for the same key. Deleting by
        # row identifier rather than `stream_id < ?` also removes duplicates
        # that share the highest `stream_id`, which would otherwise survive and
        # make the unique index creation fail.
        delete_others_sql = f"""
            DELETE FROM receipts_linearized
            WHERE
                room_id = ? AND
                receipt_type = ? AND
                user_id = ? AND
                thread_id IS NULL AND
                {row_id_name} != ?
        """

        for max_stream_id, room_id, receipt_type, user_id in duplicate_keys:
            txn.execute(
                select_keeper_sql,
                (room_id, receipt_type, user_id, max_stream_id),
            )
            keeper_rows = list(txn)
            if not keeper_rows:
                # The rows vanished since the query above; nothing to dedupe.
                continue
            keeper_row_id = keeper_rows[0][0]

            txn.execute(
                delete_others_sql,
                (room_id, receipt_type, user_id, keeper_row_id),
            )

    await self.db_pool.runInteraction(
        self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME,
        _remove_duplicate_receipts_txn,
    )

    await self._create_receipts_index(
        "receipts_linearized_unique_index",
        "receipts_linearized",
    )

    await self.db_pool.updates._end_background_update(
        self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME
    )

    return 1

async def _background_receipts_graph_unique_index(
    self, progress: dict, batch_size: int
) -> int:
    """Background update handler for `receipts_graph`.

    Deletes every non-thread receipt whose `(room_id, receipt_type, user_id)`
    key occurs more than once, then adds a unique index on those columns for
    non-thread receipts and ends the background update.

    Neither `progress` nor `batch_size` is consulted; the handler does all of
    its work in one call and always returns 1.
    """
    update_name = self.RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME

    def _purge_duplicated_receipts_txn(txn: LoggingTransaction) -> None:
        # Find the keys that have duplicate receipts as a result of
        # https://github.com/matrix-org/synapse/issues/14406.
        # This query is expected to use the per-thread receipt index and to
        # finish in under a minute.
        find_duplicates_sql = """
            SELECT room_id, receipt_type, user_id FROM receipts_graph
            WHERE thread_id IS NULL
            GROUP BY room_id, receipt_type, user_id
            HAVING COUNT(*) > 1
        """
        # No attempt is made to keep the latest receipt of each duplicate
        # set: dropping all of them is much simpler.
        delete_sql = """
            DELETE FROM receipts_graph
            WHERE
                room_id = ? AND
                receipt_type = ? AND
                user_id = ? AND
                thread_id IS NULL
        """

        txn.execute(find_duplicates_sql)
        keys_with_duplicates = cast(List[Tuple[str, str, str]], list(txn))

        for room, kind, user in keys_with_duplicates:
            txn.execute(delete_sql, (room, kind, user))

    await self.db_pool.runInteraction(update_name, _purge_duplicated_receipts_txn)

    await self._create_receipts_index(
        "receipts_graph_unique_index", "receipts_graph"
    )

    await self.db_pool.updates._end_background_update(update_name)

    return 1


class ReceiptsStore(ReceiptsWorkerStore, ReceiptsBackgroundUpdateStore):
    """Receipts store combining `ReceiptsWorkerStore` with
    `ReceiptsBackgroundUpdateStore`; it adds no members of its own."""
Loading