Skip to content

Commit

Permalink
Casefold when processing email addresses (#374)
Browse files Browse the repository at this point in the history
  • Loading branch information
H-Shay authored Aug 24, 2021
1 parent 779db24 commit bc72eb5
Show file tree
Hide file tree
Showing 13 changed files with 674 additions and 27 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ jobs:
with:
python-version: '3.6'
- run: python -m pip install -e .
- run: black --check --diff sydent/ tests/ matrix_is_test/ setup.py
- run: flake8 sydent/ tests/ matrix_is_test/ setup.py
- run: isort --check-only --diff sydent/ tests/ matrix_is_test/ setup.py
- run: black --check --diff sydent/ tests/ matrix_is_test/ scripts/ setup.py
- run: flake8 sydent/ tests/ matrix_is_test/ scripts/ setup.py
- run: isort --check-only --diff sydent/ tests/ matrix_is_test/ scripts/ setup.py

run-unit-tests:
needs: [check-code-style]
Expand Down
1 change: 1 addition & 0 deletions changelog.d/374.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Case-fold email addresses when binding to MXIDs or performing look-ups. Contributed by H. Shay.
37 changes: 37 additions & 0 deletions res/matrix-org/migration_template.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
Date: %(date)s
From: %(from)s
To: %(to)s
Message-ID: %(messageid)s
Subject: %(subject_header_value)s
MIME-Version: 1.0
Content-Type: multipart/alternative;
boundary="%(multipart_boundary)s"

--%(multipart_boundary)s
Content-Type: text/plain; charset=UTF-8
Content-Disposition: inline

Hello,
We’ve recently improved how people discover your Matrix account.
In the past, identity services did not take capitalization into account when creating and storing Matrix IDs. We’ve now updated this behavior so anyone can find you, no matter how your email is capitalized. As part of this recent update, the duplicate Matrix ID %(mxid)s is no longer associated with this e-mail address.
No action is needed on your part. This doesn’t affect any passwords or password reset options on your account.
About Matrix:
Matrix.org is an open standard for interoperable, decentralised, real-time communication
over IP, supporting group chat, file transfer, voice and video calling, integrations to
other apps, bridges to other communication systems and much more. It can be used to power
Instant Messaging, VoIP/WebRTC signalling, Internet of Things communication - or anywhere
you need a standard HTTP API for publishing and subscribing to data whilst tracking the
conversation history.
Matrix defines the standard, and provides open source reference implementations of
Matrix-compatible Servers, Clients, Client SDKs and Application Services to help you
create new communication solutions or extend the capabilities and reach of existing ones.
Thanks,
Matrix
258 changes: 258 additions & 0 deletions scripts/casefold_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
#!/usr/bin/env python
# Copyright 2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import sqlite3
import sys
from typing import Any, Dict, List, Tuple

import signedjson.sign

from sydent.sydent import Sydent, parse_config_file
from sydent.util import json_decoder
from sydent.util.emailutils import sendEmail
from sydent.util.hash import sha256_and_url_safe_base64
from tests.utils import ResolvingMemoryReactorClock


def calculate_lookup_hash(sydent, address):
    """Compute the lookup hash of an email address.

    The pepper is read from the ``hashing_metadata`` table through
    ``sydent.db``; the value that gets hashed is the space-separated string
    ``<address> email <pepper>``.

    :param sydent: object whose ``db`` attribute is the database connection.
    :param address: the email address to hash, used exactly as given.
    :return: the result of ``sha256_and_url_safe_base64`` for that string.
    """
    pepper_row = (
        sydent.db.cursor()
        .execute("SELECT lookup_pepper from hashing_metadata")
        .fetchone()
    )
    to_hash = "%s %s %s" % (address, "email", pepper_row[0])
    return sha256_and_url_safe_base64(to_hash)


def update_local_associations(
    sydent, db: sqlite3.Connection, send_email: bool, dry_run: bool
):
    """Update the DB table local_threepid_associations so that all stored
    emails are casefolded, and any duplicate mxid's associated with the
    given email are deleted.

    Rows are read newest first (``ORDER BY ts DESC``) and grouped by their
    casefolded address. The first (most recent) row of each group is kept and
    rewritten with the casefolded address and a freshly computed lookup hash;
    every other row of the group is deleted. Unless ``dry_run`` is set or
    ``send_email`` is false, a notification is sent to the address of each
    removed row whose mxid differs from the kept one, at most once per mxid.

    :param sydent: object providing ``db`` (read by ``calculate_lookup_hash``)
        and ``get_branded_template``; it is also handed to ``sendEmail``.
    :param db: SQLite connection holding local_threepid_associations.
    :param send_email: whether to notify the owners of removed associations.
    :param dry_run: if true, only print the row counts: no email is sent and
        the database is left untouched.
    :return: None
    """
    cur = db.cursor()

    # rowid is selected so that the DELETE/UPDATE statements below touch
    # exactly the rows read here, instead of every row that happens to share
    # an address string. (Every SQLite table has a rowid unless it was created
    # WITHOUT ROWID.)
    res = cur.execute(
        "SELECT rowid, address, mxid FROM local_threepid_associations"
        " WHERE medium = 'email' ORDER BY ts DESC"
    )

    # maps a casefolded email address to the (rowid, address, mxid) of every
    # row that folds to it, newest first
    associations: Dict[str, List[Tuple[int, str, str]]] = {}
    for row_id, address, mxid in res.fetchall():
        associations.setdefault(address.casefold(), []).append(
            (row_id, address, mxid)
        )

    # list of arguments to update db with: (new address, new hash, rowid)
    db_update_args: List[Tuple[str, str, int]] = []

    # list of rowids to delete
    to_delete: List[Tuple[int]] = []

    # list of (mxid, address) pairs to notify that the mxid has been removed
    mxids: List[Tuple[str, str]] = []

    for casefold_address, rows in associations.items():
        kept_row_id, _, kept_mxid = rows[0]

        # rehash the email since hashes are case-sensitive; the hash only
        # depends on the casefolded address, so once per group is enough
        lookup_hash = calculate_lookup_hash(sydent, casefold_address)
        db_update_args.append((casefold_address, lookup_hash, kept_row_id))

        # Everything after the first row is a duplicate of the kept one.
        for row_id, address, mxid in rows[1:]:
            to_delete.append((row_id,))
            # The email states that the mxid is no longer associated with the
            # address; that is only true if the kept row binds another mxid.
            if mxid != kept_mxid:
                mxids.append((mxid, address))

    # iterate through the mxids and send email, let's only send one email per mxid
    if send_email and not dry_run:
        processed_mxids = set()

        for mxid, address in mxids:
            if mxid in processed_mxids:
                continue

            templateFile = sydent.get_branded_template(
                None,
                "migration_template.eml",
                ("email", "email.template"),
            )

            sendEmail(
                sydent,
                templateFile,
                address,
                {"mxid": mxid, "subject_header_value": "MatrixID Update"},
            )
            processed_mxids.add(mxid)

    print(
        f"{len(to_delete)} rows to delete, {len(db_update_args)} rows to update in local_threepid_associations"
    )

    if not dry_run:
        # Duplicates are removed before the kept rows are rewritten, so an
        # update never produces an address that a doomed row still holds
        # (this matters if the table enforces unique addresses).
        if to_delete:
            cur.executemany(
                "DELETE FROM local_threepid_associations WHERE rowid = ?",
                to_delete,
            )

        if db_update_args:
            cur.executemany(
                "UPDATE local_threepid_associations SET address = ?, lookup_hash = ? WHERE rowid = ?",
                db_update_args,
            )

        # We've finished updating the database, committing the transaction.
        db.commit()


def update_global_associations(
    sydent, db: sqlite3.Connection, send_email: bool, dry_run: bool
):
    """Update the DB table global_threepid_associations so that all stored
    emails are casefolded, the signed association is re-signed and any duplicate
    mxid's associated with the given email are deleted.

    Only rows whose medium is ``email`` and whose origin server is
    ``sydent.server_name`` are read, and only those rows are modified. Rows
    are read newest first (``ORDER BY ts DESC``) and grouped by casefolded
    address; the first row of each group is kept and rewritten, the others are
    deleted.

    :param sydent: object providing ``server_name``, ``keyring.ed25519`` (the
        key used for re-signing) and ``db`` (read by ``calculate_lookup_hash``).
    :param db: SQLite connection holding global_threepid_associations.
    :param send_email: not used by this function.
    :param dry_run: if true, only print the row counts and leave the database
        untouched.
    :return: None
    """

    # get every row where the local server is origin server and medium is email
    origin_server = sydent.server_name
    medium = "email"

    cur = db.cursor()
    # rowid is selected so that the DELETE/UPDATE statements below touch
    # exactly the rows read here. Matching on the address string alone would
    # also hit rows from other origin servers, and would remove the kept row
    # whenever a duplicate carries the very same address string. (Every SQLite
    # table has a rowid unless it was created WITHOUT ROWID.)
    res = cur.execute(
        "SELECT rowid, address, sgAssoc FROM global_threepid_associations"
        " WHERE medium = ? AND originServer = ? ORDER BY ts DESC",
        (medium, origin_server),
    )

    # maps a casefolded email address to the (rowid, signed association) of
    # every row that folds to it, newest first
    associations: Dict[str, List[Tuple[int, str]]] = {}
    for row_id, address, sg_assoc in res.fetchall():
        associations.setdefault(address.casefold(), []).append((row_id, sg_assoc))

    # list of arguments to update db with:
    # (new address, new hash, new signed association, rowid)
    db_update_args: List[Tuple[str, str, str, int]] = []

    # list of rowids to delete
    to_delete: List[Tuple[int]] = []

    for casefold_address, rows in associations.items():
        kept_row_id, kept_sg_assoc = rows[0]

        # rehash the email since hash functions are case-sensitive
        lookup_hash = calculate_lookup_hash(sydent, casefold_address)

        # update signed association with new casefolded address and re-sign;
        # only the kept row needs this, the duplicates are deleted anyway
        signed_assoc = json_decoder.decode(kept_sg_assoc)
        signed_assoc["address"] = casefold_address
        signed_assoc = json.dumps(
            signedjson.sign.sign_json(
                signed_assoc, sydent.server_name, sydent.keyring.ed25519
            )
        )

        db_update_args.append(
            (casefold_address, lookup_hash, signed_assoc, kept_row_id)
        )

        # Everything after the first row is a duplicate of the kept one.
        to_delete.extend((row_id,) for row_id, _ in rows[1:])

    print(
        f"{len(to_delete)} rows to delete, {len(db_update_args)} rows to update in global_threepid_associations"
    )

    if not dry_run:
        if to_delete:
            cur.executemany(
                "DELETE FROM global_threepid_associations WHERE rowid = ?",
                to_delete,
            )

        if db_update_args:
            cur.executemany(
                "UPDATE global_threepid_associations SET address = ?, lookup_hash = ?, sgAssoc = ? WHERE rowid = ?",
                db_update_args,
            )

        db.commit()


if __name__ == "__main__":
    # Command line: two optional switches plus the mandatory config file path.
    arg_parser = argparse.ArgumentParser(
        description="Casefold email addresses in database"
    )
    arg_parser.add_argument(
        "--no-email",
        action="store_true",
        help="run script but do not send emails",
    )
    arg_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="run script but do not send emails or alter database",
    )
    arg_parser.add_argument(
        "config_path", help="path to the sydent configuration file"
    )
    cli_args = arg_parser.parse_args()

    # Report a missing configuration file and exit with status 1.
    if not os.path.exists(cli_args.config_path):
        print(f"The config file '{cli_args.config_path}' does not exist.")
        sys.exit(1)

    sydent_config = parse_config_file(cli_args.config_path)
    sydent = Sydent(sydent_config, ResolvingMemoryReactorClock(), False)

    # The global table is migrated first, then the local one; both use the
    # connection held by the Sydent instance.
    should_send_email = not cli_args.no_email
    for migrate in (update_global_associations, update_local_associations):
        migrate(sydent, sydent.db, should_send_email, cli_args.dry_run)
10 changes: 6 additions & 4 deletions sydent/db/invite_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,26 @@ def __init__(self, sydent: "Sydent") -> None:
self.sydent = sydent

def storeToken(
self, medium: str, address: str, roomId: str, sender: str, token: str
self, medium: str, normalised_address: str, roomId: str, sender: str, token: str
) -> None:
"""
Store a new invite token and its metadata.
Store a new invite token and its metadata. Please note that email
addresses need to be casefolded before calling this function.
:param medium: The medium of the 3PID the token is associated to.
:param address: The address of the 3PID the token is associated to.
:param normalised_address: The address of the 3PID the token is associated to.
:param roomId: The ID of the room the 3PID is invited in.
:param sender: The MXID of the user that sent the invite.
:param token: The token to store.
"""

cur = self.sydent.db.cursor()

cur.execute(
"INSERT INTO invite_tokens"
" ('medium', 'address', 'room_id', 'sender', 'token', 'received_ts')"
" VALUES (?, ?, ?, ?, ?, ?)",
(medium, address, roomId, sender, token, int(time.time())),
(medium, normalised_address, roomId, sender, token, int(time.time())),
)
self.sydent.db.commit()

Expand Down
Loading

0 comments on commit bc72eb5

Please sign in to comment.