Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support underscores (in addition to hyphens) for charset detection. #10410

Merged
merged 7 commits into from
Jul 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/10410.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
6 changes: 4 additions & 2 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@

logger = logging.getLogger(__name__)

_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
# Bytes pattern that finds a charset declared inside an HTML <meta> tag, e.g.
# <meta charset="Shift_JIS"> or <meta ... content="text/html; charset=utf-8">.
# Group 1 captures the charset name: ASCII letters, digits, hyphens and
# underscores. Matching is case-insensitive and the surrounding double quotes
# are optional.
_charset_match = re.compile(
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
)
_xml_encoding_match = re.compile(
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
# Text pattern that pulls the charset parameter out of a string of the form
# "<anything>; charset=<value>". Group 1 is the value, matched lazily up to an
# optional closing double quote that is followed by ";" or the end of the
# string. Matching is case-insensitive.
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

Expand Down
13 changes: 13 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,19 @@ def test_meta_charset(self):
)
self.assertEqual(encoding, "ascii")

def test_meta_charset_underscores(self):
"""A character encoding contains underscore."""
clokep marked this conversation as resolved.
Show resolved Hide resolved
encoding = get_html_media_encoding(
b"""
<html>
<head><meta charset="Shift_JIS">
</head>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "Shift_JIS")

def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
Expand Down