From c5afc99862e7cca41325a101eb262403e23c08f3 Mon Sep 17 00:00:00 2001 From: Philippe Daouadi Date: Wed, 5 Jan 2022 19:00:57 +0100 Subject: [PATCH 1/4] Fix preview of some gif URLs Signed-off-by: Philippe Daouadi --- docs/development/url_previews.md | 20 +++++++++++-------- synapse/rest/media/v1/oembed.py | 7 +++++-- synapse/rest/media/v1/preview_url_resource.py | 20 +++++++++++++------ tests/rest/media/v1/test_url_preview.py | 1 + 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/docs/development/url_previews.md b/docs/development/url_previews.md index aff38136091d..d802f2cb8526 100644 --- a/docs/development/url_previews.md +++ b/docs/development/url_previews.md @@ -34,19 +34,23 @@ When Synapse is asked to preview a URL it does the following: 2. Generates an Open Graph response based on image properties. 5. If the media is HTML: 1. Decodes the HTML via the stored file. - 2. Generates an Open Graph response from the HTML. - 3. If an image exists in the Open Graph response: + 2. If a JSON oEmbed URL was found in the HTML: + 1. Convert the oEmbed response to an Open Graph response. + 2. If a thumbnail or image is in the oEmbed response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. + 3. If the oEmbed type is video but no video is provided, abort oEmbed + parsing and fall back to Open Graph + 3. Generates an Open Graph response from the HTML. + 4. If an image exists in the Open Graph response: 1. Downloads the URL and stores it into a file via the media storage provider and saves the local media metadata. 2. Generates thumbnails. 3. Updates the Open Graph response based on image properties. 6. If the media is JSON and an oEmbed URL was found: - 1. Convert the oEmbed response to an Open Graph response. - 2. If a thumbnail or image is in the oEmbed response: - 1. Downloads the URL and stores it into a file via the media storage - provider and saves the local media metadata. - 2. Generates thumbnails. - 3. Updates the Open Graph response based on image properties. + 1. Parse it as described in 3.5.2 7. Stores the result in the database cache. 4. Returns the result. diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index cce1527ed9fb..697bc462575a 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -154,11 +154,14 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: "og:url": url, } - # Use either title or author's name as the title. - title = oembed.get("title") or oembed.get("author_name") + title = oembed.get("title") if title: open_graph_response["og:title"] = title + author_name = oembed.get("author_name") + if author_name: + open_graph_response["og:author_name"] = author_name + # Use the provider name and as the site. provider_name = oembed.get("provider_name") if provider_name: diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a3829d943b7f..6cb903fbd841 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -294,17 +294,20 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # Check if this HTML document points to oEmbed information and # defer to that. oembed_url = self._oembed.autodiscover_from_html(tree) - og = {} + og_from_oembed: JsonDict = {} if oembed_url: oembed_info = await self._download_url(oembed_url, user) - og, expiration_ms = await self._handle_oembed_response( + og_from_oembed, expiration_ms = await self._handle_oembed_response( url, oembed_info, expiration_ms ) - # If there was no oEmbed URL (or oEmbed parsing failed), attempt - # to generate the Open Graph information from the HTML. - if not oembed_url or not og: - og = parse_html_to_open_graph(tree, media_info.uri) + og_from_og = parse_html_to_open_graph(tree, media_info.uri) + + # If there was no oEmbed URL, or oEmbed parsing failed, or the + # information retrieved was incomplete, we complete it from + # the OpenGraph information. We give oEmbed information + # precedence. + og = {**og_from_og, **og_from_oembed} await self._precache_image_url(user, media_info, og) else: @@ -321,6 +324,11 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: logger.warning("Failed to find any OG data in %s", url) og = {} + # If we don't have a title but we have author_name, copy it as + # title + if not og.get("og:title") and og.get("og:author_name"): + og["og:title"] = og["og:author_name"] + # filter out any stupidly long values keys_to_remove = [] for k, v in og.items(): diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 16e904f15b45..b55b26660687 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -674,6 +674,7 @@ def test_oembed_rich(self): "og:url": "http://twitter.com/matrixdotorg/status/12345", "og:title": "Alice", "og:description": "Content Preview", + "og:author_name": "Alice", }, ) From 73164af75b94f2c0ea6820f0073d1b49cfb0409b Mon Sep 17 00:00:00 2001 From: Philippe Daouadi Date: Sat, 1 Jan 2022 15:21:58 +0100 Subject: [PATCH 2/4] Add changelog entry --- changelog.d/11669.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11669.bugfix diff --git a/changelog.d/11669.bugfix b/changelog.d/11669.bugfix new file mode 100644 index 000000000000..10d913aaceed --- /dev/null +++ b/changelog.d/11669.bugfix @@ -0,0 +1 @@ +Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi. From cb2375dcb7c098ac9a914dc1dba9ed33a7580554 Mon Sep 17 00:00:00 2001 From: Philippe Daouadi Date: Thu, 13 Jan 2022 21:08:47 +0100 Subject: [PATCH 3/4] Move author_name out of the OpenGraph response from OEmbed --- synapse/rest/media/v1/oembed.py | 6 +++--- synapse/rest/media/v1/preview_url_resource.py | 19 ++++++++++++------- tests/rest/media/v1/test_url_preview.py | 1 - 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 697bc462575a..8e1b1eeb9a04 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -33,6 +33,8 @@ class OEmbedResult: # The Open Graph result (converted from the oEmbed result). open_graph_result: JsonDict + # The author_name of the OEmbed result + author_name: Optional[str] # Number of milliseconds to cache the content, according to the oEmbed response. # # This will be None if no cache-age is provided in the oEmbed response (or @@ -159,8 +161,6 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: open_graph_response["og:title"] = title author_name = oembed.get("author_name") - if author_name: - open_graph_response["og:author_name"] = author_name # Use the provider name and as the site. provider_name = oembed.get("provider_name") @@ -198,7 +198,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: open_graph_response = {} cache_age = None - return OEmbedResult(open_graph_response, cache_age) + return OEmbedResult(open_graph_response, author_name, cache_age) def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 6cb903fbd841..7e94282364bf 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -262,6 +262,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # The number of milliseconds that the response should be considered valid. expiration_ms = media_info.expires + author_name: Optional[str] = None if _is_media(media_info.media_type): file_id = media_info.filesystem_id @@ -297,7 +298,11 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: og_from_oembed: JsonDict = {} if oembed_url: oembed_info = await self._download_url(oembed_url, user) - og_from_oembed, expiration_ms = await self._handle_oembed_response( + ( + og_from_oembed, + author_name, + expiration_ms, + ) = await self._handle_oembed_response( url, oembed_info, expiration_ms ) @@ -315,7 +320,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: elif oembed_url: # Handle the oEmbed information. - og, expiration_ms = await self._handle_oembed_response( + og, author_name, expiration_ms = await self._handle_oembed_response( url, media_info, expiration_ms ) await self._precache_image_url(user, media_info, og) @@ -326,8 +331,8 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # If we don't have a title but we have author_name, copy it as # title - if not og.get("og:title") and og.get("og:author_name"): - og["og:title"] = og["og:author_name"] + if not og.get("og:title") and author_name: + og["og:title"] = author_name # filter out any stupidly long values keys_to_remove = [] @@ -492,7 +497,7 @@ async def _precache_image_url( async def _handle_oembed_response( self, url: str, media_info: MediaInfo, expiration_ms: int - ) -> Tuple[JsonDict, int]: + ) -> Tuple[JsonDict, Optional[str], int]: """ Parse the downloaded oEmbed info. @@ -509,7 +514,7 @@ async def _handle_oembed_response( """ # If JSON was not returned, there's nothing to do. if not _is_json(media_info.media_type): - return {}, expiration_ms + return {}, None, expiration_ms with open(media_info.filename, "rb") as file: body = file.read() @@ -521,7 +526,7 @@ async def _handle_oembed_response( if open_graph_result and oembed_response.cache_age is not None: expiration_ms = oembed_response.cache_age - return open_graph_result, expiration_ms + return open_graph_result, oembed_response.author_name, expiration_ms def _start_expire_url_cache_data(self) -> Deferred: return run_as_background_process( diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index b55b26660687..16e904f15b45 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -674,7 +674,6 @@ def test_oembed_rich(self): "og:url": "http://twitter.com/matrixdotorg/status/12345", "og:title": "Alice", "og:description": "Content Preview", - "og:author_name": "Alice", }, ) From 71fe4c634e4e0be5daa747f0831650531f80991c Mon Sep 17 00:00:00 2001 From: Philippe Daouadi Date: Fri, 14 Jan 2022 19:27:33 +0100 Subject: [PATCH 4/4] Apply comments from review --- docs/development/url_previews.md | 23 ++++++++++--------- synapse/rest/media/v1/oembed.py | 3 ++- synapse/rest/media/v1/preview_url_resource.py | 14 ++++++----- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/development/url_previews.md b/docs/development/url_previews.md index d802f2cb8526..154b9a5e12f4 100644 --- a/docs/development/url_previews.md +++ b/docs/development/url_previews.md @@ -34,23 +34,24 @@ When Synapse is asked to preview a URL it does the following: 2. Generates an Open Graph response based on image properties. 5. If the media is HTML: 1. Decodes the HTML via the stored file. - 2. If a JSON oEmbed URL was found in the HTML: - 1. Convert the oEmbed response to an Open Graph response. - 2. If a thumbnail or image is in the oEmbed response: - 1. Downloads the URL and stores it into a file via the media storage - provider and saves the local media metadata. - 2. Generates thumbnails. - 3. Updates the Open Graph response based on image properties. - 3. If the oEmbed type is video but no video is provided, abort oEmbed - parsing and fall back to Open Graph - 3. Generates an Open Graph response from the HTML. + 2. Generates an Open Graph response from the HTML. + 3. If a JSON oEmbed URL was found in the HTML via autodiscovery: + 1. Downloads the URL and stores it into a file via the media storage provider + and saves the local media metadata. + 2. Convert the oEmbed response to an Open Graph response. + 3. Override any Open Graph data from the HTML with data from oEmbed. 4. If an image exists in the Open Graph response: 1. Downloads the URL and stores it into a file via the media storage provider and saves the local media metadata. 2. Generates thumbnails. 3. Updates the Open Graph response based on image properties. 6. If the media is JSON and an oEmbed URL was found: - 1. Parse it as described in 3.5.2 + 1. Convert the oEmbed response to an Open Graph response. + 2. If a thumbnail or image is in the oEmbed response: + 1. Downloads the URL and stores it into a file via the media storage + provider and saves the local media metadata. + 2. Generates thumbnails. + 3. Updates the Open Graph response based on image properties. 7. Stores the result in the database cache. 4. Returns the result. diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 8e1b1eeb9a04..2177b46c9eba 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -33,7 +33,7 @@ class OEmbedResult: # The Open Graph result (converted from the oEmbed result). open_graph_result: JsonDict - # The author_name of the OEmbed result + # The author_name of the oEmbed result author_name: Optional[str] # Number of milliseconds to cache the content, according to the oEmbed response. # @@ -196,6 +196,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: # Trap any exception and let the code follow as usual. logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) open_graph_response = {} + author_name = None cache_age = None return OEmbedResult(open_graph_response, author_name, cache_age) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7e94282364bf..e8881bc8709e 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -306,13 +306,14 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: url, oembed_info, expiration_ms ) - og_from_og = parse_html_to_open_graph(tree, media_info.uri) + # Parse Open Graph information from the HTML in case the oEmbed + # response failed or is incomplete. + og_from_html = parse_html_to_open_graph(tree, media_info.uri) - # If there was no oEmbed URL, or oEmbed parsing failed, or the - # information retrieved was incomplete, we complete it from - # the OpenGraph information. We give oEmbed information - # precedence. - og = {**og_from_og, **og_from_oembed} + # Compile the Open Graph response by using the scraped + # information from the HTML and overlaying any information + # from the oEmbed response. + og = {**og_from_html, **og_from_oembed} await self._precache_image_url(user, media_info, og) else: @@ -510,6 +511,7 @@ async def _handle_oembed_response( Returns: A tuple of: The Open Graph dictionary, if the oEmbed info can be parsed. + The author name if it could be retrieved from oEmbed. The (possibly updated) length of time, in milliseconds, the media is valid for. """ # If JSON was not returned, there's nothing to do.