From 076d667125b7d591141099e42325fd3a5a5bfff9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 20 Jul 2022 13:28:01 +0200 Subject: [PATCH 1/2] Handle HomeDepot.com style price obfuscation --- changedetectionio/fetch_site_status.py | 3 +- changedetectionio/html_tools.py | 13 ++++++ changedetectionio/tests/test_obfuscations.py | 43 ++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 changedetectionio/tests/test_obfuscations.py diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 6c3dbec8939..5798292fcaf 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -151,7 +151,8 @@ def run(self, uuid): if is_html or is_source: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content = fetcher.content + html_content = html_tools.workarounds_for_obfuscations(fetcher.content) + # If not JSON, and if it's not text/plain.. 
if 'text/plain' in fetcher.headers.get('Content-Type', '').lower(): diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 27496e1a239..7786a618de1 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -202,3 +202,16 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: return text_content +def workarounds_for_obfuscations(content): + """ + Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis + This could go into its own Pip package in the future, for faster updates + """ + + # HomeDepot.com style $90.74 + if not content: + return content + + content = re.sub('', '', content) + + return content diff --git a/changedetectionio/tests/test_obfuscations.py b/changedetectionio/tests/test_obfuscations.py new file mode 100644 index 00000000000..17956744ed0 --- /dev/null +++ b/changedetectionio/tests/test_obfuscations.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from .util import live_server_setup + + +def set_original_ignore_response(): + test_return_data = """ + + The price is$90.74 + + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def test_obfuscations(client, live_server): + set_original_ignore_response() + live_server_setup(live_server) + time.sleep(1) + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Give the thread time to pick it up + time.sleep(3) + + # Check HTML conversion detected and worked + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + assert b'$90.74' in res.data From 67d70e75acbf9e18ad0f6f15b82af6a537047184 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 20 Jul 2022 13:29:32 +0200 Subject: [PATCH 2/2] 
Add link --- changedetectionio/html_tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 7786a618de1..cc6a476d683 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -209,9 +209,10 @@ def workarounds_for_obfuscations(content): """ # HomeDepot.com style $90.74 + # https://github.com/weblyzard/inscriptis/issues/45 if not content: return content - + content = re.sub('', '', content) return content