Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle simple obfuscations - HomeDepot.com style price obfuscation #764

Merged
merged 2 commits into from
Jul 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion changedetectionio/fetch_site_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ def run(self, uuid):

if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = fetcher.content
html_content = html_tools.workarounds_for_obfuscations(fetcher.content)


# If not JSON, and if it's not text/plain..
if 'text/plain' in fetcher.headers.get('Content-Type', '').lower():
Expand Down
14 changes: 14 additions & 0 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,17 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:

return text_content

def workarounds_for_obfuscations(content):
    """
    Strip markup tricks that some sites use to make prices and other information
    un-renderable by Inscriptis.

    Currently handles one trick: empty or whitespace-only HTML comments wedged
    between the characters of a value, e.g. the HomeDepot.com style
    ``<span>$<!-- -->90<!-- -->.<!-- -->74</span>``
    https://github.com/weblyzard/inscriptis/issues/45

    Comments that carry any non-whitespace text are left untouched.

    This could go into its own Pip package in the future, for faster updates.

    :param content: HTML markup as a ``str``; may be empty or ``None``.
    :return: The markup with empty/whitespace-only comments removed. Falsy
        input (``None``, ``""``) is returned unchanged.
    """
    if not content:
        return content

    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence (DeprecationWarning since 3.6, SyntaxWarning from 3.12).
    # '\s*' rather than '\s+' so the zero-width form '<!---->' is removed as
    # well as '<!-- -->'; both are equally empty comments.
    return re.sub(r'<!--\s*-->', '', content)
43 changes: 43 additions & 0 deletions changedetectionio/tests/test_obfuscations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python3

import time
from flask import url_for
from .util import live_server_setup


def set_original_ignore_response():
    """Write the obfuscated-price HTML fixture to the test datastore.

    The price in the fixture is split up by whitespace-only HTML comments
    (``$<!-- -->90<!-- -->.<!-- -->74``).
    """
    # NOTE(review): presumably this file is what the 'test_endpoint' route
    # serves back to the fetcher - confirm against the test utilities.
    obfuscated_page = """<html>
<body>
<span>The price is</span><span>$<!-- -->90<!-- -->.<!-- -->74</span>
</body>
</html>

"""

    with open("test-datastore/endpoint-content.txt", "w") as fixture_file:
        fixture_file.write(obfuscated_page)


def test_obfuscations(client, live_server):
    """Check that a price split up by empty HTML comments shows as '$90.74' in the preview."""
    set_original_ignore_response()
    live_server_setup(live_server)
    time.sleep(1)

    # Register the test endpoint as a watch via the import page
    endpoint_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": endpoint_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in import_response.data

    # Give the thread time to pick it up
    time.sleep(3)

    # Check that the HTML conversion ran and the price came out in one piece
    preview_url = url_for("preview_page", uuid="first")
    preview_response = client.get(preview_url, follow_redirects=True)

    assert b'$90.74' in preview_response.data