From eaa4fa2ce505101594b760b9b513f3c257e4c787 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 16:00:08 +0100 Subject: [PATCH 01/11] Introduce `normalize` and normalization schema. Properly define how we store entries in zim file. We introduce the `normalize` helper function in place of `canonicalize`. We work on normalization at the converter level, so we pass the path to the items instead of letting them call `normalize`. Previously, `test/data/video-vimeo.warc.gz` converted to zim contained: ``` A/404.html A/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js A/f.vimeocdn.com/p/3.45.3/css/player.css A/f.vimeocdn.com/p/3.45.3/js/player.js A/i.vimeocdn.com/player/354746.png?mw=200&mh=200 A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 A/index.html A/load.js A/oembed.link/favicon.ico A/oembed.link/https://vimeo.com/347119375 A/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 A/sw.js A/topFrame.html A/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 
H/vimeo.fuzzy.replayweb.page/video/347119375 H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ``` With this change it contains: ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ``` --- src/warc2zim/converter.py | 33 ++++++++------- src/warc2zim/items.py | 15 +++---- src/warc2zim/url_rewriting.py | 75 +++++++++++++++++++++++++++-------- src/warc2zim/utils.py | 8 ++++ tests/test_warc_to_zim.py | 33 +++++++++++---- 5 files changed, 118 insertions(+), 46 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index a7387b51..ff7e31ee 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -49,7 +49,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter -from warc2zim.url_rewriting import FUZZY_RULES, canonicalize +from warc2zim.url_rewriting import FUZZY_RULES, normalize from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle from warc2zim.utils import ( get_version, @@ -294,19 +294,21 @@ def run(self): self.add_items_for_warc_record(record) # process revisits, headers only - for url, record in self.revisits.items(): - if canonicalize(url) not in self.indexed_urls: + for normalized_url, record in self.revisits.items(): + if normalized_url not in self.indexed_urls: logger.debug( "Adding revisit {0} -> {1}".format( - url, record.rec_headers["WARC-Refers-To-Target-URI"] + normalized_url, record.rec_headers["WARC-Refers-To-Target-URI"] ) ) try: - self.creator.add_item(WARCHeadersItem(record)) + self.creator.add_item( + WARCHeadersItem("H/" + normalized_url, record) + ) except RuntimeError as exc: if not DUPLICATE_EXC_STR.match(str(exc)): raise exc - self.indexed_urls.add(canonicalize(url)) + self.indexed_urls.add(normalized_url) logger.debug(f"Found {self.total_records} records in WARCs") @@ -472,15 +474,16 @@ def is_self_redirect(self, record, url): return False location = record.http_headers.get("Location", "") - 
return canonicalize(url) == canonicalize(location) + return normalize(url) == normalize(location) def add_items_for_warc_record(self, record): url = get_record_url(record) + normalized_url = normalize(url) if not url: logger.debug(f"Skipping record with empty WARC-Target-URI {record}") return - if canonicalize(url) in self.indexed_urls: + if normalized_url in self.indexed_urls: logger.debug("Skipping duplicate {0}, already added to ZIM".format(url)) return @@ -499,12 +502,14 @@ def add_items_for_warc_record(self, record): return try: - self.creator.add_item(WARCHeadersItem(record)) + self.creator.add_item(WARCHeadersItem("H/" + normalized_url, record)) except RuntimeError as exc: if not DUPLICATE_EXC_STR.match(str(exc)): raise exc - payload_item = WARCPayloadItem(record, self.head_insert, self.css_insert) + payload_item = WARCPayloadItem( + normalized_url, record, self.head_insert, self.css_insert + ) if len(payload_item.content) != 0: try: @@ -515,15 +520,15 @@ def add_items_for_warc_record(self, record): self.total_records += 1 self.update_stats() - self.indexed_urls.add(canonicalize(url)) + self.indexed_urls.add(normalized_url) elif ( record.rec_headers["WARC-Refers-To-Target-URI"] != url - and url not in self.revisits + and normalized_url not in self.revisits ): - self.revisits[url] = record + self.revisits[normalized_url] = record - self.add_fuzzy_match_record(url) + self.add_fuzzy_match_record(normalized_url) def add_fuzzy_match_record(self, url): fuzzy_url = url diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index a6587e06..76bf7dd1 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -16,9 +16,6 @@ from zimscraperlib.zim.items import StaticItem from zimscraperlib.zim.providers import StringProvider -from bs4 import BeautifulSoup - -from warc2zim.url_rewriting import canonicalize from warc2zim.utils import get_record_url, get_record_mime_type, parse_title # Shared logger @@ -36,13 +33,13 @@ class WARCHeadersItem(StaticItem): Usually 
stored under H namespace """ - def __init__(self, record): + def __init__(self, path, record): super().__init__() self.record = record - self.url = get_record_url(record) + self.path = path def get_path(self): - return "H/" + canonicalize(self.url) + return self.path def get_title(self): return "" @@ -68,10 +65,10 @@ class WARCPayloadItem(StaticItem): Usually stored under A namespace """ - def __init__(self, record, head_insert=None, css_insert=None): + def __init__(self, path, record, head_insert=None, css_insert=None): super().__init__() self.record = record - self.url = get_record_url(record) + self.path = path self.mimetype = get_record_mime_type(record) self.title = "" @@ -89,7 +86,7 @@ def __init__(self, record, head_insert=None, css_insert=None): self.content = CSS_INS.sub(css_insert, self.content) def get_path(self): - return "A/" + canonicalize(self.url) + return self.path def get_title(self): return self.title diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index b9595176..2add2d3f 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -5,10 +5,37 @@ """ warc2zim's url rewriting tools This module is about url and entry path rewriting. 
+ +The global scheme is the following: + +Entries are stored in the zim file using their urldecoded full path properly urlencoded (yes!): +- The full path is the full url without the scheme (ie : "/(?, quote_via=quote)` + """ import logging import re +from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode +from warc2zim.utils import to_string # Shared logger logger = logging.getLogger("warc2zim.url_rewriting") @@ -18,27 +45,27 @@ { "match": re.compile( # r"//.*googlevideo.com/(videoplayback\?).*(id=[^&]+).*([&]itag=[^&]+).*" - r"//.*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*" + r".*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*" ), - "replace": r"//youtube.fuzzy.replayweb.page/\1\2", + "replace": r"youtube.fuzzy.replayweb.page/\1\2", }, { "match": re.compile( - r"//(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)" + r"(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)" r".*(video_id=[^&]+).*" ), - "replace": r"//youtube.fuzzy.replayweb.page/\1\2", + "replace": r"youtube.fuzzy.replayweb.page/\1\2", }, {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"}, { "match": re.compile( - r"//(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*" + r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*" ), - "replace": r"//youtube.fuzzy.replayweb.page/\1?\2", + "replace": r"youtube.fuzzy.replayweb.page/\1?\2", }, { - "match": re.compile(r"//(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"), - "replace": r"//youtube.fuzzy.replayweb.page/embed/\1", + "match": re.compile(r"(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"), + "replace": r"youtube.fuzzy.replayweb.page/embed/\1", }, { "match": re.compile( @@ -53,14 +80,30 @@ ] -def canonicalize(url): - """Return a 'canonical' version of the url under which it is stored in the ZIM - For now, just removing the scheme http:// or https:// scheme +def normalize(url: str | bytes) -> str: + """Normalize a properly 
contructed url to a path to use as a entry's key. + + >>> normalize("http://exemple.com/path/to/article?foo=bar") + "exemple.com/path/to/article?foo=bar" + >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz") + "other.com/path to strange ar+t%3Ficle?foo=bar%20baz" """ - if url.startswith("https://"): - return url[8:] - if url.startswith("http://"): - return url[7:] + if not url: + return url + + url = to_string(url) + + url_parts = urlsplit(url) + url_parts = url_parts._replace(scheme="") + + # Remove the netloc (by moving it into path) + if url_parts.netloc: + new_path = url_parts.netloc + url_parts.path + url_parts = url_parts._replace(netloc="", path=new_path) + if url_parts.path and url_parts.path[0] == "/": + url_parts = url_parts._replace(path=url_parts.path[1:]) + + path = urlunsplit(url_parts) - return url + return path diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index 9e55b8ad..f12adefd 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -37,3 +37,11 @@ def parse_title(content): return soup.title.text or "" except Exception: return "" + + +def to_string(input: str | bytes) -> str: + try: + input = input.decode("utf8") + except AttributeError: + pass + return input diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index d46d9807..dfd37977 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -14,7 +14,7 @@ from jinja2 import Environment, PackageLoader from zimscraperlib.zim import Archive -from warc2zim.url_rewriting import canonicalize +from warc2zim.url_rewriting import normalize from warc2zim.converter import iter_warc_records from warc2zim.utils import get_record_url from warc2zim.main import main @@ -170,21 +170,40 @@ def verify_warc_and_zim(self, warcfile, zimfile): warc_urls.add(url) - def test_canonicalize(self): - assert canonicalize("http://example.com/?foo=bar") == "example.com/?foo=bar" + def test_normalize(self): + assert normalize(None) == None + assert 
normalize("") == "" + assert normalize("https://exemple.com") == "exemple.com" + assert normalize("https://exemple.com/") == "exemple.com/" + assert normalize("http://example.com/?foo=bar") == "example.com/?foo=bar" + assert normalize(b"http://example.com/?foo=bar") == "example.com/?foo=bar" - assert canonicalize("https://example.com/?foo=bar") == "example.com/?foo=bar" + assert normalize("https://example.com/?foo=bar") == "example.com/?foo=bar" assert ( - canonicalize("https://example.com/some/path/http://example.com/?foo=bar") + normalize("https://example.com/some/path/http://example.com/?foo=bar") == "example.com/some/path/http://example.com/?foo=bar" ) assert ( - canonicalize("example.com/some/path/http://example.com/?foo=bar") + normalize("example.com/some/path/http://example.com/?foo=bar") == "example.com/some/path/http://example.com/?foo=bar" ) + assert ( + normalize("http://example.com/path/with/final/slash/") + == "example.com/path/with/final/slash/" + ) + + assert normalize("http://test@example.com/") == "test@example.com/" + + assert ( + normalize( + "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493" + ) + == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493" + ) + def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): zim_output = "zim-out-filename.zim" main( @@ -222,7 +241,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): assert all_articles == { # entries from WARC - "A/example.com/": "Example Domain", + "example.com/": "Example Domain", "H/example.com/": "H/example.com/", # replay system files "A/index.html": "A/index.html", From e8122deb7376c2fdde934137a2ffdb92df098bc8 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 16:12:17 +0100 Subject: [PATCH 02/11] Directly store entries using their potentially reduced path. 
Before, we were storing an entry using its full path and potentially creating a redirect entry (using the reduced path) pointing to the full path entry. Now, path reduction is part of normalization and so we directly store entries using their (potentially) reduced path. Converted `test/data/video-vimeo.warc.gz` goes from: ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4 ``` to : ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 ``` Notice that `vod-progressive.akamaized.net` is not present. It is "replaced" by `vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4` which is now a plain entry instead of a redirect to `vod-progressive.akamaized.net[...]`. 
--- src/warc2zim/converter.py | 38 ++++------------------------------- src/warc2zim/main.py | 18 ----------------- src/warc2zim/url_rewriting.py | 17 +++++++++++++++- tests/test_warc_to_zim.py | 27 ++++++++++++++++--------- 4 files changed, 37 insertions(+), 63 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index ff7e31ee..5a45a6d2 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -5,19 +5,12 @@ """ warc2zim conversion utility This utility provides a conversion from WARC records to ZIM files. -The WARCs are converted in a 'lossless' way, no data from WARC records is lost. -Each WARC record results in two ZIM items: -- The WARC payload is stored under /A/ -- The WARC headers + HTTP headers are stored under the /H/ - -Given a WARC response record for 'https://example.com/', -two ZIM items are created /A/example.com/ and /H/example.com/ are created. - -Only WARC response and resource records are stored. +WARC record are directly stored in a zim file as: +- Response WARC record as item "normalized" +- Revisit record as alias (using "normalized" to) If the WARC contains multiple entries for the same URL, only the first entry is added, and later entries are ignored. A warning is printed as well. 
- """ import os @@ -49,7 +42,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter -from warc2zim.url_rewriting import FUZZY_RULES, normalize +from warc2zim.url_rewriting import normalize from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle from warc2zim.utils import ( get_version, @@ -528,29 +521,6 @@ def add_items_for_warc_record(self, record): ): self.revisits[normalized_url] = record - self.add_fuzzy_match_record(normalized_url) - - def add_fuzzy_match_record(self, url): - fuzzy_url = url - for rule in FUZZY_RULES: - fuzzy_url = rule["match"].sub(rule["replace"], url) - if fuzzy_url != url: - break - - if fuzzy_url == url: - return - - http_headers = StatusAndHeaders("302 Redirect", {"Location": url}) - - date = datetime.datetime.utcnow().isoformat() - builder = RecordBuilder() - record = builder.create_revisit_record( - fuzzy_url, "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", url, date, http_headers - ) - - self.revisits[fuzzy_url] = record - logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url)) - def iter_warc_records(inputs): """iter warc records, including appending request data to matching response""" diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 45bb7e9e..1da7bff3 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -2,24 +2,6 @@ # -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu -""" warc2zim conversion utility - -This utility provides a conversion from WARC records to ZIM files. -The WARCs are converted in a 'lossless' way, no data from WARC records is lost. -Each WARC record results in two ZIM items: -- The WARC payload is stored under /A/ -- The WARC headers + HTTP headers are stored under the /H/ - -Given a WARC response record for 'https://example.com/', -two ZIM items are created /A/example.com/ and /H/example.com/ are created. - -Only WARC response and resource records are stored. 
- -If the WARC contains multiple entries for the same URL, only the first entry is added, -and later entries are ignored. A warning is printed as well. - -""" - import sys import logging from argparse import ArgumentParser diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index 2add2d3f..0a0ced18 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -30,6 +30,10 @@ - The querystring part must be parsable by `urllib.parse.parse_qs` (even if we don't do it here) - The querystring must be generated as by `urllib.parse.urlencode(, quote_via=quote)` +On top of that, paths are "reduced" using fuzzy rules: +A path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value" +is reduced to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx" +by slightly simplifying the path and keeping only the usefull arguments in the querystring. """ import logging @@ -56,7 +60,7 @@ ), "replace": r"youtube.fuzzy.replayweb.page/\1\2", }, - {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"}, + {"match": re.compile(r"([^?]+\?)[\d]+$"), "replace": r"\1"}, { "match": re.compile( r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*" @@ -80,6 +84,14 @@ ] +def reduce(path: str) -> str: + """Reduce a path""" + for rule in FUZZY_RULES: + if match := rule["match"].match(path): + return match.expand(rule["replace"]) + return path + + def normalize(url: str | bytes) -> str: """Normalize a properly contructed url to a path to use as a entry's key. 
@@ -87,6 +99,8 @@ def normalize(url: str | bytes) -> str: "exemple.com/path/to/article?foo=bar" >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz") "other.com/path to strange ar+t%3Ficle?foo=bar%20baz" + >>> normalize("http://youtube.com/youtubei/bar?key=value&videoId=xxxx&otherKey=otherValue") + "youtube.fuzzy.replayweb.page/youtubei/bar?videoId=xxxx" """ if not url: @@ -105,5 +119,6 @@ def normalize(url: str | bytes) -> str: url_parts = url_parts._replace(path=url_parts.path[1:]) path = urlunsplit(url_parts) + path = reduce(path) return path diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index dfd37977..a39ce1a5 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -5,6 +5,7 @@ import os import time import json +import re from io import BytesIO import pytest @@ -55,22 +56,22 @@ def cmdline(request): { "filename": "video-yt.warc.gz", "entries": [ - "H/youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI", - "H/youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w", + "youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI", + "youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w", ], }, { "filename": "video-yt-2.warc.gz", "entries": [ - "H/youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI", - "H/youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD", + "youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI", + "youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD", ], }, { "filename": "video-vimeo.warc.gz", "entries": [ - "H/vimeo.fuzzy.replayweb.page/video/347119375", - "H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4", + "vimeo.fuzzy.replayweb.page/video/347119375", + "vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4", ], }, ] @@ -138,6 +139,12 @@ def 
verify_warc_and_zim(self, warcfile, zimfile): # parse headers as record, ensure headers match url_no_scheme = url.split("//", 2)[1] print(url_no_scheme) + + if "www.youtube.com/embed" in url_no_scheme: + # We know that those url are rewritten in zim. Don't check for them. + break + + url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme) parsed_record = next( ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme))) ) @@ -201,7 +208,7 @@ def test_normalize(self): normalize( "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493" ) - == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493" + == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?" ) def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): @@ -431,7 +438,7 @@ def test_all_warcs_root_dir(self, tmp_path): ) # timestamp fuzzy match from example-with-timestamp.warc - assert self.get_article(zim_output, "H/example.com/path.txt?") != b"" + assert self.get_article(zim_output, "example.com/path.txt?") != b"" def test_fuzzy_urls(self, tmp_path, fuzzycheck): zim_output = fuzzycheck["filename"] + ".zim" @@ -449,8 +456,8 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck): zim_output = tmp_path / zim_output for entry in fuzzycheck["entries"]: - res = self.get_article(zim_output, entry) - assert b"Location: " in res + # This should be item and get_article_raw is eq to getItem and it will fail if it is not a item + self.get_article_raw(zim_output, entry) def test_local_replay_viewer_url(self, tmp_path): zim_local_sw = "zim-local-sw.zim" From 54f8cd621e22520cfab8661d900d2d4f00901d9d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 16:16:31 +0100 Subject: [PATCH 03/11] Don't store Entry's header. We don't use it and we agree to not store them (at least for now). If we need them, we will see how to readd them. 
Converted `test/data/video-vimeo.warc.gz` goes from : ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js H/f.vimeocdn.com/p/3.45.3/css/player.css H/f.vimeocdn.com/p/3.45.3/js/player.js H/i.vimeocdn.com/player/354746.png?mw=200&mh=200 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 H/oembed.link/favicon.ico H/oembed.link/https://vimeo.com/347119375 H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 ``` to: ``` A/404.html A/index.html A/load.js A/sw.js A/topFrame.html f.vimeocdn.com/js_opt/modules/utils/vuid.min.js f.vimeocdn.com/p/3.45.3/css/player.css f.vimeocdn.com/p/3.45.3/js/player.js i.vimeocdn.com/player/354746.png?mw=200&mh=200 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85 i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70 oembed.link/favicon.ico oembed.link/https://vimeo.com/347119375 player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963 
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4 vimeo.fuzzy.replayweb.page/video/347119375 ``` --- src/warc2zim/converter.py | 6 ------ tests/test_warc_to_zim.py | 9 --------- 2 files changed, 15 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 5a45a6d2..8e60899c 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -494,12 +494,6 @@ def add_items_for_warc_record(self, record): logger.debug("Skipping self-redirect: " + url) return - try: - self.creator.add_item(WARCHeadersItem("H/" + normalized_url, record)) - except RuntimeError as exc: - if not DUPLICATE_EXC_STR.match(str(exc)): - raise exc - payload_item = WARCPayloadItem( normalized_url, record, self.head_insert, self.css_insert ) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index a39ce1a5..19b286da 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -353,15 +353,6 @@ def test_skip_self_redirect(self, tmp_path): zim_output = tmp_path / zim_output - for article in self.list_articles(zim_output): - url = article.path - if url.startswith("H/"): - # ensure there is only one H/ record, and its a 200 (not 301) - assert url == "H/kiwix.org/" - assert b"HTTP/1.1 200 OK" in self.get_article( - zim_output, "H/kiwix.org/" - ) - def test_include_domains_favicon_and_language(self, tmp_path): zim_output = "spt.zim" main( From 1cacf88d5b4c09631a51fed3c62213f5aa875a92 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 14 Nov 2023 16:37:40 +0100 Subject: [PATCH 04/11] Store revisits as alias instead of WARCHeadersItem. We don't need `WARCHeadersItem` anymore. 
--- src/warc2zim/converter.py | 27 ++++++++++++++------------- src/warc2zim/items.py | 33 --------------------------------- tests/test_warc_to_zim.py | 16 ++++++---------- 3 files changed, 20 insertions(+), 56 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 8e60899c..18ec5576 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -43,7 +43,7 @@ from cdxj_indexer import iter_file_or_dir, buffering_record_iter from warc2zim.url_rewriting import normalize -from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle +from warc2zim.items import WARCPayloadItem, StaticArticle from warc2zim.utils import ( get_version, get_record_url, @@ -75,6 +75,11 @@ re.MULTILINE | re.DOTALL, ) +ALIAS_EXC_STR = re.compile( + r"^Impossible to alias(.+)" r"(.+) doesn't exist.", + re.MULTILINE | re.DOTALL, +) + class Converter: def __init__(self, args): @@ -286,20 +291,14 @@ def run(self): for record in self.iter_all_warc_records(): self.add_items_for_warc_record(record) - # process revisits, headers only - for normalized_url, record in self.revisits.items(): + # process revisits + for normalized_url, target_url in self.revisits.items(): if normalized_url not in self.indexed_urls: - logger.debug( - "Adding revisit {0} -> {1}".format( - normalized_url, record.rec_headers["WARC-Refers-To-Target-URI"] - ) - ) + logger.debug(f"Adding alias {normalized_url} -> {target_url}") try: - self.creator.add_item( - WARCHeadersItem("H/" + normalized_url, record) - ) + self.creator.add_alias(normalized_url, "", target_url, {}) except RuntimeError as exc: - if not DUPLICATE_EXC_STR.match(str(exc)): + if not ALIAS_EXC_STR.match(str(exc)): raise exc self.indexed_urls.add(normalized_url) @@ -513,7 +512,9 @@ def add_items_for_warc_record(self, record): record.rec_headers["WARC-Refers-To-Target-URI"] != url and normalized_url not in self.revisits ): - self.revisits[normalized_url] = record + self.revisits[normalized_url] = normalize( + 
record.rec_headers["WARC-Refers-To-Target-URI"] + ) def iter_warc_records(inputs): diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 76bf7dd1..d9ea26f8 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -14,7 +14,6 @@ from libzim.writer import Hint from zimscraperlib.types import get_mime_for_name from zimscraperlib.zim.items import StaticItem -from zimscraperlib.zim.providers import StringProvider from warc2zim.utils import get_record_url, get_record_mime_type, parse_title @@ -28,38 +27,6 @@ CSS_INS = re.compile(b"()", re.I) -class WARCHeadersItem(StaticItem): - """WARCHeadersItem used to store the WARC + HTTP headers as text - Usually stored under H namespace - """ - - def __init__(self, path, record): - super().__init__() - self.record = record - self.path = path - - def get_path(self): - return self.path - - def get_title(self): - return "" - - def get_mimetype(self): - return "application/warc-headers" - - def get_hints(self): - return {Hint.FRONT_ARTICLE: False} - - def get_contentprovider(self): - # add WARC headers - buff = self.record.rec_headers.to_bytes(encoding="utf-8") - # add HTTP headers, if present - if self.record.http_headers: - buff += self.record.http_headers.to_bytes(encoding="utf-8") - - return StringProvider(content=buff, ref=self) - - class WARCPayloadItem(StaticItem): """WARCPayloadItem used to store the WARC payload Usually stored under A namespace diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 19b286da..246b96ad 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -145,12 +145,6 @@ def verify_warc_and_zim(self, warcfile, zimfile): break url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme) - parsed_record = next( - ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme))) - ) - - assert record.rec_headers == parsed_record.rec_headers - assert record.http_headers == parsed_record.http_headers # ensure payloads match try: @@ -158,10 +152,13 @@ def 
verify_warc_and_zim(self, warcfile, zimfile): except KeyError: payload = None - if record.rec_type == "revisit" or ( - record.http_headers and record.http_headers.get("Content-Length") == "0" - ): + if record.http_headers and record.http_headers.get("Content-Length") == "0": assert not payload + elif record.rec_type == "revisit": + # We must have a payload + # We should check with the content of the targeted record... + # But difficult to test as we don't have it + assert payload else: payload_content = payload.content.tobytes() @@ -249,7 +246,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): assert all_articles == { # entries from WARC "example.com/": "Example Domain", - "H/example.com/": "H/example.com/", # replay system files "A/index.html": "A/index.html", "A/load.js": "A/load.js", From 09f55eba86c3667841f456f2e287480ef6cefc4d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 26 May 2023 15:39:38 +0300 Subject: [PATCH 05/11] Do not add service worker stuff in zim file. Remove other unnecessary files. 
--- setup.py | 13 ---- src/warc2zim/converter.py | 41 +++--------- src/warc2zim/main.py | 6 -- src/warc2zim/templates/404.html | 43 ------------- src/warc2zim/templates/index.html | 1 - src/warc2zim/templates/load.js | 69 -------------------- src/warc2zim/templates/sw_check.html | 23 ------- src/warc2zim/templates/topFrame.html | 94 ---------------------------- tests/test_warc_to_zim.py | 66 ++----------------- 9 files changed, 11 insertions(+), 345 deletions(-) delete mode 100644 src/warc2zim/templates/404.html delete mode 100644 src/warc2zim/templates/load.js delete mode 100644 src/warc2zim/templates/sw_check.html delete mode 100644 src/warc2zim/templates/topFrame.html diff --git a/setup.py b/setup.py index 6c2bd52c..320803a9 100644 --- a/setup.py +++ b/setup.py @@ -14,19 +14,6 @@ def read(*names, **kwargs): return fh.read() -REPLAY_SOURCE_URL = "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/" - - -def download_replay(name): - print("Downloading " + REPLAY_SOURCE_URL + name) - with urllib.request.urlopen(REPLAY_SOURCE_URL + name) as response: # nosec - with open(root_dir.joinpath("src", "warc2zim", "templates", name), "wb") as fh: - fh.write(response.read()) - - -download_replay("sw.js") - - def get_package_data(): pkgs = ["templates/*"] diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 18ec5576..62423658 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -57,11 +57,8 @@ # HTML mime types HTML_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml") -# external sw.js filename -SW_JS = "sw.js" - # head insert template -HEAD_INSERT_FILE = "sw_check.html" +HEAD_INSERT_FILE = None # Default ZIM metadata tags DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"] @@ -128,7 +125,6 @@ def __init__(self, args): self.inputs = args.inputs self.include_domains = args.include_domains - self.replay_viewer_source = args.replay_viewer_source self.custom_css = args.custom_css self.indexed_urls = 
set({}) @@ -143,30 +139,6 @@ def __init__(self, args): self.written_records = self.total_records = 0 - def add_replayer(self): - if self.replay_viewer_source and re.match( - r"^https?\:", self.replay_viewer_source - ): - self.creator.add_item( - URLItem( - url=self.replay_viewer_source + SW_JS, - path="A/" + SW_JS, - mimetype="application/javascript", - ) - ) - elif self.replay_viewer_source: - self.creator.add_item_for( - fpath=self.replay_viewer_source + SW_JS, - path="A/" + SW_JS, - mimetype="application/javascript", - ) - else: - self.creator.add_item( - StaticArticle( - self.env, SW_JS, self.main_url, mimetype="application/javascript" - ) - ) - def init_env(self): # autoescape=False to allow injecting html entities from translated text env = Environment( @@ -250,8 +222,11 @@ def run(self): self.env = self.init_env() # init head insert - template = self.env.get_template(HEAD_INSERT_FILE) - self.head_insert = ("" + template.render()).encode("utf-8") + if HEAD_INSERT_FILE: + template = self.env.get_template(HEAD_INSERT_FILE) + self.head_insert = ("" + template.render()).encode("utf-8") + else: + self.head_insert = b"" if self.custom_css: self.css_insert = ( f'\n - - - - - -

- - - diff --git a/src/warc2zim/templates/index.html b/src/warc2zim/templates/index.html index 7e750cb7..bf16af2b 100644 --- a/src/warc2zim/templates/index.html +++ b/src/warc2zim/templates/index.html @@ -4,7 +4,6 @@ -
diff --git a/src/warc2zim/templates/load.js b/src/warc2zim/templates/load.js deleted file mode 100644 index db7c4fa3..00000000 --- a/src/warc2zim/templates/load.js +++ /dev/null @@ -1,69 +0,0 @@ -async function main() { - const sw = navigator.serviceWorker; - - if (!sw) { - - let msg; - // check if service worker doesn't work due to http loading - if (window.location.protocol === "http:" && window.location.hostname !== "localhost") { - const httpsUrl = window.location.href.replace("http:", "https:"); - document.querySelector("#error").innerHTML = "

{{ _("This page must be loaded via an HTTPS URL to support service workers.") }}

" + - `{{ _("Try Loading HTTPS URL?") }}`; - // otherwise, assume service worker not available at all - } else { - document.querySelector("#error").innerHTML = `

{{ _("Error") }}

\n -

{{ _("The requested URL can not be loaded because service workers are not supported here.") }}

-

{{ _("If you use Firefox in Private Mode, try regular mode instead.") }}

-

{{ _("If you use Kiwix-Serve locally, replace the IP in your browser address bar with localhost.") }}

`; - } - - document.querySelector("#loading").style.display = "none"; - return; - } - - // finds '/A/' followed by a domain name with a . - var prefix = window.location.href.slice(0, window.location.href.search(/[/]A[/][^/]+[.]/)); - - const name = prefix.slice(prefix.lastIndexOf("/") + 1).replace(/[\W]+/, ""); - - prefix += "/A/"; - - await sw.register("./sw.js?replayPrefix=&root=" + name, {scope: prefix}); - - sw.addEventListener("message", (event) => { - if (event.data.msg_type === "collAdded" && event.data.name === name) { - if (window.location.hash && window.location.hash.startsWith("#redirect=")) { - prefix += decodeURIComponent(window.location.hash.slice("#redirect=".length)); - } else { - const inx = window.mainUrl.indexOf("//"); - prefix += inx >= 0 ? window.mainUrl.slice(inx + 2) : window.mainUrl; - } - - console.log("final: " + prefix); - window.location.href = prefix; - } - }); - - await new Promise((resolve) => { - if (!sw.controller) { - sw.addEventListener("controllerchange", () => { - resolve(); - }); - } else { - resolve(); - } - }); - - sw.controller.postMessage({ - msg_type: "addColl", - name: name, - file: {"sourceUrl": "proxy:../"}, - root: true, - skipExisting: false, - extraConfig: {"sourceType": "kiwix", notFoundPageUrl: "./404.html"}, - topTemplateUrl: "./topFrame.html" - }); -} - -window.addEventListener("load", main); - diff --git a/src/warc2zim/templates/sw_check.html b/src/warc2zim/templates/sw_check.html deleted file mode 100644 index d759f4ca..00000000 --- a/src/warc2zim/templates/sw_check.html +++ /dev/null @@ -1,23 +0,0 @@ - diff --git a/src/warc2zim/templates/topFrame.html b/src/warc2zim/templates/topFrame.html deleted file mode 100644 index 59428c5e..00000000 --- a/src/warc2zim/templates/topFrame.html +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - - - - - - diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 246b96ad..92c93a70 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -112,7 +112,8 
@@ def verify_warc_and_zim(self, warcfile, zimfile): autoescape=False, ) - head_insert = env.get_template("sw_check.html").render().encode("utf-8") + # [TOFIX] + head_insert = b"" # track to avoid checking duplicates, which are not written to ZIM warc_urls = set() @@ -220,8 +221,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): str(tmp_path), "--zim-file", zim_output, - "-r", - "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/", "--tags", "some", "--tags", @@ -248,10 +247,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): "example.com/": "Example Domain", # replay system files "A/index.html": "A/index.html", - "A/load.js": "A/load.js", - "A/404.html": "A/404.html", - "A/sw.js": "A/sw.js", - "A/topFrame.html": "A/topFrame.html", } zim_fh = Archive(zim_output) @@ -408,18 +403,13 @@ def test_all_warcs_root_dir(self, tmp_path): # check articles from different warc records in tests/data dir - # ensure trailing slash added - assert b'window.mainUrl = "http://example.com/"' in self.get_article( - zim_output, "A/index.html" - ) - # from example.warc.gz - assert self.get_article(zim_output, "A/example.com/") != b"" + assert self.get_article(zim_output, "example.com/") != b"" # from single-page-test.warc assert ( self.get_article( - zim_output, "A/lesfondamentaux.reseau-canope.fr/accueil.html" + zim_output, "lesfondamentaux.reseau-canope.fr/accueil.html" ) != b"" ) @@ -446,54 +436,6 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck): # This should be item and get_article_raw is eq to getItem and it will fail if it is not a item self.get_article_raw(zim_output, entry) - def test_local_replay_viewer_url(self, tmp_path): - zim_local_sw = "zim-local-sw.zim" - - res = requests.get( - "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/sw.js" - ) - - with open(tmp_path / "sw.js", "wt") as fh: - fh.write(res.text) - - main( - [ - "-v", - os.path.join(TEST_DATA_DIR, "example-response.warc"), - "-r", - str(tmp_path) + 
"/", - "--output", - str(tmp_path), - "--name", - "local-sw", - "--zim-file", - zim_local_sw, - ] - ) - - assert os.path.isfile(tmp_path / zim_local_sw) - - def test_error_bad_replay_viewer_url(self, tmp_path): - zim_output_not_created = "zim-out-not-created.zim" - with pytest.raises(Exception) as e: - main( - [ - "-v", - os.path.join(TEST_DATA_DIR, "example-response.warc"), - "-r", - "x-invalid-x", - "--output", - str(tmp_path), - "--name", - "bad", - "--zim-file", - zim_output_not_created, - ] - ) - - # zim file should not have been created since replay viewer could not be loaded - assert not os.path.isfile(tmp_path / zim_output_not_created) - def test_error_bad_main_page(self, tmp_path): zim_output_not_created = "zim-out-not-created.zim" with pytest.raises(Exception) as e: From 1113c1a693a3396a38b56996465e3f4e19684d38 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 15 Nov 2023 17:12:26 +0100 Subject: [PATCH 06/11] Properly set the main page. --- src/warc2zim/converter.py | 15 ++++++++------- src/warc2zim/templates/index.html | 28 ---------------------------- tests/test_warc_to_zim.py | 4 +--- 3 files changed, 9 insertions(+), 38 deletions(-) delete mode 100644 src/warc2zim/templates/index.html diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 62423658..0432f307 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -86,14 +86,14 @@ def __init__(self, args): else: logger.setLevel(logging.INFO) - self.main_url = args.url + main_url = args.url # ensure trailing slash is added if missing - parts = urlsplit(self.main_url) + parts = urlsplit(main_url) if parts.path == "": parts = list(parts) # set path parts[2] = "/" - self.main_url = urlunsplit(parts) + main_url = urlunsplit(parts) self.name = args.name self.title = args.title @@ -104,9 +104,10 @@ def __init__(self, args): self.creator_metadata = args.creator self.publisher = args.publisher self.tags = DEFAULT_TAGS + (args.tags or []) - self.source = args.source or 
self.main_url + self.source = args.source or main_url self.scraper = "warc2zim " + get_version() self.illustration = b"" + self.main_url = normalize(main_url) self.output = args.output self.zim_file = args.zim_file @@ -237,7 +238,7 @@ def run(self): self.creator = Creator( self.full_filename, - main_path="A/index.html", + main_path=self.main_url, ) self.creator.config_metadata( @@ -306,9 +307,9 @@ def find_main_page_metadata(self): or record.http_headers.get_statuscode() == "200" ) ): - self.main_url = url + self.main_url = normalize(url) - if urldefrag(self.main_url).url != url: + if urldefrag(self.main_url).url != normalize(url): continue # if we get here, found record for the main page diff --git a/src/warc2zim/templates/index.html b/src/warc2zim/templates/index.html deleted file mode 100644 index bf16af2b..00000000 --- a/src/warc2zim/templates/index.html +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - -
- - - - - - - - - -
-
- - diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 92c93a70..39097e14 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -244,9 +244,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): assert all_articles == { # entries from WARC - "example.com/": "Example Domain", - # replay system files - "A/index.html": "A/index.html", + "example.com/": "Example Domain" } zim_fh = Archive(zim_output) From 7355b800939fef4bf53dbd7a228d87695263b1b8 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 17 Nov 2023 10:59:48 +0100 Subject: [PATCH 07/11] Store static content in a `_zim_static/` subdir instead of `A/`. We don't have anything now in `A/` or `H/` subdirs. Remove the left over `A/` in test urls (was working thanks to libzim's compatibility layer) --- src/warc2zim/items.py | 2 +- tests/test_warc_to_zim.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index d9ea26f8..dededd02 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -81,7 +81,7 @@ def __init__(self, env, filename, main_url, **kwargs): ).decode("utf-8") def get_path(self): - return "A/" + self.filename + return "_zim_static/" + self.filename def get_mimetype(self): return self.mime diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 39097e14..0468acd2 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -149,7 +149,7 @@ def verify_warc_and_zim(self, warcfile, zimfile): # ensure payloads match try: - payload = zim_fh.get_item("A/" + url_no_scheme) + payload = zim_fh.get_item(url_no_scheme) except KeyError: payload = None @@ -323,8 +323,8 @@ def test_same_domain_only(self, tmp_path): for article in self.list_articles(zim_output): url = article.path # ignore the replay files, which have only one path segment - if url.startswith("A/") and len(url.split("/")) > 2: - assert url.startswith("A/example.com/") + 
if not url.startswith("_zim_static/"): + assert url.startswith("example.com/") def test_skip_self_redirect(self, tmp_path): zim_output = "self-redir.zim" @@ -363,7 +363,7 @@ def test_include_domains_favicon_and_language(self, tmp_path): for article in self.list_articles(zim_output): url = article.path # ignore the replay files, which have only one path segment - if url.startswith("A/") and len(url.split("/")) > 2: + if not url.startswith("_zim_static/"): assert "reseau-canope.fr/" in url # test detected language @@ -372,7 +372,7 @@ def test_include_domains_favicon_and_language(self, tmp_path): # test detected favicon assert self.get_article( zim_output, - "A/lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico", + "lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico", ) assert self.get_metadata(zim_output, "Illustration_48x48@1") @@ -488,10 +488,10 @@ def test_custom_css(self, tmp_path): ) zim_output = tmp_path / zim_output - res = self.get_article(zim_output, "A/example.com/") + res = self.get_article(zim_output, "example.com/") assert "https://warc2zim.kiwix.app/custom.css".encode("utf-8") in res - res = self.get_article(zim_output, "A/warc2zim.kiwix.app/custom.css") + res = self.get_article(zim_output, "warc2zim.kiwix.app/custom.css") assert custom_css == res def test_custom_css_remote(self, tmp_path): @@ -515,8 +515,8 @@ def test_custom_css_remote(self, tmp_path): ) zim_output = tmp_path / zim_output - res = self.get_article(zim_output, "A/example.com/") + res = self.get_article(zim_output, "example.com/") assert "https://warc2zim.kiwix.app/custom.css".encode("utf-8") in res - res = self.get_article(zim_output, "A/warc2zim.kiwix.app/custom.css") + res = self.get_article(zim_output, "warc2zim.kiwix.app/custom.css") assert res == requests.get(url).content From e86f2b75a3e2a7a291fcbb7b25c5c67e14ccd1b7 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 8 Dec 2023 15:44:03 +0100 Subject: [PATCH 08/11] Deactivate searching file 
to add in `templates` directory. We don't have files to add (and so, no directory). --- src/warc2zim/converter.py | 19 +++++++++++++------ tests/test_warc_to_zim.py | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 0432f307..9870401a 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -142,8 +142,12 @@ def __init__(self, args): def init_env(self): # autoescape=False to allow injecting html entities from translated text + + # We don't have any files in templates directory. + # So `templates` directory doesn't exist and pkg_resources complains about that. + # Comment this part until we readd new file in `templates` directory env = Environment( - loader=PackageLoader("warc2zim", "templates"), + # loader=PackageLoader("warc2zim", "templates"), extensions=["jinja2.ext.i18n"], autoescape=False, ) @@ -256,11 +260,14 @@ def run(self): Scraper=f"warc2zim {get_version()}", ).start() - for filename in pkg_resources.resource_listdir("warc2zim", "templates"): - if filename == HEAD_INSERT_FILE: - continue - - self.creator.add_item(StaticArticle(self.env, filename, self.main_url)) + # We don't have any files in templates directory. + # So `templates` directory doesn't exist and pkg_resources complains about that. 
+ # Comment this part until we readd new file in `templates` directory + # for filename in pkg_resources.resource_listdir("warc2zim", "templates"): + # if filename == HEAD_INSERT_FILE: + # continue + # + # self.creator.add_item(StaticArticle(self.env, filename, self.main_url)) for record in self.iter_all_warc_records(): self.add_items_for_warc_record(record) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 0468acd2..f2f18a18 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -107,7 +107,7 @@ def verify_warc_and_zim(self, warcfile, zimfile): # autoescape=False to allow injecting html entities from translated text env = Environment( - loader=PackageLoader("warc2zim", "templates"), + # loader=PackageLoader("warc2zim", "templates"), extensions=["jinja2.ext.i18n"], autoescape=False, ) From 49c49eca611b405f806d0d6ad2ba0ca0cc93fc50 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 8 Dec 2023 15:59:18 +0100 Subject: [PATCH 09/11] Python 3.7 is not supported. Assignment expression[*] is new in python 3.8 And python 3.7 is already end of life. Also add testing on 3.12. 
[*]https://docs.python.org/3/whatsnew/3.8.html#assignment-expressions --- .github/workflows/ci.yaml | 2 +- requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f8811f4e..7b73f24a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -8,7 +8,7 @@ jobs: strategy: max-parallel: 3 matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: checkout diff --git a/requirements.txt b/requirements.txt index 97b2fc17..1ed3000b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ beautifulsoup4==4.9.3 zimscraperlib==3.1.1 Babel==2.12.1 jinja2==3.1.2 +setuptools==68.2.2 # to support possible brotli content in warcs brotlipy==0.7.0 cdxj_indexer==1.4.5 From f706533c81a66402c6461594d17cb51a3767aaa7 Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Sat, 16 Dec 2023 20:22:24 +0100 Subject: [PATCH 10/11] Requires zimscraperlib 3.2.0 (with alias feature) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1ed3000b..7888bfdf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ warcio==1.7.4 requests==2.31.0 beautifulsoup4==4.9.3 -zimscraperlib==3.1.1 +zimscraperlib==3.2.0 Babel==2.12.1 jinja2==3.1.2 setuptools==68.2.2 From 0bfddc9216a1807c4c3266db3b0f80b633735275 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Mon, 18 Dec 2023 08:44:13 +0000 Subject: [PATCH 11/11] import future annotations as we support py3.8+ --- src/warc2zim/url_rewriting.py | 2 ++ src/warc2zim/utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py index 0a0ced18..37b11ee3 100644 --- a/src/warc2zim/url_rewriting.py +++ b/src/warc2zim/url_rewriting.py @@ -36,6 +36,8 @@ by slightly simplifying the path and keeping only the usefull arguments in the 
querystring. """ +from __future__ import annotations + import logging import re from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index f12adefd..eea1dedc 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- # vim: ai ts=4 sts=4 et sw=4 nu +from __future__ import annotations + import pkg_resources from bs4 import BeautifulSoup