From eaa4fa2ce505101594b760b9b513f3c257e4c787 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Tue, 14 Nov 2023 16:00:08 +0100
Subject: [PATCH 01/11] Introduce `normalize` and normalization schema.

Properly define how we store entries in zim file.

We introduce the `normalize` helper function in place of `canonicalize`.

Normalization is now done at the converter level, so we pass the path to
the items instead of letting them call `normalize` themselves.

Before this change, `test/data/video-vimeo.warc.gz` converted to zim contained:

```
A/404.html
A/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
A/f.vimeocdn.com/p/3.45.3/css/player.css
A/f.vimeocdn.com/p/3.45.3/js/player.js
A/i.vimeocdn.com/player/354746.png?mw=200&mh=200
A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
A/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
A/index.html
A/load.js
A/oembed.link/favicon.ico
A/oembed.link/https://vimeo.com/347119375
A/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
A/sw.js
A/topFrame.html
A/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
H/f.vimeocdn.com/p/3.45.3/css/player.css
H/f.vimeocdn.com/p/3.45.3/js/player.js
H/i.vimeocdn.com/player/354746.png?mw=200&mh=200
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
H/oembed.link/favicon.ico
H/oembed.link/https://vimeo.com/347119375
H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
H/vimeo.fuzzy.replayweb.page/video/347119375
H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
```

With this change it contains:

```
A/404.html
A/index.html
A/load.js
A/sw.js
A/topFrame.html
H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
H/f.vimeocdn.com/p/3.45.3/css/player.css
H/f.vimeocdn.com/p/3.45.3/js/player.js
H/i.vimeocdn.com/player/354746.png?mw=200&mh=200
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
H/oembed.link/favicon.ico
H/oembed.link/https://vimeo.com/347119375
H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
f.vimeocdn.com/p/3.45.3/css/player.css
f.vimeocdn.com/p/3.45.3/js/player.js
i.vimeocdn.com/player/354746.png?mw=200&mh=200
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
oembed.link/favicon.ico
oembed.link/https://vimeo.com/347119375
player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
vimeo.fuzzy.replayweb.page/video/347119375
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
```
---
 src/warc2zim/converter.py     | 33 ++++++++-------
 src/warc2zim/items.py         | 15 +++----
 src/warc2zim/url_rewriting.py | 75 +++++++++++++++++++++++++++--------
 src/warc2zim/utils.py         |  8 ++++
 tests/test_warc_to_zim.py     | 33 +++++++++++----
 5 files changed, 118 insertions(+), 46 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index a7387b51..ff7e31ee 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -49,7 +49,7 @@
 
 from cdxj_indexer import iter_file_or_dir, buffering_record_iter
 
-from warc2zim.url_rewriting import FUZZY_RULES, canonicalize
+from warc2zim.url_rewriting import FUZZY_RULES, normalize
 from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle
 from warc2zim.utils import (
     get_version,
@@ -294,19 +294,21 @@ def run(self):
             self.add_items_for_warc_record(record)
 
         # process revisits, headers only
-        for url, record in self.revisits.items():
-            if canonicalize(url) not in self.indexed_urls:
+        for normalized_url, record in self.revisits.items():
+            if normalized_url not in self.indexed_urls:
                 logger.debug(
                     "Adding revisit {0} -> {1}".format(
-                        url, record.rec_headers["WARC-Refers-To-Target-URI"]
+                        normalized_url, record.rec_headers["WARC-Refers-To-Target-URI"]
                     )
                 )
                 try:
-                    self.creator.add_item(WARCHeadersItem(record))
+                    self.creator.add_item(
+                        WARCHeadersItem("H/" + normalized_url, record)
+                    )
                 except RuntimeError as exc:
                     if not DUPLICATE_EXC_STR.match(str(exc)):
                         raise exc
-                self.indexed_urls.add(canonicalize(url))
+                self.indexed_urls.add(normalized_url)
 
         logger.debug(f"Found {self.total_records} records in WARCs")
 
@@ -472,15 +474,16 @@ def is_self_redirect(self, record, url):
             return False
 
         location = record.http_headers.get("Location", "")
-        return canonicalize(url) == canonicalize(location)
+        return normalize(url) == normalize(location)
 
     def add_items_for_warc_record(self, record):
         url = get_record_url(record)
+        normalized_url = normalize(url)
         if not url:
             logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
             return
 
-        if canonicalize(url) in self.indexed_urls:
+        if normalized_url in self.indexed_urls:
             logger.debug("Skipping duplicate {0}, already added to ZIM".format(url))
             return
 
@@ -499,12 +502,14 @@ def add_items_for_warc_record(self, record):
                 return
 
             try:
-                self.creator.add_item(WARCHeadersItem(record))
+                self.creator.add_item(WARCHeadersItem("H/" + normalized_url, record))
             except RuntimeError as exc:
                 if not DUPLICATE_EXC_STR.match(str(exc)):
                     raise exc
 
-            payload_item = WARCPayloadItem(record, self.head_insert, self.css_insert)
+            payload_item = WARCPayloadItem(
+                normalized_url, record, self.head_insert, self.css_insert
+            )
 
             if len(payload_item.content) != 0:
                 try:
@@ -515,15 +520,15 @@ def add_items_for_warc_record(self, record):
                 self.total_records += 1
                 self.update_stats()
 
-            self.indexed_urls.add(canonicalize(url))
+            self.indexed_urls.add(normalized_url)
 
         elif (
             record.rec_headers["WARC-Refers-To-Target-URI"] != url
-            and url not in self.revisits
+            and normalized_url not in self.revisits
         ):
-            self.revisits[url] = record
+            self.revisits[normalized_url] = record
 
-        self.add_fuzzy_match_record(url)
+        self.add_fuzzy_match_record(normalized_url)
 
     def add_fuzzy_match_record(self, url):
         fuzzy_url = url
diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
index a6587e06..76bf7dd1 100644
--- a/src/warc2zim/items.py
+++ b/src/warc2zim/items.py
@@ -16,9 +16,6 @@
 from zimscraperlib.zim.items import StaticItem
 from zimscraperlib.zim.providers import StringProvider
 
-from bs4 import BeautifulSoup
-
-from warc2zim.url_rewriting import canonicalize
 from warc2zim.utils import get_record_url, get_record_mime_type, parse_title
 
 # Shared logger
@@ -36,13 +33,13 @@ class WARCHeadersItem(StaticItem):
     Usually stored under H namespace
     """
 
-    def __init__(self, record):
+    def __init__(self, path, record):
         super().__init__()
         self.record = record
-        self.url = get_record_url(record)
+        self.path = path
 
     def get_path(self):
-        return "H/" + canonicalize(self.url)
+        return self.path
 
     def get_title(self):
         return ""
@@ -68,10 +65,10 @@ class WARCPayloadItem(StaticItem):
     Usually stored under A namespace
     """
 
-    def __init__(self, record, head_insert=None, css_insert=None):
+    def __init__(self, path, record, head_insert=None, css_insert=None):
         super().__init__()
         self.record = record
-        self.url = get_record_url(record)
+        self.path = path
         self.mimetype = get_record_mime_type(record)
         self.title = ""
 
@@ -89,7 +86,7 @@ def __init__(self, record, head_insert=None, css_insert=None):
                 self.content = CSS_INS.sub(css_insert, self.content)
 
     def get_path(self):
-        return "A/" + canonicalize(self.url)
+        return self.path
 
     def get_title(self):
         return self.title
diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index b9595176..2add2d3f 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -5,10 +5,37 @@
 """ warc2zim's url rewriting tools
 
 This module is about url and entry path rewriting.
+
+The global scheme is the following:
+
+Entries are stored in the zim file using their urldecoded full path properly urlencoded (yes!):
+- The full path is the full url without the scheme (ie : "<host>/<path>(?<query_string)")
+  The scheme information is lost. We will serve the content using the scheme of the real server,
+  whatever was the scheme of the original url.
+  We probably don't care about different content served from different scheme but with same `host/path`.
+- urldecoded: As most as possible the path itself must not be urlencoded:
+  . This is valid : "foo/part with space/bar?key=value"
+  . This is NOT valid : "foo/part%20with%20space/bar%3Fkey%3Dvalue"
+- Properly urlencoded: However, for correct parsing, some character may still need to be encoded.
+  The querystring components (and others) must be url encoded as needed:
+  . This is valid : "foo/part/file with %3F and +?who=Chip%26Dale&question=Is%20there%20any%20%2B%20here%3F"
+  . This is NOT valid : "foo/part/file with ? and +?who=Chip&Dale&question=It there any + here?"
+- Space in query string must be encoded with `%20` not `+`:
+  . This is valid : "foo/part/file?question=Is%20there%20any%20%2B%20here%3F"
+  . This is NOT valid : "foo/part/file?question=Is+there+any+%2B+here%3F"
+
+In python words :
+- full path are `urllib.parse.ParseResults` with `scheme==''`
+- `urllib.parse.urlparse` must correctly parse the path (generating `ParseResults` with empty scheme)
+- The querystring part must be parsable by `urllib.parse.parse_qs` (even if we don't do it here)
+- The querystring must be generated as by `urllib.parse.urlencode(<query>, quote_via=quote)`
+
 """
 
 import logging
 import re
+from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode
+from warc2zim.utils import to_string
 
 # Shared logger
 logger = logging.getLogger("warc2zim.url_rewriting")
@@ -18,27 +45,27 @@
     {
         "match": re.compile(
             # r"//.*googlevideo.com/(videoplayback\?).*(id=[^&]+).*([&]itag=[^&]+).*"
-            r"//.*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*"
+            r".*googlevideo.com/(videoplayback\?).*((?<=[?&])id=[^&]+).*"
         ),
-        "replace": r"//youtube.fuzzy.replayweb.page/\1\2",
+        "replace": r"youtube.fuzzy.replayweb.page/\1\2",
     },
     {
         "match": re.compile(
-            r"//(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)"
+            r"(?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?)"
             r".*(video_id=[^&]+).*"
         ),
-        "replace": r"//youtube.fuzzy.replayweb.page/\1\2",
+        "replace": r"youtube.fuzzy.replayweb.page/\1\2",
     },
     {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"},
     {
         "match": re.compile(
-            r"//(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*"
+            r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*"
         ),
-        "replace": r"//youtube.fuzzy.replayweb.page/\1?\2",
+        "replace": r"youtube.fuzzy.replayweb.page/\1?\2",
     },
     {
-        "match": re.compile(r"//(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"),
-        "replace": r"//youtube.fuzzy.replayweb.page/embed/\1",
+        "match": re.compile(r"(?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).*"),
+        "replace": r"youtube.fuzzy.replayweb.page/embed/\1",
     },
     {
         "match": re.compile(
@@ -53,14 +80,30 @@
 ]
 
 
-def canonicalize(url):
-    """Return a 'canonical' version of the url under which it is stored in the ZIM
-    For now, just removing the scheme http:// or https:// scheme
+def normalize(url: str | bytes) -> str:
+    """Normalize a properly contructed url to a path to use as a entry's key.
+
+    >>> normalize("http://exemple.com/path/to/article?foo=bar")
+    "exemple.com/path/to/article?foo=bar"
+    >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz")
+    "other.com/path to strange ar+t%3Ficle?foo=bar%20baz"
     """
-    if url.startswith("https://"):
-        return url[8:]
 
-    if url.startswith("http://"):
-        return url[7:]
+    if not url:
+        return url
+
+    url = to_string(url)
+
+    url_parts = urlsplit(url)
+    url_parts = url_parts._replace(scheme="")
+
+    # Remove the netloc (by moving it into path)
+    if url_parts.netloc:
+        new_path = url_parts.netloc + url_parts.path
+        url_parts = url_parts._replace(netloc="", path=new_path)
+    if url_parts.path and url_parts.path[0] == "/":
+        url_parts = url_parts._replace(path=url_parts.path[1:])
+
+    path = urlunsplit(url_parts)
 
-    return url
+    return path
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
index 9e55b8ad..f12adefd 100644
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -37,3 +37,11 @@ def parse_title(content):
         return soup.title.text or ""
     except Exception:
         return ""
+
+
+def to_string(input: str | bytes) -> str:
+    try:
+        input = input.decode("utf8")
+    except AttributeError:
+        pass
+    return input
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index d46d9807..dfd37977 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -14,7 +14,7 @@
 from jinja2 import Environment, PackageLoader
 from zimscraperlib.zim import Archive
 
-from warc2zim.url_rewriting import canonicalize
+from warc2zim.url_rewriting import normalize
 from warc2zim.converter import iter_warc_records
 from warc2zim.utils import get_record_url
 from warc2zim.main import main
@@ -170,21 +170,40 @@ def verify_warc_and_zim(self, warcfile, zimfile):
 
             warc_urls.add(url)
 
-    def test_canonicalize(self):
-        assert canonicalize("http://example.com/?foo=bar") == "example.com/?foo=bar"
+    def test_normalize(self):
+        assert normalize(None) == None
+        assert normalize("") == ""
+        assert normalize("https://exemple.com") == "exemple.com"
+        assert normalize("https://exemple.com/") == "exemple.com/"
+        assert normalize("http://example.com/?foo=bar") == "example.com/?foo=bar"
+        assert normalize(b"http://example.com/?foo=bar") == "example.com/?foo=bar"
 
-        assert canonicalize("https://example.com/?foo=bar") == "example.com/?foo=bar"
+        assert normalize("https://example.com/?foo=bar") == "example.com/?foo=bar"
 
         assert (
-            canonicalize("https://example.com/some/path/http://example.com/?foo=bar")
+            normalize("https://example.com/some/path/http://example.com/?foo=bar")
             == "example.com/some/path/http://example.com/?foo=bar"
         )
 
         assert (
-            canonicalize("example.com/some/path/http://example.com/?foo=bar")
+            normalize("example.com/some/path/http://example.com/?foo=bar")
             == "example.com/some/path/http://example.com/?foo=bar"
         )
 
+        assert (
+            normalize("http://example.com/path/with/final/slash/")
+            == "example.com/path/with/final/slash/"
+        )
+
+        assert normalize("http://test@example.com/") == "test@example.com/"
+
+        assert (
+            normalize(
+                "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493"
+            )
+            == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493"
+        )
+
     def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         zim_output = "zim-out-filename.zim"
         main(
@@ -222,7 +241,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
 
         assert all_articles == {
             # entries from WARC
-            "A/example.com/": "Example Domain",
+            "example.com/": "Example Domain",
             "H/example.com/": "H/example.com/",
             # replay system files
             "A/index.html": "A/index.html",

From e8122deb7376c2fdde934137a2ffdb92df098bc8 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Tue, 14 Nov 2023 16:12:17 +0100
Subject: [PATCH 02/11] Directly store entries using their potentially reduced
 path.

Before, we were storing an entry using its full path and potentially
creating a redirect entry (using the reduced path) pointing to the full
path entry.

Now, path reduction is part of normalization and so we directly store
entries using their (potentially) reduced path.

Converted `test/data/video-vimeo.warc.gz` goes from :

```
A/404.html
A/index.html
A/load.js
A/sw.js
A/topFrame.html
H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
H/f.vimeocdn.com/p/3.45.3/css/player.css
H/f.vimeocdn.com/p/3.45.3/js/player.js
H/i.vimeocdn.com/player/354746.png?mw=200&mh=200
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
H/oembed.link/favicon.ico
H/oembed.link/https://vimeo.com/347119375
H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
H/vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
f.vimeocdn.com/p/3.45.3/css/player.css
f.vimeocdn.com/p/3.45.3/js/player.js
i.vimeocdn.com/player/354746.png?mw=200&mh=200
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
oembed.link/favicon.ico
oembed.link/https://vimeo.com/347119375
player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
vimeo.fuzzy.replayweb.page/video/347119375
vod-progressive.akamaized.net/exp=1635528595~acl=%2Fvimeo-prod-skyfire-std-us%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4
```

to :

```
A/404.html
A/index.html
A/load.js
A/sw.js
A/topFrame.html
H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
H/f.vimeocdn.com/p/3.45.3/css/player.css
H/f.vimeocdn.com/p/3.45.3/js/player.js
H/i.vimeocdn.com/player/354746.png?mw=200&mh=200
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
H/oembed.link/favicon.ico
H/oembed.link/https://vimeo.com/347119375
H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
f.vimeocdn.com/p/3.45.3/css/player.css
f.vimeocdn.com/p/3.45.3/js/player.js
i.vimeocdn.com/player/354746.png?mw=200&mh=200
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
oembed.link/favicon.ico
oembed.link/https://vimeo.com/347119375
player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
vimeo.fuzzy.replayweb.page/video/347119375
```

Notice that `vod-progressive.akamaized.net` is not present.
It is "replaced" by
`vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4`
which is now a plain entry instead of a redirect to
`vod-progressive.akamaized.net[...]`.
---
 src/warc2zim/converter.py     | 38 ++++-------------------------------
 src/warc2zim/main.py          | 18 -----------------
 src/warc2zim/url_rewriting.py | 17 +++++++++++++++-
 tests/test_warc_to_zim.py     | 27 ++++++++++++++++---------
 4 files changed, 37 insertions(+), 63 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index ff7e31ee..5a45a6d2 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -5,19 +5,12 @@
 """ warc2zim conversion utility
 
 This utility provides a conversion from WARC records to ZIM files.
-The WARCs are converted in a 'lossless' way, no data from WARC records is lost.
-Each WARC record results in two ZIM items:
-- The WARC payload is stored under /A/<url>
-- The WARC headers + HTTP headers are stored under the /H/<url>
-
-Given a WARC response record for 'https://example.com/',
-two ZIM items are created /A/example.com/ and /H/example.com/ are created.
-
-Only WARC response and resource records are stored.
+WARC record are directly stored in a zim file as:
+- Response WARC record as item "normalized" <url>
+- Revisit record as alias (using "normalized" <url> to)
 
 If the WARC contains multiple entries for the same URL, only the first entry is added,
 and later entries are ignored. A warning is printed as well.
-
 """
 
 import os
@@ -49,7 +42,7 @@
 
 from cdxj_indexer import iter_file_or_dir, buffering_record_iter
 
-from warc2zim.url_rewriting import FUZZY_RULES, normalize
+from warc2zim.url_rewriting import normalize
 from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle
 from warc2zim.utils import (
     get_version,
@@ -528,29 +521,6 @@ def add_items_for_warc_record(self, record):
         ):
             self.revisits[normalized_url] = record
 
-        self.add_fuzzy_match_record(normalized_url)
-
-    def add_fuzzy_match_record(self, url):
-        fuzzy_url = url
-        for rule in FUZZY_RULES:
-            fuzzy_url = rule["match"].sub(rule["replace"], url)
-            if fuzzy_url != url:
-                break
-
-        if fuzzy_url == url:
-            return
-
-        http_headers = StatusAndHeaders("302 Redirect", {"Location": url})
-
-        date = datetime.datetime.utcnow().isoformat()
-        builder = RecordBuilder()
-        record = builder.create_revisit_record(
-            fuzzy_url, "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", url, date, http_headers
-        )
-
-        self.revisits[fuzzy_url] = record
-        logger.debug("Adding fuzzy redirect {0} -> {1}".format(fuzzy_url, url))
-
 
 def iter_warc_records(inputs):
     """iter warc records, including appending request data to matching response"""
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index 45bb7e9e..1da7bff3 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -2,24 +2,6 @@
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
-""" warc2zim conversion utility
-
-This utility provides a conversion from WARC records to ZIM files.
-The WARCs are converted in a 'lossless' way, no data from WARC records is lost.
-Each WARC record results in two ZIM items:
-- The WARC payload is stored under /A/<url>
-- The WARC headers + HTTP headers are stored under the /H/<url>
-
-Given a WARC response record for 'https://example.com/',
-two ZIM items are created /A/example.com/ and /H/example.com/ are created.
-
-Only WARC response and resource records are stored.
-
-If the WARC contains multiple entries for the same URL, only the first entry is added,
-and later entries are ignored. A warning is printed as well.
-
-"""
-
 import sys
 import logging
 from argparse import ArgumentParser
diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index 2add2d3f..0a0ced18 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -30,6 +30,10 @@
 - The querystring part must be parsable by `urllib.parse.parse_qs` (even if we don't do it here)
 - The querystring must be generated as by `urllib.parse.urlencode(<query>, quote_via=quote)`
 
+On top of that, paths are "reduced" using fuzzy rules:
+A path "https://www.youtube.com/youtubei/v1/foo/baz/things?key=value&other_key=other_value&videoId=xxxx&yet_another_key=yet_another_value"
+is reduced to "youtube.fuzzy.replayweb.page/youtubei/v1/foo/baz/things?videoId=xxxx"
+by slightly simplifying the path and keeping only the usefull arguments in the querystring.
 """
 
 import logging
@@ -56,7 +60,7 @@
         ),
         "replace": r"youtube.fuzzy.replayweb.page/\1\2",
     },
-    {"match": re.compile(r"(\.[^?]+\?)[\d]+$"), "replace": r"\1"},
+    {"match": re.compile(r"([^?]+\?)[\d]+$"), "replace": r"\1"},
     {
         "match": re.compile(
             r"(?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).*"
@@ -80,6 +84,14 @@
 ]
 
 
+def reduce(path: str) -> str:
+    """Reduce a path"""
+    for rule in FUZZY_RULES:
+        if match := rule["match"].match(path):
+            return match.expand(rule["replace"])
+    return path
+
+
 def normalize(url: str | bytes) -> str:
     """Normalize a properly contructed url to a path to use as a entry's key.
 
@@ -87,6 +99,8 @@ def normalize(url: str | bytes) -> str:
     "exemple.com/path/to/article?foo=bar"
     >>> normalize("http://other.com/path to strange ar+t%3Ficle?foo=bar+baz")
     "other.com/path to strange ar+t%3Ficle?foo=bar%20baz"
+    >>> normalize("http://youtube.com/youtubei/bar?key=value&videoId=xxxx&otherKey=otherValue")
+    "youtube.fuzzy.replayweb.page/youtubei/bar?videoId=xxxx"
     """
 
     if not url:
@@ -105,5 +119,6 @@ def normalize(url: str | bytes) -> str:
         url_parts = url_parts._replace(path=url_parts.path[1:])
 
     path = urlunsplit(url_parts)
+    path = reduce(path)
 
     return path
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index dfd37977..a39ce1a5 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -5,6 +5,7 @@
 import os
 import time
 import json
+import re
 from io import BytesIO
 
 import pytest
@@ -55,22 +56,22 @@ def cmdline(request):
     {
         "filename": "video-yt.warc.gz",
         "entries": [
-            "H/youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI",
-            "H/youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w",
+            "youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI",
+            "youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w",
         ],
     },
     {
         "filename": "video-yt-2.warc.gz",
         "entries": [
-            "H/youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI",
-            "H/youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD",
+            "youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI",
+            "youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD",
         ],
     },
     {
         "filename": "video-vimeo.warc.gz",
         "entries": [
-            "H/vimeo.fuzzy.replayweb.page/video/347119375",
-            "H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4",
+            "vimeo.fuzzy.replayweb.page/video/347119375",
+            "vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4",
         ],
     },
 ]
@@ -138,6 +139,12 @@ def verify_warc_and_zim(self, warcfile, zimfile):
             # parse headers as record, ensure headers match
             url_no_scheme = url.split("//", 2)[1]
             print(url_no_scheme)
+
+            if "www.youtube.com/embed" in url_no_scheme:
+                # We know that those url are rewritten in zim. Don't check for them.
+                break
+
+            url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme)
             parsed_record = next(
                 ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme)))
             )
@@ -201,7 +208,7 @@ def test_normalize(self):
             normalize(
                 "http://lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493"
             )
-            == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?1588230493"
+            == "lesfondamentaux.reseau-canope.fr/fileadmin/template/css/main.css?"
         )
 
     def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
@@ -431,7 +438,7 @@ def test_all_warcs_root_dir(self, tmp_path):
         )
 
         # timestamp fuzzy match from example-with-timestamp.warc
-        assert self.get_article(zim_output, "H/example.com/path.txt?") != b""
+        assert self.get_article(zim_output, "example.com/path.txt?") != b""
 
     def test_fuzzy_urls(self, tmp_path, fuzzycheck):
         zim_output = fuzzycheck["filename"] + ".zim"
@@ -449,8 +456,8 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck):
         zim_output = tmp_path / zim_output
 
         for entry in fuzzycheck["entries"]:
-            res = self.get_article(zim_output, entry)
-            assert b"Location: " in res
+            # This should be item and get_article_raw is eq to getItem and it will fail if it is not a item
+            self.get_article_raw(zim_output, entry)
 
     def test_local_replay_viewer_url(self, tmp_path):
         zim_local_sw = "zim-local-sw.zim"

From 54f8cd621e22520cfab8661d900d2d4f00901d9d Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Tue, 14 Nov 2023 16:16:31 +0100
Subject: [PATCH 03/11] Don't store Entry's header.

We don't use them and we agreed not to store them (at least for now).
If we need them later, we will see how to re-add them.

The ZIM converted from `test/data/video-vimeo.warc.gz` goes from:

```
A/404.html
A/index.html
A/load.js
A/sw.js
A/topFrame.html
H/f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
H/f.vimeocdn.com/p/3.45.3/css/player.css
H/f.vimeocdn.com/p/3.45.3/js/player.js
H/i.vimeocdn.com/player/354746.png?mw=200&mh=200
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
H/i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
H/oembed.link/favicon.ico
H/oembed.link/https://vimeo.com/347119375
H/player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
H/vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
f.vimeocdn.com/p/3.45.3/css/player.css
f.vimeocdn.com/p/3.45.3/js/player.js
i.vimeocdn.com/player/354746.png?mw=200&mh=200
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
oembed.link/favicon.ico
oembed.link/https://vimeo.com/347119375
player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
vimeo.fuzzy.replayweb.page/video/347119375
```

to:

```
A/404.html
A/index.html
A/load.js
A/sw.js
A/topFrame.html
f.vimeocdn.com/js_opt/modules/utils/vuid.min.js
f.vimeocdn.com/p/3.45.3/css/player.css
f.vimeocdn.com/p/3.45.3/js/player.js
i.vimeocdn.com/player/354746.png?mw=200&mh=200
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d.jpg?mw=80&q=85
i.vimeocdn.com/video/797382244-0106ae13e902e09d0f02d8f404fa80581f38d1b8b7846b3f8e87ef391ffb8c99-d?mw=1280&mh=720&q=70
oembed.link/favicon.ico
oembed.link/https://vimeo.com/347119375
player.vimeo.com/video/347119375?h=1699409fe2&app_id=122963
vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4
vimeo.fuzzy.replayweb.page/video/347119375
```
---
 src/warc2zim/converter.py | 6 ------
 tests/test_warc_to_zim.py | 9 ---------
 2 files changed, 15 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 5a45a6d2..8e60899c 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -494,12 +494,6 @@ def add_items_for_warc_record(self, record):
                 logger.debug("Skipping self-redirect: " + url)
                 return
 
-            try:
-                self.creator.add_item(WARCHeadersItem("H/" + normalized_url, record))
-            except RuntimeError as exc:
-                if not DUPLICATE_EXC_STR.match(str(exc)):
-                    raise exc
-
             payload_item = WARCPayloadItem(
                 normalized_url, record, self.head_insert, self.css_insert
             )
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index a39ce1a5..19b286da 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -353,15 +353,6 @@ def test_skip_self_redirect(self, tmp_path):
 
         zim_output = tmp_path / zim_output
 
-        for article in self.list_articles(zim_output):
-            url = article.path
-            if url.startswith("H/"):
-                # ensure there is only one H/ record, and its a 200 (not 301)
-                assert url == "H/kiwix.org/"
-                assert b"HTTP/1.1 200 OK" in self.get_article(
-                    zim_output, "H/kiwix.org/"
-                )
-
     def test_include_domains_favicon_and_language(self, tmp_path):
         zim_output = "spt.zim"
         main(

From 1cacf88d5b4c09631a51fed3c62213f5aa875a92 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Tue, 14 Nov 2023 16:37:40 +0100
Subject: [PATCH 04/11] Store revisits as alias instead of WARCHeadersItem.

We don't need `WARCHeadersItem` anymore.
---
 src/warc2zim/converter.py | 27 ++++++++++++++-------------
 src/warc2zim/items.py     | 33 ---------------------------------
 tests/test_warc_to_zim.py | 16 ++++++----------
 3 files changed, 20 insertions(+), 56 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 8e60899c..18ec5576 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -43,7 +43,7 @@
 from cdxj_indexer import iter_file_or_dir, buffering_record_iter
 
 from warc2zim.url_rewriting import normalize
-from warc2zim.items import WARCHeadersItem, WARCPayloadItem, StaticArticle
+from warc2zim.items import WARCPayloadItem, StaticArticle
 from warc2zim.utils import (
     get_version,
     get_record_url,
@@ -75,6 +75,11 @@
     re.MULTILINE | re.DOTALL,
 )
 
+ALIAS_EXC_STR = re.compile(
+    r"^Impossible to alias(.+)" r"(.+) doesn't exist.",
+    re.MULTILINE | re.DOTALL,
+)
+
 
 class Converter:
     def __init__(self, args):
@@ -286,20 +291,14 @@ def run(self):
         for record in self.iter_all_warc_records():
             self.add_items_for_warc_record(record)
 
-        # process revisits, headers only
-        for normalized_url, record in self.revisits.items():
+        # process revisits
+        for normalized_url, target_url in self.revisits.items():
             if normalized_url not in self.indexed_urls:
-                logger.debug(
-                    "Adding revisit {0} -> {1}".format(
-                        normalized_url, record.rec_headers["WARC-Refers-To-Target-URI"]
-                    )
-                )
+                logger.debug(f"Adding alias {normalized_url} -> {target_url}")
                 try:
-                    self.creator.add_item(
-                        WARCHeadersItem("H/" + normalized_url, record)
-                    )
+                    self.creator.add_alias(normalized_url, "", target_url, {})
                 except RuntimeError as exc:
-                    if not DUPLICATE_EXC_STR.match(str(exc)):
+                    if not ALIAS_EXC_STR.match(str(exc)):
                         raise exc
                 self.indexed_urls.add(normalized_url)
 
@@ -513,7 +512,9 @@ def add_items_for_warc_record(self, record):
             record.rec_headers["WARC-Refers-To-Target-URI"] != url
             and normalized_url not in self.revisits
         ):
-            self.revisits[normalized_url] = record
+            self.revisits[normalized_url] = normalize(
+                record.rec_headers["WARC-Refers-To-Target-URI"]
+            )
 
 
 def iter_warc_records(inputs):
diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
index 76bf7dd1..d9ea26f8 100644
--- a/src/warc2zim/items.py
+++ b/src/warc2zim/items.py
@@ -14,7 +14,6 @@
 from libzim.writer import Hint
 from zimscraperlib.types import get_mime_for_name
 from zimscraperlib.zim.items import StaticItem
-from zimscraperlib.zim.providers import StringProvider
 
 from warc2zim.utils import get_record_url, get_record_mime_type, parse_title
 
@@ -28,38 +27,6 @@
 CSS_INS = re.compile(b"(</head>)", re.I)
 
 
-class WARCHeadersItem(StaticItem):
-    """WARCHeadersItem used to store the WARC + HTTP headers as text
-    Usually stored under H namespace
-    """
-
-    def __init__(self, path, record):
-        super().__init__()
-        self.record = record
-        self.path = path
-
-    def get_path(self):
-        return self.path
-
-    def get_title(self):
-        return ""
-
-    def get_mimetype(self):
-        return "application/warc-headers"
-
-    def get_hints(self):
-        return {Hint.FRONT_ARTICLE: False}
-
-    def get_contentprovider(self):
-        # add WARC headers
-        buff = self.record.rec_headers.to_bytes(encoding="utf-8")
-        # add HTTP headers, if present
-        if self.record.http_headers:
-            buff += self.record.http_headers.to_bytes(encoding="utf-8")
-
-        return StringProvider(content=buff, ref=self)
-
-
 class WARCPayloadItem(StaticItem):
     """WARCPayloadItem used to store the WARC payload
     Usually stored under A namespace
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 19b286da..246b96ad 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -145,12 +145,6 @@ def verify_warc_and_zim(self, warcfile, zimfile):
                 break
 
             url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme)
-            parsed_record = next(
-                ArchiveIterator(BytesIO(zim_fh.get_content("H/" + url_no_scheme)))
-            )
-
-            assert record.rec_headers == parsed_record.rec_headers
-            assert record.http_headers == parsed_record.http_headers
 
             # ensure payloads match
             try:
@@ -158,10 +152,13 @@ def verify_warc_and_zim(self, warcfile, zimfile):
             except KeyError:
                 payload = None
 
-            if record.rec_type == "revisit" or (
-                record.http_headers and record.http_headers.get("Content-Length") == "0"
-            ):
+            if record.http_headers and record.http_headers.get("Content-Length") == "0":
                 assert not payload
+            elif record.rec_type == "revisit":
+                # We must have a payload
+                # We should check with the content of the targeted record...
+                # But difficult to test as we don't have it
+                assert payload
             else:
                 payload_content = payload.content.tobytes()
 
@@ -249,7 +246,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
         assert all_articles == {
             # entries from WARC
             "example.com/": "Example Domain",
-            "H/example.com/": "H/example.com/",
             # replay system files
             "A/index.html": "A/index.html",
             "A/load.js": "A/load.js",

From 09f55eba86c3667841f456f2e287480ef6cefc4d Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Fri, 26 May 2023 15:39:38 +0300
Subject: [PATCH 05/11] Do not add service worker stuff in zim file.

Remove other unnecessary files.
---
 setup.py                             | 13 ----
 src/warc2zim/converter.py            | 41 +++---------
 src/warc2zim/main.py                 |  6 --
 src/warc2zim/templates/404.html      | 43 -------------
 src/warc2zim/templates/index.html    |  1 -
 src/warc2zim/templates/load.js       | 69 --------------------
 src/warc2zim/templates/sw_check.html | 23 -------
 src/warc2zim/templates/topFrame.html | 94 ----------------------------
 tests/test_warc_to_zim.py            | 66 ++-----------------
 9 files changed, 11 insertions(+), 345 deletions(-)
 delete mode 100644 src/warc2zim/templates/404.html
 delete mode 100644 src/warc2zim/templates/load.js
 delete mode 100644 src/warc2zim/templates/sw_check.html
 delete mode 100644 src/warc2zim/templates/topFrame.html

diff --git a/setup.py b/setup.py
index 6c2bd52c..320803a9 100644
--- a/setup.py
+++ b/setup.py
@@ -14,19 +14,6 @@ def read(*names, **kwargs):
         return fh.read()
 
 
-REPLAY_SOURCE_URL = "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/"
-
-
-def download_replay(name):
-    print("Downloading " + REPLAY_SOURCE_URL + name)
-    with urllib.request.urlopen(REPLAY_SOURCE_URL + name) as response:  # nosec
-        with open(root_dir.joinpath("src", "warc2zim", "templates", name), "wb") as fh:
-            fh.write(response.read())
-
-
-download_replay("sw.js")
-
-
 def get_package_data():
     pkgs = ["templates/*"]
 
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 18ec5576..62423658 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -57,11 +57,8 @@
 # HTML mime types
 HTML_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml")
 
-# external sw.js filename
-SW_JS = "sw.js"
-
 # head insert template
-HEAD_INSERT_FILE = "sw_check.html"
+HEAD_INSERT_FILE = None
 
 # Default ZIM metadata tags
 DEFAULT_TAGS = ["_ftindex:yes", "_category:other", "_sw:yes"]
@@ -128,7 +125,6 @@ def __init__(self, args):
         self.inputs = args.inputs
         self.include_domains = args.include_domains
 
-        self.replay_viewer_source = args.replay_viewer_source
         self.custom_css = args.custom_css
 
         self.indexed_urls = set({})
@@ -143,30 +139,6 @@ def __init__(self, args):
 
         self.written_records = self.total_records = 0
 
-    def add_replayer(self):
-        if self.replay_viewer_source and re.match(
-            r"^https?\:", self.replay_viewer_source
-        ):
-            self.creator.add_item(
-                URLItem(
-                    url=self.replay_viewer_source + SW_JS,
-                    path="A/" + SW_JS,
-                    mimetype="application/javascript",
-                )
-            )
-        elif self.replay_viewer_source:
-            self.creator.add_item_for(
-                fpath=self.replay_viewer_source + SW_JS,
-                path="A/" + SW_JS,
-                mimetype="application/javascript",
-            )
-        else:
-            self.creator.add_item(
-                StaticArticle(
-                    self.env, SW_JS, self.main_url, mimetype="application/javascript"
-                )
-            )
-
     def init_env(self):
         # autoescape=False to allow injecting html entities from translated text
         env = Environment(
@@ -250,8 +222,11 @@ def run(self):
         self.env = self.init_env()
 
         # init head insert
-        template = self.env.get_template(HEAD_INSERT_FILE)
-        self.head_insert = ("<head>" + template.render()).encode("utf-8")
+        if HEAD_INSERT_FILE:
+            template = self.env.get_template(HEAD_INSERT_FILE)
+            self.head_insert = ("<head>" + template.render()).encode("utf-8")
+        else:
+            self.head_insert = b""
         if self.custom_css:
             self.css_insert = (
                 f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
@@ -280,10 +255,8 @@ def run(self):
             Scraper=f"warc2zim {get_version()}",
         ).start()
 
-        self.add_replayer()
-
         for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
-            if filename == HEAD_INSERT_FILE or filename == SW_JS:
+            if filename == HEAD_INSERT_FILE:
                 continue
 
             self.creator.add_item(StaticArticle(self.env, filename, self.main_url))
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index 1da7bff3..885a7eba 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -26,12 +26,6 @@ def main(args=None):
                                 the WARC file.""",
     )
 
-    parser.add_argument(
-        "-r",
-        "--replay-viewer-source",
-        help="""URL from which to load the ReplayWeb.page replay viewer from""",
-    )
-
     parser.add_argument(
         "-u",
         "--url",
diff --git a/src/warc2zim/templates/404.html b/src/warc2zim/templates/404.html
deleted file mode 100644
index 8d625bed..00000000
--- a/src/warc2zim/templates/404.html
+++ /dev/null
@@ -1,43 +0,0 @@
-<html>
-<head>
-<meta charset="utf-8"/>
-<script>
-  var currUrl = "$URL";
-  var mainUrl = "{{ MAIN_URL }}";
-
-  // check if toolbar exists, and hide it
-  // note: kiwix insert is left here to be able to check 'window.block_path'
-  window.addEventListener("load", function() {
-    var toolbar = document.querySelector(".kiwix");
-    if (toolbar) {
-      toolbar.style.display = "none";
-    }
-  });
-
-  if (window.parent === window.top) {
-    // check if URL is from same domain as the main url
-    var mainOrigin;
-
-    try {
-      mainOrigin = new URL(mainUrl).origin;
-    } catch (e) {
-      mainOrigin = mainUrl;
-    }
-
-    if (!currUrl.startsWith(mainOrigin)) {
-      // check if kiwix-serve 'blocking' is enabled, show interstitional if it is
-      if (window.block_path) {
-        window.parent.location.href = encodeURI(block_path + "?source=" + currUrl);
-      } else {
-        // otherwise, just redirect
-        window.parent.location.href = currUrl;
-      }
-    }
-  }
-</script>
-</head>
-<body>
-<h3></h3>
-<script>document.querySelector("h3").outerHTML = `<h3>{{_("Sorry, the url ${currUrl} is not found on this server")}}</h3>`;</script>
-</body>
-</html>
diff --git a/src/warc2zim/templates/index.html b/src/warc2zim/templates/index.html
index 7e750cb7..bf16af2b 100644
--- a/src/warc2zim/templates/index.html
+++ b/src/warc2zim/templates/index.html
@@ -4,7 +4,6 @@
 <script>
 window.mainUrl = "{{ MAIN_URL }}";
 </script>
-<script src="./load.js"></script>
 </head>
 <body>
 <div id="loading" style="width: 100%; text-align: center">
diff --git a/src/warc2zim/templates/load.js b/src/warc2zim/templates/load.js
deleted file mode 100644
index db7c4fa3..00000000
--- a/src/warc2zim/templates/load.js
+++ /dev/null
@@ -1,69 +0,0 @@
-async function main() {
-  const sw = navigator.serviceWorker;
-
-  if (!sw) {
-
-    let msg;
-    // check if service worker doesn't work due to http loading
-    if (window.location.protocol === "http:" && window.location.hostname !== "localhost") {
-      const httpsUrl = window.location.href.replace("http:", "https:");
-      document.querySelector("#error").innerHTML = "<p>{{ _("This page must be loaded via an HTTPS URL to support service workers.") }}</p>" +
-          `<a href="${httpsUrl}">{{ _("Try Loading HTTPS URL?") }}</a>`;
-    // otherwise, assume service worker not available at all
-    } else {
-      document.querySelector("#error").innerHTML =  `<h2>{{ _("Error") }}</h2>\n
-      <p>{{ _("The requested URL can not be loaded because service workers are not supported here.") }}</p>
-      <p>{{ _("If you use Firefox in Private Mode, try regular mode instead.") }}</p>
-      <p>{{ _("If you use Kiwix-Serve locally, replace the IP in your browser address bar with <code>localhost</code>.") }}</p>`;
-    }
-
-    document.querySelector("#loading").style.display = "none";
-    return;
-  }
-
-  // finds  '/A/' followed by a domain name with a .
-  var prefix = window.location.href.slice(0, window.location.href.search(/[/]A[/][^/]+[.]/));
-
-  const name = prefix.slice(prefix.lastIndexOf("/") + 1).replace(/[\W]+/, "");
-
-  prefix += "/A/";
-
-  await sw.register("./sw.js?replayPrefix=&root=" + name, {scope: prefix});
-
-  sw.addEventListener("message", (event) => {
-    if (event.data.msg_type === "collAdded" && event.data.name === name) {
-      if (window.location.hash && window.location.hash.startsWith("#redirect=")) {
-        prefix += decodeURIComponent(window.location.hash.slice("#redirect=".length));
-      } else {
-        const inx = window.mainUrl.indexOf("//");
-        prefix += inx >= 0 ? window.mainUrl.slice(inx + 2) : window.mainUrl;
-      }
-
-      console.log("final: " + prefix);
-      window.location.href = prefix;
-    }
-  });
-
-  await new Promise((resolve) => {
-    if (!sw.controller) {
-      sw.addEventListener("controllerchange", () => {
-        resolve();
-      });
-    } else {
-      resolve();
-    }
-  });
-
-  sw.controller.postMessage({
-    msg_type: "addColl",
-    name: name,
-    file: {"sourceUrl": "proxy:../"},
-    root: true,
-    skipExisting: false,
-    extraConfig: {"sourceType": "kiwix", notFoundPageUrl: "./404.html"},
-    topTemplateUrl: "./topFrame.html"
-  });
-}
-
-window.addEventListener("load", main);
-
diff --git a/src/warc2zim/templates/sw_check.html b/src/warc2zim/templates/sw_check.html
deleted file mode 100644
index d759f4ca..00000000
--- a/src/warc2zim/templates/sw_check.html
+++ /dev/null
@@ -1,23 +0,0 @@
-<script>
-// No SW Fallback check: hit if loaded via direct link (no SW installed or not supported)
-if (!window._WBWombat) {  // WBWombat is injected by service worker
-  if (!navigator.serviceWorker || !navigator.serviceWorker.controller) {
-    // finds  '/A/' followed by a domain name with a .
-    var inx =  window.location.href.search(/[/]A[/][^/]+[.]/);
-    var prefix = window.location.href.slice(0, inx);
-
-    prefix += "/A/index.html#redirect=" + encodeURIComponent(window.location.href.slice(inx + 3));
-
-    setTimeout(() => {
-      window.location.href = prefix;
-    }, 100);
-
-  // SW installed but not available (probably hard-refresh): just refresh again
-  } else if (navigator.serviceWorker && navigator.serviceWorker.controller) {
-
-    setTimeout(() => {
-      window.location.reload();
-    }, 100);
-  }
-}
-</script>
diff --git a/src/warc2zim/templates/topFrame.html b/src/warc2zim/templates/topFrame.html
deleted file mode 100644
index 59428c5e..00000000
--- a/src/warc2zim/templates/topFrame.html
+++ /dev/null
@@ -1,94 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-<meta charset="utf-8"/>
-<style>
-html, body
-{
-  position: fixed;
-  top: 0;
-  left: 0;
-  bottom: 0;
-  right: 0;
-  margin: 0;
-  padding: 0;
-  border: 0;
-  overflow: hidden;
-}
-
-iframe {
-  width: 100%;
-  height: 100%;
-  overflow: scroll;
-}
-
-</style>
-</head>
-<body style="margin: 0; padding: 0;">
-<iframe id="replay_iframe" frameborder="0" seamless="seamless" scrolling="yes" class="wb_iframe" allow="autoplay; fullscreen"></iframe>
-<script>
-  const prefix = "$PREFIX";
-  const startUrl = "$URL";
-
-  const iframe = document.querySelector("iframe");
-
-  // update URL when iframe changes
-  window.addEventListener("message", function() {
-    if (event.data.wb_type === "load" || event.data.wb_type === "replace-url") {
-
-      if (event.data.title) {
-        document.title = event.data.title;
-      }
-
-      // remove scheme to be consistent with current canonicalization
-      const urlNoScheme = event.data.url.slice(event.data.url.indexOf("//") + 2);
-      window.history.replaceState(null, "", prefix + urlNoScheme);
-
-      // if icons received, replace any existing icons with new ones
-      if (event.data.icons) {
-        const head = document.querySelector('head');
-        const oldLinks = document.querySelectorAll("link[rel*='icon']");
-
-        for (const link of oldLinks) {
-          head.removeChild(link);
-        }
-
-        // attempt to load the default "<origin>/favicon.ico" if no other favicon is specified
-        if (!event.data.icons.length) {
-          event.data.icons = [{
-            "href": prefix + "mp_/" + new URL("/favicon.ico", event.data.url),
-            "rel": "icon"
-          }];
-        }
-
-        for (const icon of event.data.icons) {
-          const link = document.createElement('link');
-          link.rel = icon.rel;
-
-          const parts = icon.href.split("/mp_/", 2);
-
-          // probably an invalid URL
-          if (parts.length < 2) {
-            continue;
-          }
-
-          const url = parts[1];
-
-          const urlNoScheme = url.slice(url.indexOf("//") + 2);
-
-          // need to escape utf-8, then % encode the entire string
-          let encodedUrl = encodeURI(urlNoScheme);
-          encodedUrl = encodeURIComponent(urlNoScheme)
-
-          link.href = parts[0] + "/" + encodedUrl;
-
-          head.appendChild(link);
-        }
-      }
-    }
-  });
-
-  iframe.src = prefix + "mp_/" + startUrl;
-</script>
-</body>
-</html>
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 246b96ad..92c93a70 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -112,7 +112,8 @@ def verify_warc_and_zim(self, warcfile, zimfile):
             autoescape=False,
         )
 
-        head_insert = env.get_template("sw_check.html").render().encode("utf-8")
+        # [TOFIX]
+        head_insert = b""
 
         # track to avoid checking duplicates, which are not written to ZIM
         warc_urls = set()
@@ -220,8 +221,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
                 str(tmp_path),
                 "--zim-file",
                 zim_output,
-                "-r",
-                "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/",
                 "--tags",
                 "some",
                 "--tags",
@@ -248,10 +247,6 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
             "example.com/": "Example Domain",
             # replay system files
             "A/index.html": "A/index.html",
-            "A/load.js": "A/load.js",
-            "A/404.html": "A/404.html",
-            "A/sw.js": "A/sw.js",
-            "A/topFrame.html": "A/topFrame.html",
         }
 
         zim_fh = Archive(zim_output)
@@ -408,18 +403,13 @@ def test_all_warcs_root_dir(self, tmp_path):
 
         # check articles from different warc records in tests/data dir
 
-        # ensure trailing slash added
-        assert b'window.mainUrl = "http://example.com/"' in self.get_article(
-            zim_output, "A/index.html"
-        )
-
         # from example.warc.gz
-        assert self.get_article(zim_output, "A/example.com/") != b""
+        assert self.get_article(zim_output, "example.com/") != b""
 
         # from single-page-test.warc
         assert (
             self.get_article(
-                zim_output, "A/lesfondamentaux.reseau-canope.fr/accueil.html"
+                zim_output, "lesfondamentaux.reseau-canope.fr/accueil.html"
             )
             != b""
         )
@@ -446,54 +436,6 @@ def test_fuzzy_urls(self, tmp_path, fuzzycheck):
             # This should be item and get_article_raw is eq to getItem and it will fail if it is not a item
             self.get_article_raw(zim_output, entry)
 
-    def test_local_replay_viewer_url(self, tmp_path):
-        zim_local_sw = "zim-local-sw.zim"
-
-        res = requests.get(
-            "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.16.11/dist/sw.js"
-        )
-
-        with open(tmp_path / "sw.js", "wt") as fh:
-            fh.write(res.text)
-
-        main(
-            [
-                "-v",
-                os.path.join(TEST_DATA_DIR, "example-response.warc"),
-                "-r",
-                str(tmp_path) + "/",
-                "--output",
-                str(tmp_path),
-                "--name",
-                "local-sw",
-                "--zim-file",
-                zim_local_sw,
-            ]
-        )
-
-        assert os.path.isfile(tmp_path / zim_local_sw)
-
-    def test_error_bad_replay_viewer_url(self, tmp_path):
-        zim_output_not_created = "zim-out-not-created.zim"
-        with pytest.raises(Exception) as e:
-            main(
-                [
-                    "-v",
-                    os.path.join(TEST_DATA_DIR, "example-response.warc"),
-                    "-r",
-                    "x-invalid-x",
-                    "--output",
-                    str(tmp_path),
-                    "--name",
-                    "bad",
-                    "--zim-file",
-                    zim_output_not_created,
-                ]
-            )
-
-        # zim file should not have been created since replay viewer could not be loaded
-        assert not os.path.isfile(tmp_path / zim_output_not_created)
-
     def test_error_bad_main_page(self, tmp_path):
         zim_output_not_created = "zim-out-not-created.zim"
         with pytest.raises(Exception) as e:

From 1113c1a693a3396a38b56996465e3f4e19684d38 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Wed, 15 Nov 2023 17:12:26 +0100
Subject: [PATCH 06/11] Properly set the main page.

---
 src/warc2zim/converter.py         | 15 ++++++++-------
 src/warc2zim/templates/index.html | 28 ----------------------------
 tests/test_warc_to_zim.py         |  4 +---
 3 files changed, 9 insertions(+), 38 deletions(-)
 delete mode 100644 src/warc2zim/templates/index.html

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 62423658..0432f307 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -86,14 +86,14 @@ def __init__(self, args):
         else:
             logger.setLevel(logging.INFO)
 
-        self.main_url = args.url
+        main_url = args.url
         # ensure trailing slash is added if missing
-        parts = urlsplit(self.main_url)
+        parts = urlsplit(main_url)
         if parts.path == "":
             parts = list(parts)
             # set path
             parts[2] = "/"
-            self.main_url = urlunsplit(parts)
+            main_url = urlunsplit(parts)
 
         self.name = args.name
         self.title = args.title
@@ -104,9 +104,10 @@ def __init__(self, args):
         self.creator_metadata = args.creator
         self.publisher = args.publisher
         self.tags = DEFAULT_TAGS + (args.tags or [])
-        self.source = args.source or self.main_url
+        self.source = args.source or main_url
         self.scraper = "warc2zim " + get_version()
         self.illustration = b""
+        self.main_url = normalize(main_url)
 
         self.output = args.output
         self.zim_file = args.zim_file
@@ -237,7 +238,7 @@ def run(self):
 
         self.creator = Creator(
             self.full_filename,
-            main_path="A/index.html",
+            main_path=self.main_url,
         )
 
         self.creator.config_metadata(
@@ -306,9 +307,9 @@ def find_main_page_metadata(self):
                     or record.http_headers.get_statuscode() == "200"
                 )
             ):
-                self.main_url = url
+                self.main_url = normalize(url)
 
-            if urldefrag(self.main_url).url != url:
+            if urldefrag(self.main_url).url != normalize(url):
                 continue
 
             # if we get here, found record for the main page
diff --git a/src/warc2zim/templates/index.html b/src/warc2zim/templates/index.html
deleted file mode 100644
index bf16af2b..00000000
--- a/src/warc2zim/templates/index.html
+++ /dev/null
@@ -1,28 +0,0 @@
-<html>
-<head>
-<meta charset="utf-8"/>
-<script>
-window.mainUrl = "{{ MAIN_URL }}";
-</script>
-</head>
-<body>
-<div id="loading" style="width: 100%; text-align: center">
-    <svg width="38" height="38" viewBox="0 0 38 38" xmlns="http://www.w3.org/2000/svg" stroke="#fff">
-        <g fill="none" fill-rule="evenodd">
-            <g transform="translate(1 1)" stroke-width="2">
-                <path d="M36 18c0-9.94-8.06-18-18-18" stroke="gray">
-                    <animateTransform
-                        attributeName="transform"
-                        type="rotate"
-                        from="0 18 18"
-                        to="360 18 18"
-                        dur="1s"
-                        repeatCount="indefinite"/>
-                </path>
-            </g>
-        </g>
-    </svg>
-</div>
-<div id="error"></div>
-</body>
-</html>
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 92c93a70..39097e14 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -244,9 +244,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
 
         assert all_articles == {
             # entries from WARC
-            "example.com/": "Example Domain",
-            # replay system files
-            "A/index.html": "A/index.html",
+            "example.com/": "Example Domain"
         }
 
         zim_fh = Archive(zim_output)

From 7355b800939fef4bf53dbd7a228d87695263b1b8 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Fri, 17 Nov 2023 10:59:48 +0100
Subject: [PATCH 07/11] Store static content in a `_zim_static/` subdir instead
 of `A/`.

Nothing is stored in the `A/` or `H/` subdirs anymore.
Remove the leftover `A/` prefix in test URLs (they only worked thanks to
libzim's compatibility layer).
---
 src/warc2zim/items.py     |  2 +-
 tests/test_warc_to_zim.py | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
index d9ea26f8..dededd02 100644
--- a/src/warc2zim/items.py
+++ b/src/warc2zim/items.py
@@ -81,7 +81,7 @@ def __init__(self, env, filename, main_url, **kwargs):
             ).decode("utf-8")
 
     def get_path(self):
-        return "A/" + self.filename
+        return "_zim_static/" + self.filename
 
     def get_mimetype(self):
         return self.mime
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 39097e14..0468acd2 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -149,7 +149,7 @@ def verify_warc_and_zim(self, warcfile, zimfile):
 
             # ensure payloads match
             try:
-                payload = zim_fh.get_item("A/" + url_no_scheme)
+                payload = zim_fh.get_item(url_no_scheme)
             except KeyError:
                 payload = None
 
@@ -323,8 +323,8 @@ def test_same_domain_only(self, tmp_path):
         for article in self.list_articles(zim_output):
             url = article.path
             # ignore the replay files, which have only one path segment
-            if url.startswith("A/") and len(url.split("/")) > 2:
-                assert url.startswith("A/example.com/")
+            if not url.startswith("_zim_static/"):
+                assert url.startswith("example.com/")
 
     def test_skip_self_redirect(self, tmp_path):
         zim_output = "self-redir.zim"
@@ -363,7 +363,7 @@ def test_include_domains_favicon_and_language(self, tmp_path):
         for article in self.list_articles(zim_output):
             url = article.path
             # ignore the replay files, which have only one path segment
-            if url.startswith("A/") and len(url.split("/")) > 2:
+            if not url.startswith("_zim_static/"):
                 assert "reseau-canope.fr/" in url
 
         # test detected language
@@ -372,7 +372,7 @@ def test_include_domains_favicon_and_language(self, tmp_path):
         # test detected favicon
         assert self.get_article(
             zim_output,
-            "A/lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico",
+            "lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico",
         )
         assert self.get_metadata(zim_output, "Illustration_48x48@1")
 
@@ -488,10 +488,10 @@ def test_custom_css(self, tmp_path):
         )
         zim_output = tmp_path / zim_output
 
-        res = self.get_article(zim_output, "A/example.com/")
+        res = self.get_article(zim_output, "example.com/")
         assert "https://warc2zim.kiwix.app/custom.css".encode("utf-8") in res
 
-        res = self.get_article(zim_output, "A/warc2zim.kiwix.app/custom.css")
+        res = self.get_article(zim_output, "warc2zim.kiwix.app/custom.css")
         assert custom_css == res
 
     def test_custom_css_remote(self, tmp_path):
@@ -515,8 +515,8 @@ def test_custom_css_remote(self, tmp_path):
         )
         zim_output = tmp_path / zim_output
 
-        res = self.get_article(zim_output, "A/example.com/")
+        res = self.get_article(zim_output, "example.com/")
         assert "https://warc2zim.kiwix.app/custom.css".encode("utf-8") in res
 
-        res = self.get_article(zim_output, "A/warc2zim.kiwix.app/custom.css")
+        res = self.get_article(zim_output, "warc2zim.kiwix.app/custom.css")
         assert res == requests.get(url).content

From e86f2b75a3e2a7a291fcbb7b25c5c67e14ccd1b7 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Fri, 8 Dec 2023 15:44:03 +0100
Subject: [PATCH 08/11] Deactivate searching file to add in `templates`
 directory.

There are no files left to add, so the `templates` directory no longer exists.
---
 src/warc2zim/converter.py | 19 +++++++++++++------
 tests/test_warc_to_zim.py |  2 +-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 0432f307..9870401a 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -142,8 +142,12 @@ def __init__(self, args):
 
     def init_env(self):
         # autoescape=False to allow injecting html entities from translated text
+
+        # There are currently no files in the `templates` directory, so the
+        # directory itself does not exist and pkg_resources complains about it.
+        # Keep the loader disabled until a file is re-added to `templates`.
         env = Environment(
-            loader=PackageLoader("warc2zim", "templates"),
+            # loader=PackageLoader("warc2zim", "templates"),
             extensions=["jinja2.ext.i18n"],
             autoescape=False,
         )
@@ -256,11 +260,14 @@ def run(self):
             Scraper=f"warc2zim {get_version()}",
         ).start()
 
-        for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
-            if filename == HEAD_INSERT_FILE:
-                continue
-
-            self.creator.add_item(StaticArticle(self.env, filename, self.main_url))
+        # There are currently no files in the `templates` directory, so the
+        # directory itself does not exist and pkg_resources complains about it.
+        # Keep this loop disabled until a file is re-added to `templates`.
+        # for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
+        #    if filename == HEAD_INSERT_FILE:
+        #        continue
+        #
+        #    self.creator.add_item(StaticArticle(self.env, filename, self.main_url))
 
         for record in self.iter_all_warc_records():
             self.add_items_for_warc_record(record)
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 0468acd2..f2f18a18 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -107,7 +107,7 @@ def verify_warc_and_zim(self, warcfile, zimfile):
 
         # autoescape=False to allow injecting html entities from translated text
         env = Environment(
-            loader=PackageLoader("warc2zim", "templates"),
+            # loader=PackageLoader("warc2zim", "templates"),
             extensions=["jinja2.ext.i18n"],
             autoescape=False,
         )

From 49c49eca611b405f806d0d6ad2ba0ca0cc93fc50 Mon Sep 17 00:00:00 2001
From: Matthieu Gautier <mgautier@kymeria.fr>
Date: Fri, 8 Dec 2023 15:59:18 +0100
Subject: [PATCH 09/11] Python 3.7 is not supported.

Assignment expressions[*] are new in Python 3.8,
and Python 3.7 has already reached end of life.

Also add testing on 3.12 and pin `setuptools` in requirements.txt.

[*]https://docs.python.org/3/whatsnew/3.8.html#assignment-expressions
---
 .github/workflows/ci.yaml | 2 +-
 requirements.txt          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f8811f4e..7b73f24a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       max-parallel: 3
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - name: checkout
diff --git a/requirements.txt b/requirements.txt
index 97b2fc17..1ed3000b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ beautifulsoup4==4.9.3
 zimscraperlib==3.1.1
 Babel==2.12.1
 jinja2==3.1.2
+setuptools==68.2.2
 # to support possible brotli content in warcs
 brotlipy==0.7.0
 cdxj_indexer==1.4.5

From f706533c81a66402c6461594d17cb51a3767aaa7 Mon Sep 17 00:00:00 2001
From: Emmanuel Engelhart <kelson@kiwix.org>
Date: Sat, 16 Dec 2023 20:22:24 +0100
Subject: [PATCH 10/11] Requires zimscraperlib 3.2.0 (with alias feature)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1ed3000b..7888bfdf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 warcio==1.7.4
 requests==2.31.0
 beautifulsoup4==4.9.3
-zimscraperlib==3.1.1
+zimscraperlib==3.2.0
 Babel==2.12.1
 jinja2==3.1.2
 setuptools==68.2.2

From 0bfddc9216a1807c4c3266db3b0f80b633735275 Mon Sep 17 00:00:00 2001
From: renaud gaudin <reg@rskg.org>
Date: Mon, 18 Dec 2023 08:44:13 +0000
Subject: [PATCH 11/11] import future annotations as we support py3.8+

---
 src/warc2zim/url_rewriting.py | 2 ++
 src/warc2zim/utils.py         | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
index 0a0ced18..37b11ee3 100644
--- a/src/warc2zim/url_rewriting.py
+++ b/src/warc2zim/url_rewriting.py
@@ -36,6 +36,8 @@
 by slightly simplifying the path and keeping only the usefull arguments in the querystring.
 """
 
+from __future__ import annotations
+
 import logging
 import re
 from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
index f12adefd..eea1dedc 100644
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
+from __future__ import annotations
+
 import pkg_resources
 from bs4 import BeautifulSoup