openzim · mgautierfr · Dec 19, 2023 · Nov 15, 2023 · Dec 8, 2023 · Nov 15, 2023
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ setuptools==68.2.2
 # to support possible brotli content in warcs
 brotlipy==0.7.0
 cdxj_indexer==1.4.5
+tinycss2==1.2.1
diff --git a/src/warc2zim/content_rewriting.py b/src/warc2zim/content_rewriting.py
@@ -0,0 +1,176 @@
+from html import escape
+from html.parser import HTMLParser
+from tinycss2 import (
+    parse_stylesheet,
+    parse_stylesheet_bytes,
+    parse_declaration_list,
+    serialize,
+)
+from tinycss2.serializer import serialize_url
+from tinycss2.ast import Node as TCSS2Node
+import io
+from collections import namedtuple
+from warc2zim.url_rewriting import ArticleUrlRewriter
+from warc2zim.utils import to_string
+from typing import Callable, Optional, Iterable, List, Tuple, Union
+
+AttrsList = List[Tuple[str, Optional[str]]]
+
+
+def process_attr(
+    attr: Tuple[str, Optional[str]],
+    url_rewriter: Callable[[str], str],
+    css_rewriter: "CssRewriter",
+) -> Tuple[str, Optional[str]]:
+    if attr[0] in ("href", "src"):
+        return (attr[0], url_rewriter(attr[1]))
+    if attr[0] == "srcset":
+        value_list = attr[1].split(",")
+        new_value_list = []
+        for value in value_list:
+            url, *other = value.strip().split(" ", maxsplit=1)
+            new_url = url_rewriter(url)
+            new_value = " ".join([new_url, *other])
+            new_value_list.append(new_value)
+        return (attr[0], ", ".join(new_value_list))
+    if attr[0] == "style":
+        return (attr[0], css_rewriter.rewrite_inline(attr[1]))
+    return attr
+
+
+def format_attr(name: str, value: Optional[str]) -> str:
+    if value is None:
+        return name
+    html_escaped_value = escape(value, quote=True)
+    return f'{name}="{html_escaped_value}"'
+
+
+def transform_attrs(
+    attrs: AttrsList, url_rewriter: Callable[[str], str], css_rewriter: "CssRewriter"
+) -> str:
+    processed_attrs = (process_attr(attr, url_rewriter, css_rewriter) for attr in attrs)
+    return " ".join(format_attr(*attr) for attr in processed_attrs)
+
+
+RewritenHtml = namedtuple("RewritenHmtl", ["title", "content"])
+
+
+class HtmlRewriter(HTMLParser):
+    def __init__(self, article_url: str, pre_head_insert: str, post_head_insert: str):
+        super().__init__()
+        self.url_rewriter = ArticleUrlRewriter(article_url)
+        self.css_rewriter = CSSRewriter(article_url)
+        self.title = None
+        self.output = None
+        # This only works for tags without children,
+        # but as we only use it to get the title, that is fine.
+        self._active_tag = None
+        self.pre_head_insert = pre_head_insert
+        self.post_head_insert = post_head_insert
+
+    def rewrite(self, content: Union[str, bytes]) -> RewritenHtml:
+        assert self.output == None
+        self.output = io.StringIO()
+
+        content = to_string(content)
+
+        self.feed(content)
+        self.close()
+
+        output = self.output.getvalue()
+        self.output = None
+        return RewritenHtml(self.title or "", output)
+
+    def send(self, value: str):
+        self.output.write(value)
+
+    def handle_starttag(self, tag: str, attrs: AttrsList, auto_close: bool = False):
+        self._active_tag = tag
+
+        self.send(f"<{tag}")
+        if attrs:
+            self.send(" ")
+        self.send(transform_attrs(attrs, self.url_rewriter, self.css_rewriter))
+
+        if auto_close:
+            self.send(" />")
+        else:
+            self.send(">")
+        if tag == "head" and self.pre_head_insert:
+            self.send(self.pre_head_insert)
+
+    def handle_endtag(self, tag: str):
+        self._active_tag = None
+        if tag == "head" and self.post_head_insert:
+            self.send(self.post_head_insert)
+        self.send(f"</{tag}>")
+
+    def handle_startendtag(self, tag: str, attrs: AttrsList):
+        self.handle_starttag(tag, attrs, auto_close=True)
+        self._active_tag = None
+
+    def handle_data(self, data: str):
+        if self._active_tag == "title" and self.title is None:
+            self.title = data.strip()
+        elif self._active_tag == "style":
+            data = self.css_rewriter.rewrite(data)
+        self.send(data)
+
+    def handle_comment(self, data: str):
+        self.send(f"<!--{data}-->")
+
+    def handle_decl(self, decl: str):
+        self.send(f"<!{decl}>")
+
+    def handle_pi(self, data: str):
+        self.send(f"<?{data}>")
+
+    def unknown_decl(self, data: str):
+        self.handle_decl(data)
+
+
+class CSSRewriter:
+    def __init__(self, css_url: str):
+        self.url_rewriter = ArticleUrlRewriter(css_url)
+
+    def rewrite(self, content: Union[str, bytes]) -> str:
+        if isinstance(content, bytes):
+            rules = parse_stylesheet_bytes(content)[0]
+        else:
+            rules = parse_stylesheet(content)
+        self.process_list(rules)
+
+        output = serialize(rules)
+        return output
+
+    def rewrite_inline(self, content: str) -> str:
+        rules = parse_declaration_list(content)
+        self.process_list(rules)
+        output = serialize(rules)
+        return output
+
+    def process_list(self, components: Iterable[TCSS2Node]):
+        if components:  # May be null
+            for component in components:
+                self.process(component)
+
+    def process(self, component: TCSS2Node):
+        if component.type in ("qualified-rule", "() block", "[] block", "{} block"):
+            self.process_list(component.content)
+        elif component.type == "function":
+            if component.lower_name == "url":
+                url_component = component.arguments[0]
+                new_url = self.url_rewriter(url_component.value)
+                url_component.value = new_url
+                url_component.representation = f'"{serialize_url(new_url)}"'
+            else:
+                self.process_list(component.arguments)
+        elif component.type == "at-rule":
+            self.process_list(component.prelude)
+            self.process_list(component.content)
+        elif component.type == "declaration":
+            self.process_list(component.value)
+        elif component.type == "url":
+            new_url = self.url_rewriter(component.value)
+            component.value = new_url
+            component.representation = f"url({serialize_url(new_url)})"
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -234,9 +234,8 @@ def run(self):
             self.head_insert = b""
         if self.custom_css:
             self.css_insert = (
-                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
-                'rel="Stylesheet" />\n</head>'
-            ).encode("utf-8")
+                f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" rel="Stylesheet" />\n'
+            )
         else:
             self.css_insert = None
 

diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
@@ -15,7 +15,8 @@
 from zimscraperlib.types import get_mime_for_name
 from zimscraperlib.zim.items import StaticItem
 
-from warc2zim.utils import get_record_url, get_record_mime_type, parse_title
+from warc2zim.utils import get_record_url, get_record_mime_type
+from warc2zim.content_rewriting import HtmlRewriter, CSSRewriter
 
 # Shared logger
 logger = logging.getLogger("warc2zim.items")
@@ -32,7 +33,7 @@ class WARCPayloadItem(StaticItem):
     Usually stored under A namespace
     """
 
-    def __init__(self, path, record, head_insert=None, css_insert=None):
+    def __init__(self, path, record, head_insert, css_insert):
         super().__init__()
         self.record = record
         self.path = path
@@ -46,17 +47,11 @@ def __init__(self, path, record, head_insert=None, css_insert=None):
             self.content = self.record.content_stream().read()
 
         if self.mimetype.startswith("text/html"):
-            self.title = parse_title(self.content)
-            if head_insert:
-                self.content = HEAD_INS.sub(head_insert, self.content)
-            if css_insert:
-                self.content = CSS_INS.sub(css_insert, self.content)
-
-    def get_path(self):
-        return self.path
-
-    def get_title(self):
-        return self.title
+            self.title, self.content = HtmlRewriter(
+                self.path, head_insert, css_insert
+            ).rewrite(self.content)
+        elif self.mimetype.startswith("text/css"):
+            self.content = CSSRewriter(self.path).rewrite(self.content)
 
     def get_hints(self):
         is_front = self.mimetype.startswith("text/html")

diff --git a/src/warc2zim/url_rewriting.py b/src/warc2zim/url_rewriting.py
@@ -40,7 +40,16 @@
 
 import logging
 import re
-from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode
+import posixpath
+from urllib.parse import (
+    urlsplit,
+    urljoin,
+    urlunsplit,
+    quote,
+    unquote,
+    parse_qs,
+    urlencode,
+)
 from warc2zim.utils import to_string
 
 # Shared logger
@@ -124,3 +133,38 @@ def normalize(url: str | bytes) -> str:
     path = reduce(path)
 
     return path
+
+
+class ArticleUrlRewriter:
+    """Rewrite urls in article."""
+
+    def __init__(self, article_url: str):
+        self.article_url = article_url
+        self.base_path = f"/{urlsplit(normalize(article_url)).path}"
+        if self.base_path[-1] != "/":
+            # We want a directory
+            self.base_path = posixpath.dirname(self.base_path)
+
+    def __call__(self, url: str) -> str:
+        """Rewrite a url contained in an article.
+
+        The url is "fully" rewritten to point to a normalized entry path.
+        """
+
+        if url.startswith("data:") or url.startswith("blob:"):
+            return url
+
+        absolute_url = urljoin(self.article_url, url)
+
+        normalized_url = urlsplit(f"/{normalize(absolute_url)}")
+
+        # posixpath.relpath will lose our potential trailing '/'
+        slash_ending = normalized_url.path[-1] == "/"
+        relative_path = posixpath.relpath(normalized_url.path, self.base_path)
+
+        if slash_ending:
+            relative_path += "/"
+        normalized_url = normalized_url._replace(path=relative_path)
+        normalized_url = urlunsplit(normalized_url)
+
+        return quote(normalized_url, safe="/#")
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
@@ -43,7 +43,7 @@ def parse_title(content):
 
 def to_string(input: str | bytes) -> str:
     try:
-        input = input.decode("utf8")
+        input = input.decode("utf-8-sig")
     except AttributeError:
         pass
     return input
diff --git a/tests/test_css_rewriting.py b/tests/test_css_rewriting.py
@@ -0,0 +1,68 @@
+import pytest
+from warc2zim.content_rewriting import CSSRewriter
+from textwrap import dedent
+
+
+@pytest.fixture(
+    params=[
+        b"p { color: red; }",
+        b"p {\n color: red;\n}",
+        b"p { background: blue; }",
+        b"p { background: rgb(15, 0, 52); }",
+        b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }",
+    ]
+)
+def no_rewrite_content(request):
+    yield request.param
+
+
+def test_no_rewrite(no_rewrite_content):
+    assert (
+        CSSRewriter("kiwix.org").rewrite(no_rewrite_content)
+        == no_rewrite_content.decode()
+    )
+
+
+def test_rewrite():
+    content = b"""
+/* A comment with a link : http://foo.com */
+@import url(//fonts.googleapis.com/icon?family=Material+Icons);
+
+p, input {
+    color: rbg(1, 2, 3);
+    background: url('http://kiwix.org/super/img');
+    background-image:url('http://exemple.com/no_space_before_url');
+}
+
+@font-face {
+    src: url(https://fonts.gstatic.com/s/quicksand/v31/6xKtdSZaM9iE8KbpRA_hJFQNcOM.woff2) format('woff2');
+}
+
+@media only screen and (max-width: 40em) {
+    p, input {
+        background-image:url(data:image/png;base64,FooContent);
+    }
+}"""
+
+    expected = """
+    /* A comment with a link : http://foo.com */
+    @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%2BIcons);
+
+    p, input {
+        color: rbg(1, 2, 3);
+        background: url("super/img");
+        background-image:url("../exemple.com/no_space_before_url");
+    }
+
+    @font-face {
+        src: url(../fonts.gstatic.com/s/quicksand/v31/6xKtdSZaM9iE8KbpRA_hJFQNcOM.woff2) format("woff2");
+    }
+
+    @media only screen and (max-width: 40em) {
+        p, input {
+            background-image:url(data:image/png;base64,FooContent);
+        }
+    }"""
+    expected = dedent(expected)
+
+    assert CSSRewriter("kiwix.org/article").rewrite(content) == expected