Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite (static) content from warc. #133

Merged
merged 9 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ setuptools==68.2.2
# to support possible brotli content in warcs
brotlipy==0.7.0
cdxj_indexer==1.4.5
tinycss2==1.2.1
176 changes: 176 additions & 0 deletions src/warc2zim/content_rewriting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
from html import escape
from html.parser import HTMLParser
from tinycss2 import (
parse_stylesheet,
parse_stylesheet_bytes,
parse_declaration_list,
serialize,
)
from tinycss2.serializer import serialize_url
from tinycss2.ast import Node as TCSS2Node
import io
from collections import namedtuple
from warc2zim.url_rewriting import ArticleUrlRewriter
from warc2zim.utils import to_string
from typing import Callable, Optional, Iterable, List, Tuple, Union

# List of (name, value) attribute pairs, as received by the `HTMLParser` tag
# handlers below. `value` may be None; `format_attr` then renders the bare name.
AttrsList = List[Tuple[str, Optional[str]]]


def process_attr(
    attr: Tuple[str, Optional[str]],
    url_rewriter: Callable[[str], str],
    css_rewriter: "CSSRewriter",
) -> Tuple[str, Optional[str]]:
    """Rewrite the value of a single HTML attribute.

    - `href` and `src` values are passed through `url_rewriter`.
    - `srcset` is split into its comma-separated candidates; the url of each
      candidate is rewritten and its descriptor (e.g. `2x`, `480w`) is kept.
    - `style` values are rewritten with `css_rewriter.rewrite_inline`.
    - any other attribute is returned untouched.

    Parameters:
        attr: (name, value) pair; value is None for a valueless attribute.
        url_rewriter: callable turning a url into its rewritten form.
        css_rewriter: object exposing `rewrite_inline(str) -> str`.

    Returns:
        The (name, value) pair with its value rewritten when applicable.
    """
    name, value = attr

    if value is None:
        # Valueless attribute (e.g. `<a href>`): there is nothing to rewrite and
        # the rewriters below only accept strings.
        return attr

    if name in ("href", "src"):
        return (name, url_rewriter(value))

    if name == "srcset":
        new_value_list = []
        for candidate in value.split(","):
            candidate = candidate.strip()
            if not candidate:
                # Stray or trailing comma: do not rewrite an empty url.
                continue
            # Split on any whitespace (srcset is often written on several lines)
            url, *descriptor = candidate.split(maxsplit=1)
            new_value_list.append(" ".join([url_rewriter(url), *descriptor]))
        return (name, ", ".join(new_value_list))

    if name == "style":
        return (name, css_rewriter.rewrite_inline(value))

    return attr


def format_attr(name: str, value: Optional[str]) -> str:
    """Serialize one attribute as it must appear inside an HTML start tag.

    A None value yields the bare attribute name; otherwise the value is
    HTML-escaped (quotes included) and wrapped in double quotes.
    """
    if value is not None:
        return '{}="{}"'.format(name, escape(value, quote=True))
    return name


def transform_attrs(
    attrs: AttrsList, url_rewriter: Callable[[str], str], css_rewriter: "CssRewriter"
) -> str:
    """Rewrite then serialize a list of attributes.

    Each attribute goes through `process_attr` and is rendered with
    `format_attr`; the rendered attributes are joined with a single space.
    """
    rendered = []
    for attr in attrs:
        name, value = process_attr(attr, url_rewriter, css_rewriter)
        rendered.append(format_attr(name, value))
    return " ".join(rendered)


# Result of `HtmlRewriter.rewrite`: the document title and the rewritten markup.
# The typename must be the name the class is bound to: it is what repr() shows
# and what pickle looks up in the module.
RewritenHtml = namedtuple("RewritenHtml", ["title", "content"])


class HtmlRewriter(HTMLParser):
    """Rewrite an HTML document while streaming it through `HTMLParser`.

    Every parsed construct is re-serialized into an in-memory buffer. Along the
    way:

    - attributes are rewritten by `transform_attrs` (urls and inline styles),
    - the content of `<style>` elements is rewritten by the CSS rewriter,
    - `pre_head_insert` is written right after the opening `<head>` tag and
      `post_head_insert` right before the closing `</head>` tag,
    - the first text found directly in a `<title>` element is kept as title.
    """

    # Elements whose content HTMLParser hands over verbatim (no character
    # reference decoding); their data must be written back as-is.
    _RAW_TEXT_TAGS = ("script", "style")

    def __init__(self, article_url: str, pre_head_insert: str, post_head_insert: str):
        """Prepare a rewriter for the document located at `article_url`.

        `pre_head_insert` / `post_head_insert` are the snippets to inject at
        the start / end of `<head>`; a falsy value disables the injection.
        """
        super().__init__()
        self.url_rewriter = ArticleUrlRewriter(article_url)
        self.css_rewriter = CSSRewriter(article_url)
        self.title = None
        self.output = None
        # This works only for tags without children.
        # But as we use it to get the title, we are ok
        self._active_tag = None
        self.pre_head_insert = pre_head_insert
        self.post_head_insert = post_head_insert

    def rewrite(self, content: Union[str, bytes]) -> RewritenHtml:
        """Parse `content` and return its title and rewritten markup.

        The title is an empty string when the document has none.
        """
        # A non-None buffer means a rewrite is already in progress.
        assert self.output is None
        self.output = io.StringIO()
        try:
            self.feed(to_string(content))
            self.close()
            return RewritenHtml(self.title or "", self.output.getvalue())
        finally:
            # Always release the buffer, even if parsing raised.
            self.output = None

    def send(self, value: str):
        """Append `value` to the output buffer."""
        self.output.write(value)

    def handle_starttag(self, tag: str, attrs: AttrsList, auto_close: bool = False):
        """Write a start tag with rewritten attributes.

        With `auto_close` the tag is written self-closed (` />`); otherwise
        `pre_head_insert` is injected right after an opening `<head>`.
        """
        self._active_tag = tag

        self.send(f"<{tag}")
        if attrs:
            self.send(" ")
        self.send(transform_attrs(attrs, self.url_rewriter, self.css_rewriter))

        if auto_close:
            self.send(" />")
        else:
            self.send(">")
        if tag == "head" and self.pre_head_insert:
            self.send(self.pre_head_insert)

    def handle_endtag(self, tag: str):
        """Write an end tag, injecting `post_head_insert` before `</head>`."""
        self._active_tag = None
        if tag == "head" and self.post_head_insert:
            self.send(self.post_head_insert)
        self.send(f"</{tag}>")

    def handle_startendtag(self, tag: str, attrs: AttrsList):
        """Write a self-closed tag (`<tag ... />`)."""
        self.handle_starttag(tag, attrs, auto_close=True)
        self._active_tag = None

    def handle_data(self, data: str):
        """Write text content, capturing the title and rewriting `<style>` CSS."""
        if self._active_tag == "title" and self.title is None:
            self.title = data.strip()
        elif self._active_tag == "style":
            data = self.css_rewriter.rewrite(data)
        if self._active_tag not in self._RAW_TEXT_TAGS:
            # HTMLParser (convert_charrefs=True by default) has decoded
            # character references in this text: `&lt;b&gt;` arrives as `<b>`.
            # Escape it back, otherwise it would be written out as real markup.
            data = escape(data, quote=False)
        self.send(data)

    def handle_comment(self, data: str):
        """Write a comment back unchanged."""
        self.send(f"<!--{data}-->")

    def handle_decl(self, decl: str):
        """Write a declaration (e.g. the doctype) back unchanged."""
        self.send(f"<!{decl}>")

    def handle_pi(self, data: str):
        """Write a processing instruction back unchanged."""
        self.send(f"<?{data}>")

    def unknown_decl(self, data: str):
        """Write a marked section (`<![...`) back with its delimiters.

        The parser passes `data` without the leading `<![` and without the
        closing delimiter: `]>` for the MS Office conditionals (`if`, `else`,
        `endif`), `]]>` for the others (e.g. `CDATA[...`).
        """
        is_ms_conditional = (
            data.lstrip().lower().startswith(("if", "else", "endif"))
        )
        closing = "]>" if is_ms_conditional else "]]>"
        self.send(f"<![{data}{closing}")

Check warning on line 129 in src/warc2zim/content_rewriting.py

View check run for this annotation

Codecov / codecov/patch

src/warc2zim/content_rewriting.py#L129

Added line #L129 was not covered by tests


class CSSRewriter:
    """Rewrite the urls found in CSS content.

    The CSS is parsed with tinycss2, every url token / `url()` function found
    in the resulting tree is rewritten in place with an `ArticleUrlRewriter`
    built from `css_url`, and the tree is serialized back to text.
    """

    def __init__(self, css_url: str):
        self.url_rewriter = ArticleUrlRewriter(css_url)

    def rewrite(self, content: Union[str, bytes]) -> str:
        """Rewrite a whole stylesheet and return it as a string."""
        if isinstance(content, bytes):
            # parse_stylesheet_bytes returns (rules, encoding); keep the rules
            rules = parse_stylesheet_bytes(content)[0]
        else:
            rules = parse_stylesheet(content)
        self.process_list(rules)
        return serialize(rules)

    def rewrite_inline(self, content: str) -> str:
        """Rewrite a declaration list (content of a `style` attribute)."""
        rules = parse_declaration_list(content)
        self.process_list(rules)
        return serialize(rules)

    def process_list(self, components: Iterable[TCSS2Node]):
        """Process every node of `components` (which may be None or empty)."""
        if components:  # May be null
            for component in components:
                self.process(component)

    def process(self, component: TCSS2Node):
        """Rewrite the urls held by `component`, recursing into its children."""
        if component.type in ("qualified-rule", "() block", "[] block", "{} block"):
            self.process_list(component.content)
        elif component.type == "function":
            if component.lower_name == "url":
                # `url("...")`: the url is the string argument. It is not
                # necessarily the first one (`url( "x" )` starts with a
                # whitespace token) and may be missing altogether.
                url_component = next(
                    (arg for arg in component.arguments if arg.type == "string"),
                    None,
                )
                if url_component is not None:
                    new_url = self.url_rewriter(url_component.value)
                    url_component.value = new_url
                    url_component.representation = f'"{serialize_url(new_url)}"'
            else:
                self.process_list(component.arguments)
        elif component.type == "at-rule":
            self.process_list(component.prelude)
            self.process_list(component.content)
        elif component.type == "declaration":
            self.process_list(component.value)
        elif component.type == "url":
            # Unquoted `url(...)` token
            new_url = self.url_rewriter(component.value)
            component.value = new_url
            component.representation = f"url({serialize_url(new_url)})"
5 changes: 2 additions & 3 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,8 @@ def run(self):
self.head_insert = b""
if self.custom_css:
self.css_insert = (
f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
'rel="Stylesheet" />\n</head>'
).encode("utf-8")
f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" rel="Stylesheet" />\n'
)
else:
self.css_insert = None

Expand Down
21 changes: 8 additions & 13 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from zimscraperlib.types import get_mime_for_name
from zimscraperlib.zim.items import StaticItem

from warc2zim.utils import get_record_url, get_record_mime_type, parse_title
from warc2zim.utils import get_record_url, get_record_mime_type
from warc2zim.content_rewriting import HtmlRewriter, CSSRewriter

# Shared logger
logger = logging.getLogger("warc2zim.items")
Expand All @@ -32,7 +33,7 @@ class WARCPayloadItem(StaticItem):
Usually stored under A namespace
"""

def __init__(self, path, record, head_insert=None, css_insert=None):
def __init__(self, path, record, head_insert, css_insert):
super().__init__()
self.record = record
self.path = path
Expand All @@ -46,17 +47,11 @@ def __init__(self, path, record, head_insert=None, css_insert=None):
self.content = self.record.content_stream().read()

if self.mimetype.startswith("text/html"):
self.title = parse_title(self.content)
if head_insert:
self.content = HEAD_INS.sub(head_insert, self.content)
if css_insert:
self.content = CSS_INS.sub(css_insert, self.content)

def get_path(self):
return self.path

def get_title(self):
return self.title
self.title, self.content = HtmlRewriter(
self.path, head_insert, css_insert
).rewrite(self.content)
elif self.mimetype.startswith("text/css"):
self.content = CSSRewriter(self.path).rewrite(self.content)

def get_hints(self):
is_front = self.mimetype.startswith("text/html")
Expand Down
46 changes: 45 additions & 1 deletion src/warc2zim/url_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,16 @@

import logging
import re
from urllib.parse import urlsplit, urlunsplit, quote, unquote, parse_qs, urlencode
import posixpath
from urllib.parse import (
urlsplit,
urljoin,
urlunsplit,
quote,
unquote,
parse_qs,
urlencode,
)
from warc2zim.utils import to_string

# Shared logger
Expand Down Expand Up @@ -124,3 +133,38 @@ def normalize(url: str | bytes) -> str:
path = reduce(path)

return path


class ArticleUrlRewriter:
    """Rewrite urls found in an article.

    Urls are resolved against the url of the article, normalized, and turned
    into a path relative to the directory of the (normalized) article path.
    """

    # Schemes which never designate an archived entry: such urls must be kept
    # as they are instead of being turned into a relative entry path.
    _PASSTHROUGH_SCHEMES = ("data:", "blob:", "mailto:", "tel:", "javascript:")

    def __init__(self, article_url: str):
        self.article_url = article_url
        self.base_path = f"/{urlsplit(normalize(article_url)).path}"
        if self.base_path[-1] != "/":
            # We want a directory
            self.base_path = posixpath.dirname(self.base_path)

    def __call__(self, url: str) -> str:
        """Rewrite a url contained in an article.

        The url is "fully" rewritten to point to a normalized entry path,
        relative to the article. Urls using a pass-through scheme are returned
        unchanged.
        """

        # Schemes are case-insensitive and may be preceded by whitespace.
        if url.lstrip().lower().startswith(self._PASSTHROUGH_SCHEMES):
            return url

        absolute_url = urljoin(self.article_url, url)

        normalized_url = urlsplit(f"/{normalize(absolute_url)}")

        # relpath drops a potential trailing '/': remember it to restore it
        slash_ending = normalized_url.path[-1] == "/"
        relative_path = posixpath.relpath(normalized_url.path, self.base_path)

        if slash_ending:
            relative_path += "/"
        normalized_url = normalized_url._replace(path=relative_path)
        normalized_url = urlunsplit(normalized_url)

        return quote(normalized_url, safe="/#")
2 changes: 1 addition & 1 deletion src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def parse_title(content):

def to_string(input: str | bytes) -> str:
try:
input = input.decode("utf8")
input = input.decode("utf-8-sig")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's your need for this? I believe it's advised against:

https://docs.python.org/3/library/codecs.html#encodings-and-unicode

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sadly, we have to handle the BOM anyway as it may be present in the content we get:

import requests

css = requests.get("https://donorbox.org/assets/application_embed-47da8f7456acb6aa58b61f2e5c664fccbf3cae5b0ad587f129dcd2d93caa65e8.css").content

print(css[:20])

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this particular use case it doesn't matter (it's just a zero width space) but for URLs it could be problematic as a typable url/path would become un-typable.
Doesn't harm anyway...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it breaks the parsing of the CSS:

import tinycss2

css = requests.get(...).content

# print `<QualifiedRule … { … }>` (skipping the first (at) rule)
print(tinycss2.parse_stylesheet(css.decode('utf-8'))[0])

# print `<AtRule @import … { … }>`
print(tinycss2.parse_stylesheet(css.decode('utf-8-sig'))[0])

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed the first rule doesn't appear in content (but is in prelude and thus serialized – but we wouldn't rewrite it!). Looks like we should open a ticket upstream.
Can you add a brief comment explaining tinycss2 doesn't handle it correctly? Including that sample URL would help I think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue opened upstream : Kozea/tinycss2#52

However, I'm not sure it is a bug in tinycss2. It is more that we are passing incorrectly decoded content to tinycss2.

Can you add a brief comment explaining tinycss2 doesn't handle it correctly? Including that sample URL would help I think.

I will add a link to the tinycss2 issue as a comment.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I'm not sure it is a bug in tinycss. It is more us passing a not correctly decoded content to tinycss.

We'll see what they think about it. It will probably come down to what the CSS spec says. I wonder whether browsers take care of it before sending the content to the parser.

except AttributeError:
pass
return input
68 changes: 68 additions & 0 deletions tests/test_css_rewriting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pytest
from warc2zim.content_rewriting import CSSRewriter
from textwrap import dedent


@pytest.fixture(
    params=[
        b"p { color: red; }",
        b"p {\n color: red;\n}",
        b"p { background: blue; }",
        b"p { background: rgb(15, 0, 52); }",
        b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }",
    ]
)
def no_rewrite_content(request):
    """Yield, one per test run, stylesheets that must come out of the rewriter unchanged."""
    yield request.param


def test_no_rewrite(no_rewrite_content):
    """Rewriting each `no_rewrite_content` stylesheet returns it unchanged, decoded."""
    assert (
        CSSRewriter("kiwix.org").rewrite(no_rewrite_content)
        == no_rewrite_content.decode()
    )


def test_rewrite():
    """Urls of a stylesheet are rewritten relative to the `kiwix.org/article` entry.

    The sample covers an `@import`, quoted and unquoted `url()`, a url nested in
    an at-rule, an empty `url()` and a url inside a comment (left untouched).
    """
    # NOTE(review): `rbg(` looks like a typo of `rgb(`; it is only test data and
    # is expected unchanged in the output.
    content = b"""
/* A comment with a link : http://foo.com */
@import url(//fonts.googleapis.com/icon?family=Material+Icons);

p, input {
color: rbg(1, 2, 3);
background: url('http://kiwix.org/super/img');
background-image:url('http://exemple.com/no_space_before_url');
}

@font-face {
src: url(https://fonts.gstatic.com/s/quicksand/v31/6xKtdSZaM9iE8KbpRA_hJFQNcOM.woff2) format('woff2');
}

@media only screen and (max-width: 40em) {
p, input {
background-image:url();
}
}"""

    # Same stylesheet with every url made relative to the article
    expected = """
/* A comment with a link : http://foo.com */
@import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%2BIcons);

p, input {
color: rbg(1, 2, 3);
background: url("super/img");
background-image:url("../exemple.com/no_space_before_url");
}

@font-face {
src: url(../fonts.gstatic.com/s/quicksand/v31/6xKtdSZaM9iE8KbpRA_hJFQNcOM.woff2) format("woff2");
}

@media only screen and (max-width: 40em) {
p, input {
background-image:url();
}
}"""
    expected = dedent(expected)

    assert CSSRewriter("kiwix.org/article").rewrite(content) == expected
Loading
Loading