Skip to content

Commit

Permalink
Rewrite CSS embedded in HTML.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Nov 15, 2023
1 parent baebb02 commit 0e52a49
Showing 1 changed file with 18 additions and 6 deletions.
24 changes: 18 additions & 6 deletions src/warc2zim/content_rewriting.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@

from html import escape
from html.parser import HTMLParser
from tinycss2 import parse_stylesheet, serialize
from tinycss2 import parse_stylesheet, parse_declaration_list, serialize
from tinycss2.serializer import serialize_url
import io
from .url_rewriting import ArticleUrlRewriter

def process_attr(attr, rewriter):
def process_attr(attr, url_rewriter, css_rewriter):
if attr[0] in ('href', 'src', 'data-lazy-src'):
return (attr[0], url_rewriter(attr[1]))
if attr[0] in ('srcset', 'data-lazy-srcset'):
Expand All @@ -18,6 +18,8 @@ def process_attr(attr, rewriter):
new_value = " ".join([new_url, *other])
new_value_list.append(new_value)
return (attr[0], ", ".join(new_value_list))
if attr[0] == "style":
return (attr[0], css_rewriter.rewrite_inline(attr[1]))
return attr

def format_attr(name, value):
Expand All @@ -26,8 +28,8 @@ def format_attr(name, value):
html_escaped_value = escape(value, quote=True)
return f"{name}=\"{html_escaped_value}\""

def transform_attrs(attrs, url_rewriter):
processed_attrs = (process_attr(attr, url_rewriter) for attr in attrs)
def transform_attrs(attrs, url_rewriter, css_rewriter):
processed_attrs = (process_attr(attr, url_rewriter, css_rewriter) for attr in attrs)
return " ".join(
format_attr(*attr) for attr in processed_attrs
)
Expand All @@ -36,6 +38,7 @@ class RewriterParser(HTMLParser):
def __init__(self, host, article_url, head_insert, css_insert, sink):
super().__init__()
self.url_rewriter = ArticleUrlRewriter(host, article_url)
self.css_rewriter = CSSRewriter(host, article_url)
self.sink = sink
self.title = ""
# This works only for tags without children.
Expand All @@ -51,7 +54,7 @@ def handle_starttag(self, tag, attrs, auto_close=False):
self._active_tag = tag

self.send(f"<{tag} ")
self.send(transform_attrs(attrs, self.url_rewriter))
self.send(transform_attrs(attrs, self.url_rewriter, self.css_rewriter))

if auto_close:
self.send(" />")
Expand All @@ -73,6 +76,8 @@ def handle_startendtag(self, tag, attrs):
def handle_data(self, data):
if self._active_tag == 'title':
self.title = data.strip()
elif self._active_tag == "style":
data = self.css_rewriter.rewrite(data)
self.send(data)

def handle_comment(self, data):
Expand Down Expand Up @@ -103,13 +108,20 @@ def __init__(self, host, css_url):
self.url_rewriter = ArticleUrlRewriter(host, css_url)

def rewrite(self, content):
content = content.decode()
if isinstance(content, bytes):
content = content.decode()
rules = parse_stylesheet(content)
self.process_list(rules)

output = serialize(rules)
return output

def rewrite_inline(self, content):
rules = parse_declaration_list(content)
self.process_list(rules)
output = serialize(rules)
return output

def process_list(self, components):
if components: # May be null
for component in components:
Expand Down

0 comments on commit 0e52a49

Please sign in to comment.