Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

warc2zim, without service worker. #113

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 100 additions & 57 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import io
import time
from argparse import ArgumentParser
from urllib.parse import urlsplit, urljoin, urlunsplit, urldefrag, urlparse
from urllib.parse import urlsplit, urljoin, urlunsplit, urldefrag, urlparse, quote

import pkg_resources
import requests
Expand All @@ -52,18 +52,19 @@

from cdxj_indexer import iter_file_or_dir, buffering_record_iter

from pywb.rewrite.default_rewriter import DefaultRewriter
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.warcserver.index.cdxobject import CDXObject

# Shared logger
logger = logging.getLogger("warc2zim")

# HTML mime types
HTML_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml")

# external sw.js filename
SW_JS = "sw.js"

# head insert template
HEAD_INSERT_FILE = "sw_check.html"
HEAD_INSERT_FILE = "head_insert.html"


HEAD_INS = re.compile(b"(<head>)", re.I)
Expand Down Expand Up @@ -122,13 +123,48 @@
)


class MyUrlRewriter(UrlRewriter):
    """URL rewriter that turns links into paths relative to the current page.

    URLs are first brought to a common ``/host/path`` form (see
    ``standardize_url``) and then expressed relative to the directory of the
    page URL held in ``self.wburl``.
    """

    def standardize_url(self, url):
        """Return ``url`` in ``/host/path`` form when it names a location.

        - ``//host/path`` (protocol-relative) loses one leading slash.
        - ``/path`` (host-relative) is prefixed with the host of ``self.wburl``.
        - ``https://host/path`` and ``http://host/path`` lose their scheme
          but keep a single leading slash.
        - Anything else is returned unchanged.
        """
        if url.startswith("//"):
            return url[1:]
        if url.startswith("/"):
            # Host-relative url: prefix it with the host of the current page.
            host = urlsplit(str(self.wburl))[1]
            return "/" + host + url
        if url.startswith("https://"):
            # Deliberately strip "https:/" only, keeping one leading slash.
            return url[len("https:/"):]
        if url.startswith("http://"):
            # Deliberately strip "http:/" only, keeping one leading slash.
            return url[len("http:/"):]
        return url

    def rewrite(self, url, mod=None, force_abs=False):
        """Rewrite ``url`` as a path relative to the current page.

        :param url: the link to rewrite.
        :param mod: rewrite modifier; with ``"mp_"`` the url is HTML-unescaped
            before processing and HTML-escaped again before being returned.
        :param force_abs: accepted for signature compatibility; not used here.
        :return: the relative path, or ``url`` unchanged when
            ``standardize_url`` does not yield a ``/``-prefixed path.
        """
        import html
        import posixpath

        # The url may be stored in an HTML document and so be HTML-encoded.
        html_encoded = mod == "mp_"
        if html_encoded:
            url = html.unescape(url)

        url = self.standardize_url(url)
        if url.startswith("/"):
            base_url = self.standardize_url(str(self.wburl))
            # URL paths always use "/" separators, so use posixpath rather
            # than the platform-dependent os.path.
            rel_url = posixpath.relpath(url, posixpath.dirname(base_url))
            # relpath() normalises away a trailing slash; put it back.
            if url.endswith("/"):
                rel_url += "/"
            url = rel_url
        # Otherwise the url is already relative (or is "data:", etc.).

        if html_encoded:
            url = html.escape(url)
        return url

# ============================================================================
class WARCHeadersItem(StaticItem):
"""WARCHeadersItem used to store the WARC + HTTP headers as text
Usually stored under H namespace
"""

def __init__(self, record):
def __init__(self, record, status):
super().__init__()
self.record = record
self.url = get_record_url(record)
Expand Down Expand Up @@ -161,28 +197,18 @@ class WARCPayloadItem(StaticItem):
Usually stored under A namespace
"""

def __init__(self, record, head_insert=None, css_insert=None):
def __init__(self, record, content_gen):
super().__init__()
self.record = record
self.url = get_record_url(record)
self.mimetype = get_record_mime_type(record)
self.title = ""

if hasattr(self.record, "buffered_stream"):
self.record.buffered_stream.seek(0)
self.content = self.record.buffered_stream.read()
else:
self.content = self.record.content_stream().read()

self.content = b"".join(content_gen)
if self.mimetype.startswith("text/html"):
self.title = parse_title(self.content)
if head_insert:
self.content = HEAD_INS.sub(head_insert, self.content)
if css_insert:
self.content = CSS_INS.sub(css_insert, self.content)

def get_path(self):
return "A/" + canonicalize(self.url)
return canonicalize(self.url)

def get_title(self):
return self.title
Expand All @@ -202,13 +228,14 @@ def __init__(self, env, filename, main_url, **kwargs):
self.mime = get_mime_for_name(filename)
self.mime = self.mime or "application/octet-stream"

if filename != SW_JS:
template = env.get_template(filename)
self.content = template.render(MAIN_URL=self.main_url)
else:
if kwargs.get("raw", False):
self.content = pkg_resources.resource_string(
"warc2zim", "templates/" + filename
"warc2zim", "statics/" + filename
).decode("utf-8")
else:
template = env.get_template(filename)
self.content = template.render(MAIN_URL=self.main_url)


def get_path(self):
return "A/" + self.filename
Expand Down Expand Up @@ -288,29 +315,6 @@ def __init__(self, args):
self.stats_filename = self.output / self.stats_filename
self.written_records = self.total_records = 0

def add_replayer(self):
    """Add the ``sw.js`` replayer script to the ZIM being created.

    The script comes from one of three places, depending on
    ``self.replay_viewer_source``:

    - unset/empty: the bundled ``SW_JS`` is added as a ``StaticArticle``;
    - matching ``^https?:``: added as a ``URLItem`` pointing at the source;
    - anything else: handed to ``creator.add_item_for`` as ``fpath``.
    """
    source = self.replay_viewer_source
    js_mimetype = "application/javascript"

    if not source:
        self.creator.add_item(
            StaticArticle(self.env, SW_JS, self.main_url, mimetype=js_mimetype)
        )
        return

    zim_path = "A/" + SW_JS
    if re.match(r"^https?\:", source):
        self.creator.add_item(
            URLItem(url=source + SW_JS, path=zim_path, mimetype=js_mimetype)
        )
        return

    self.creator.add_item_for(
        fpath=source + SW_JS, path=zim_path, mimetype=js_mimetype
    )

def init_env(self):
# autoescape=False to allow injecting html entities from translated text
Expand All @@ -320,6 +324,9 @@ def init_env(self):
autoescape=False,
)

env.filters['urlsplit'] = urlsplit
env.filters['tobool'] = lambda val: 'true' if val else 'false'

try:
env.install_gettext_translations(Locale.translation)
except OSError:
Expand Down Expand Up @@ -390,8 +397,8 @@ def run(self):
self.env = self.init_env()

# init head insert
template = self.env.get_template(HEAD_INSERT_FILE)
self.head_insert = ("<head>" + template.render()).encode("utf-8")
self.head_template = self.env.get_template(HEAD_INSERT_FILE)

if self.custom_css:
self.css_insert = (
f'\n<link type="text/css" href="{CUSTOM_CSS_URL}" '
Expand All @@ -409,27 +416,30 @@ def run(self):
**self.metadata,
).start()

self.add_replayer()

for filename in pkg_resources.resource_listdir("warc2zim", "templates"):
if filename == HEAD_INSERT_FILE or filename == SW_JS:
if filename == HEAD_INSERT_FILE:
continue

self.creator.add_item(StaticArticle(self.env, filename, self.main_url))

for filename in pkg_resources.resource_listdir("warc2zim", "statics"):
self.creator.add_item(StaticArticle(self.env, filename, self.main_url, raw=True))

for record in self.iter_all_warc_records():
self.add_items_for_warc_record(record)

# process revisits, headers only
for url, record in self.revisits.items():
if canonicalize(url) not in self.indexed_urls:
target_url = canonicalize(record.rec_headers["WARC-Refers-To-Target-URI"])
logger.debug(
"Adding revisit {0} -> {1}".format(
url, record.rec_headers["WARC-Refers-To-Target-URI"]
url, target_url
)
)
try:
self.creator.add_item(WARCHeadersItem(record))
self.creator.add_redirection("H/" + canonicalize(url), "", target_url, {})
#self.creator.add_item(WARCHeadersItem(record, None))
except RuntimeError as exc:
if not DUPLICATE_EXC_STR.match(str(exc)):
raise exc
Expand Down Expand Up @@ -647,13 +657,15 @@ def add_items_for_warc_record(self, record):
logger.debug("Skipping self-redirect: " + url)
return

(status, content_gen, rewrited) = self.rewrite(record, self.css_insert)
try:
self.creator.add_item(WARCHeadersItem(record))
self.creator.add_item(WARCHeadersItem(record, status))
except RuntimeError as exc:
if not DUPLICATE_EXC_STR.match(str(exc)):
raise exc

payload_item = WARCPayloadItem(record, self.head_insert, self.css_insert)

payload_item = WARCPayloadItem(record, content_gen)

if len(payload_item.content) != 0:
try:
Expand All @@ -677,7 +689,34 @@ def add_items_for_warc_record(self, record):

self.add_fuzzy_match_record(url)

def rewrite(self, record, css_insert):
    """Run the pywb ``DefaultRewriter`` over ``record`` and return its result.

    :param record: the WARC record to rewrite.
    :param css_insert: optional bytes appended after the rendered head
        insert; a falsy value is treated as empty.
    :return: whatever the ``DefaultRewriter`` instance returns when called.
    """
    # "mp_" is the mode for "everything" (detect from mimetype)
    content_rewriter = DefaultRewriter("mp_")

    page_url = get_record_url(record)
    # We want to rewrite our url by inserting "${RW_PREFIX}/" at the beginning.
    # NOTE(review): the "/content/test/" prefix is hard-coded here - confirm
    # whether it should be configurable.
    link_rewriter = MyUrlRewriter(page_url, "/content/test/")

    cdx_entry = CDXObject()
    cdx_entry["url"] = page_url
    cdx_entry["urlkey"] = canonicalize(page_url)

    rendered_head = self.head_template.render(
        cdx=cdx_entry, static_prefix="/content/test/A"
    )
    head_insert = rendered_head.encode("utf-8")

    def head_insert_func(rule, cdx):
        # Both arguments are ignored; the same insert is returned every time.
        combined = (head_insert or b"") + (css_insert or b"")
        return combined.decode("utf-8")

    return content_rewriter(
        record,
        link_rewriter,
        cookie_rewriter=None,
        head_insert_func=head_insert_func,
        cdx=cdx_entry,
    )

def add_fuzzy_match_record(self, url):
# fuzzy rules expect urls starting with a <scheme>//
for prefix in ("//", "http://", "https://"):
if url.startswith(prefix):
break
else:
url = "//"+url

fuzzy_url = url
for rule in FUZZY_RULES:
fuzzy_url = rule["match"].sub(rule["replace"], url)
Expand Down Expand Up @@ -738,7 +777,10 @@ def iter_warc_records(inputs):
with open(filename, "rb") as fh:
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
if record.rec_type in ("resource", "response", "revisit"):
__raw_stream = record.raw_stream
record.raw_stream = record.buffered_stream
yield record
record.raw_stream = __raw_stream


# ============================================================================
Expand Down Expand Up @@ -822,8 +864,9 @@ def warc2zim(args=None):
# ============================================================================
def canonicalize(url):
"""Return a 'canonical' version of the url under which it is stored in the ZIM
For now, just removing the scheme http:// or https:// scheme
Remove the scheme.
"""

if url.startswith("https://"):
return url[8:]

Expand Down
Loading