Skip to content

Commit

Permalink
Use scraperlib content rewriting code (imported from warc2zim)
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 22, 2024
1 parent 797edd3 commit 86ba7eb
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 469 deletions.
3 changes: 2 additions & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ readme = "../README.md"
dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
"jinja2==3.1.4",
"zimscraperlib==4.0.0",
# temporary: switch back to a pinned zimscraperlib release once the content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"requests==2.32.3",
"types-requests==2.32.0.20240914",
"kiwixstorage==0.9.0",
Expand Down
143 changes: 0 additions & 143 deletions scraper/src/libretexts2zim/css.py

This file was deleted.

53 changes: 38 additions & 15 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
stream_file, # pyright: ignore[reportUnknownVariableType]
)
from zimscraperlib.image import resize_image
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.url_rewriting import (
ArticleUrlRewriter,
HttpUrl,
ZimPath,
)
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData
Expand All @@ -22,7 +28,6 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -307,44 +312,62 @@ def run(self) -> Path:
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
items_to_download: dict[ZimPath, HttpUrl] = {}

Check warning on line 315 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L315

Added line #L315 was not covered by tests
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
url_rewriter = ArticleUrlRewriter(

Check warning on line 318 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L318

Added line #L318 was not covered by tests
article_url=HttpUrl(home.screen_css_url),
article_path=ZimPath("screen.css"),
)
css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
result = css_rewriter.rewrite(content=screen_css.getvalue())
items_to_download = {**items_to_download, **url_rewriter.items_to_download}

Check warning on line 324 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L322-L324

Added lines #L322 - L324 were not covered by tests
add_item_for(creator, "content/screen.css", content=result)
del screen_css
del css_rewriter
del url_rewriter

Check warning on line 328 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L327-L328

Added lines #L327 - L328 were not covered by tests

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(
css_original_url=home.print_css_url, css_content=print_css.getvalue()
url_rewriter = ArticleUrlRewriter(

Check warning on line 332 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L332

Added line #L332 was not covered by tests
article_url=HttpUrl(home.print_css_url),
article_path=ZimPath("print.css"),
)
css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
result = css_rewriter.rewrite(content=print_css.getvalue())
items_to_download = {**items_to_download, **url_rewriter.items_to_download}

Check warning on line 338 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L336-L338

Added lines #L336 - L338 were not covered by tests
add_item_for(creator, "content/print.css", content=result)
del print_css
del css_rewriter
del url_rewriter

Check warning on line 342 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L341-L342

Added lines #L341 - L342 were not covered by tests

result = css_processor.process(
css_original_url=home.home_url,
css_content=("\n".join(home.inline_css)).encode(),
url_rewriter = ArticleUrlRewriter(

Check warning on line 344 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L344

Added line #L344 was not covered by tests
article_url=HttpUrl(home.home_url), article_path=ZimPath("inline.css")
)
css_rewriter = CssRewriter(url_rewriter=url_rewriter, base_href=None)
result = css_rewriter.rewrite(content=("\n".join(home.inline_css)))
items_to_download = {**items_to_download, **url_rewriter.items_to_download}

Check warning on line 349 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L347-L349

Added lines #L347 - L349 were not covered by tests
add_item_for(creator, "content/inline.css", content=result)

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")
for asset_url, asset_path in css_processor.css_assets.items():
logger.info(f" Retrieving {len(items_to_download)} CSS assets...")

Check warning on line 352 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L352

Added line #L352 was not covered by tests
for asset_path, asset_url in items_to_download.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
stream_file(asset_url.value, byte_stream=css_asset)
logger.debug(

Check warning on line 357 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L356-L357

Added lines #L356 - L357 were not covered by tests
f"Adding {asset_url.value} to {asset_path.value} in the ZIM"
)
add_item_for(
creator, str(asset_path)[1:], content=css_asset.getvalue()
creator,
"content/" + asset_path.value,
content=css_asset.getvalue(),
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:
# a warning would make more sense here, but it would be far too
# verbose: at least on geo.libretexts.org, many assets are simply
# missing
logger.debug(f"Ignoring {asset_path} due to {exc}")
logger.debug(f"Ignoring {asset_path.value} due to {exc}")

Check warning on line 370 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L370

Added line #L370 was not covered by tests

logger.info("Fetching pages tree")
pages_tree = self.libretexts_client.get_page_tree()
Expand Down
Loading

0 comments on commit 86ba7eb

Please sign in to comment.