diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4205369..d4cefca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base. docker build -t local-libretexts2zim . ``` -Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments). +Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, restricted to page id 28207 and its children, but you could use any other one of interest for your UI developments). ``` -docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite +docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite ``` Extract interesting ZIM content and move it to `public` folder. 
diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index fdbd0de..97b2ec8 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "types-beautifulsoup4==4.12.0.20240907", "lxml==5.3.0", + "tinycss2==1.3.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index a08952d..ee4fd22 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception): class LibreTextsHome(BaseModel): welcome_text_paragraphs: list[str] welcome_image_url: str + screen_css_url: str + print_css_url: str LibraryPageId = str @@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome: return LibreTextsHome( welcome_text_paragraphs=_get_welcome_text_from_home(soup), welcome_image_url=_get_welcome_image_url_from_home(soup), + screen_css_url=_get_screen_css_url_from_home(soup), + print_css_url=_get_print_css_url_from_home(soup), ) def get_deki_token(self) -> str: @@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent: if tree["body"][1]["@target"] != "toc": raise LibreTextsParsingError( f"Unexpected second body element of /pages/{page.id}/contents, " - f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' " + f"@target property is '{tree['body'][1]['@target']}' while only 'toc' " "is expected" ) return LibraryPageContent(html_body=tree["body"][0]) @@ -373,3 +377,26 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str: "Failed to retrieve API token to query website API, missing apiToken." 
) return x_deki_token + + +def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str: + """Returns the URL of any media CSS found on home page""" + links = soup.find_all("link", {"rel": "stylesheet", "media": media}) + if len(links) != 1: + raise LibreTextsParsingError( + f"Failed to find {media} CSS URL in home page, {len(links)} link(s) found" + ) + css_url = links[0].get("href", None) + if not css_url: + raise LibreTextsParsingError("screen CSS link has no href") + return css_url + + +def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str: + """Returns the URL of screen CSS found on home page""" + return _get_any_css_url_from_home(soup, "screen") + + +def _get_print_css_url_from_home(soup: BeautifulSoup) -> str: + """Returns the URL of print CSS found on home page""" + return _get_any_css_url_from_home(soup, "print") diff --git a/scraper/src/libretexts2zim/css.py b/scraper/src/libretexts2zim/css.py new file mode 100644 index 0000000..7e63a1c --- /dev/null +++ b/scraper/src/libretexts2zim/css.py @@ -0,0 +1,140 @@ +from collections.abc import Iterable +from pathlib import Path +from urllib.parse import urljoin, urlparse + +from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore +from tinycss2.serializer import serialize_url # pyright: ignore + +OriginalUrl = str +FullZimPath = Path +RelativeCssPath = Path + + +class CssProcessor: + """Utility to process CSS, extract assets and rewrite URLs + + This utility can process multiple CSS documents that will be stored in a ZIM. + It extracts the list of assets (images, fonts) that are used in the CSS documents + and computes appropriate ZIM paths for each of them. 
+ + Arguments: + css_target_path: "folder" where the CSS documents that will be processed will be + stored in the ZIM + css_assets_root_path: "folder" where the CSS assets referenced in the CSS + documents will be stored in the ZIM + """ + + def __init__( + self, + css_target_path: Path = Path("/content"), + css_assets_root_path: Path = Path("/content/css_assets"), + ) -> None: + self.css_target_path = css_target_path + self.css_assets_root_path = css_assets_root_path + self.css_assets: dict[OriginalUrl, FullZimPath] = {} + self.used_paths: list[RelativeCssPath] = [] + + def process(self, css_original_url: str, css_content: bytes) -> str: + rules, _ = parse_stylesheet_bytes( # pyright: ignore[reportUnknownVariableType] + css_content + ) + self._process_list( + css_original_url, + rules, # pyright: ignore[reportUnknownArgumentType] + ) + return serialize(rules) + + def _process_url( + self, css_original_url: str, css_url: str + ) -> RelativeCssPath | None: + original_url = urljoin(css_original_url, css_url) + original_url_parsed = urlparse(original_url) + if original_url_parsed.scheme.lower() not in ["http", "https"]: + return None + if original_url in self.css_assets: + return self.css_assets[original_url].relative_to(self.css_target_path) + original_path = Path(urlparse(original_url).path) + target_parent = Path( + *[ + parent.name + for parent in reversed(original_path.parents) + if parent.name and parent.name != ".." 
+ ] + ) + + index = 0 + while True: + relative_path = ( + target_parent + / f"{original_path.stem}{'_' + str(index) if index else ''}" + f"{original_path.suffix}" + ) + if relative_path not in self.used_paths: + break + index += 1 + + self.used_paths.append(relative_path) + target_path = self.css_assets_root_path / relative_path + self.css_assets[original_url] = target_path + return target_path.relative_to(self.css_target_path) + + def _process_node(self, css_original_url: str, node: ast.Node): + if isinstance( + node, + ast.QualifiedRule + | ast.SquareBracketsBlock + | ast.ParenthesesBlock + | ast.CurlyBracketsBlock, + ): + self._process_list( + css_original_url, + node.content, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType] + ) + elif isinstance(node, ast.FunctionBlock): + if node.lower_name == "url": # pyright: ignore[reportUnknownMemberType] + url_node: ast.Node = node.arguments[0] # pyright: ignore + relative_css_path = self._process_url( + css_original_url, + url_node.value, # pyright: ignore + ) + if not relative_css_path: + return + url_node.value = str(relative_css_path) # pyright: ignore + url_node.representation = ( # pyright: ignore + f'"{serialize_url(str(relative_css_path))}"' + ) + + else: + self._process_list( + css_original_url, + node.arguments, # pyright: ignore + ) + elif isinstance(node, ast.AtRule): + self._process_list( + css_original_url, + node.prelude, # pyright: ignore + ) + self._process_list( + css_original_url, + node.content, # pyright: ignore + ) + elif isinstance(node, ast.Declaration): + self._process_list( + css_original_url, + node.value, # pyright: ignore + ) + elif isinstance(node, ast.URLToken): + relative_css_path = self._process_url( + css_original_url, + node.value, # pyright: ignore + ) + if not relative_css_path: + return + node.value = str(relative_css_path) + node.representation = f"url({serialize_url(str(relative_css_path))})" + + def _process_list(self, css_original_url: str, nodes: 
Iterable[ast.Node] | None): + if not nodes: + return + for node in nodes: + self._process_node(css_original_url, node) diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 7d078d6..eafcf26 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -5,6 +5,7 @@ from pathlib import Path from pydantic import BaseModel +from requests.exceptions import HTTPError from zimscraperlib.download import ( stream_file, # pyright: ignore[reportUnknownVariableType] ) @@ -21,6 +22,7 @@ LibreTextsMetadata, ) from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger +from libretexts2zim.css import CssProcessor from libretexts2zim.ui import ( ConfigModel, PageContentModel, @@ -261,11 +263,45 @@ def run(self) -> Path: logger.info(" Fetching and storing home page...") home = self.libretexts_client.get_home() + welcome_image = BytesIO() stream_file(home.welcome_image_url, byte_stream=welcome_image) add_item_for(creator, "content/logo.png", content=welcome_image.getvalue()) del welcome_image + css_processor = CssProcessor() + screen_css = BytesIO() + stream_file(home.screen_css_url, byte_stream=screen_css) + result = css_processor.process( + css_original_url=home.screen_css_url, css_content=screen_css.getvalue() + ) + add_item_for(creator, "content/screen.css", content=result) + del screen_css + + print_css = BytesIO() + stream_file(home.print_css_url, byte_stream=print_css) + result = css_processor.process( + css_original_url=home.print_css_url, css_content=print_css.getvalue() + ) + add_item_for(creator, "content/print.css", content=result) + del print_css + + logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...") + for asset_url, asset_path in css_processor.css_assets.items(): + try: + css_asset = BytesIO() + stream_file(asset_url, byte_stream=css_asset) + add_item_for( + creator, str(asset_path)[1:], content=css_asset.getvalue() + ) + 
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM") + del css_asset + except HTTPError as exc: + # would make more sense to be a warning, but this is just too + # verbose, at least on geo.libretexts.org many assets are just + # missing + logger.debug(f"Ignoring {asset_path} due to {exc}") + logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}") for file in self.zimui_dist.rglob("*"): if file.is_dir(): @@ -278,7 +314,8 @@ def run(self) -> Path: creator=creator, path=path, content=index_html_path.read_text(encoding="utf-8").replace( - "Vite App", formatted_config.title_format + "Vite App", + f"{formatted_config.title_format}", ), mimetype="text/html", is_front=True, diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py index e6a0251..ca3eed2 100644 --- a/scraper/tests-integration/test_client.py +++ b/scraper/tests-integration/test_client.py @@ -134,3 +134,19 @@ def test_get_home_welcome_text_paragraphs( def test_get_home_page_content(client: LibreTextsClient, page_tree: LibraryTree): """Ensures we can get content of root page""" assert client.get_page_content(page_tree.root).html_body + + +def test_get_home_screen_css_url(home: LibreTextsHome): + """Ensures proper screen CSS url is retrieved""" + assert ( + home.screen_css_url + == "https://a.mtstatic.com/@cache/layout/anonymous.css?_=715eca8811db7abb8e6f0555936e020d_Z2VvLmxpYnJldGV4dHMub3Jn:site_4038" + ) + + +def test_get_home_print_css_url(home: LibreTextsHome): + """Ensures proper print CSS url is retrieved""" + assert ( + home.print_css_url + == "https://a.mtstatic.com/@cache/layout/print.css?_=99d83fb44eaebe60981933ec554d138d:site_4038" + ) diff --git a/scraper/tests/test_css.py b/scraper/tests/test_css.py new file mode 100644 index 0000000..e6dcc0b --- /dev/null +++ b/scraper/tests/test_css.py @@ -0,0 +1,294 @@ +from pathlib import Path + +import pytest + +from libretexts2zim.css import CssProcessor + + +@pytest.mark.parametrize( + 
"css_document_content, css_document_url, expected_assets, expected_css_rewritten", + [ + pytest.param( + """ +body { + background-image: url('https://example.com/image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, + """ +body { + background-image: url("css_assets/image.jpg"); +} +""", + id="basic_full", + ), + pytest.param( + """ +body { + background-image: url('/assets/image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + { + "https://www.acme.com/assets/image.jpg": Path( + "/content/css_assets/assets/image.jpg" + ) + }, + """ +body { + background-image: url("css_assets/assets/image.jpg"); +} +""", + id="basic_absolute", + ), + pytest.param( + """ +body { + background-image: url('../image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + {"https://www.acme.com/image.jpg": Path("/content/css_assets/image.jpg")}, + """ +body { + background-image: url("css_assets/image.jpg"); +} +""", + id="basic_relative1", + ), + pytest.param( + """ +body { + background-image: url('./image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + { + "https://www.acme.com/styles/image.jpg": Path( + "/content/css_assets/styles/image.jpg" + ) + }, + """ +body { + background-image: url("css_assets/styles/image.jpg"); +} +""", + id="basic_relative2", + ), + pytest.param( + """ +@import url("print.css") +""", + "https://www.acme.com/styles/main.css", + { + "https://www.acme.com/styles/print.css": Path( + "/content/css_assets/styles/print.css" + ) + }, + """ +@import url("css_assets/styles/print.css") +;""", + id="import", + ), + pytest.param( + """ +body { + background-image: url('https://example.com/image.jpg'), url('/assets/image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + { + "https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), + "https://www.acme.com/assets/image.jpg": Path( + "/content/css_assets/assets/image.jpg" + ), + }, + """ +body { 
+ background-image: url("css_assets/image.jpg"), url("css_assets/assets/image.jpg"); +} +""", + id="two_backgrounds", + ), + pytest.param( + """ +.ui-widget-content { + background: #fff url("https://example.com/banner2.png") 50% 50% repeat-x; + color: #222; +} +""", + "https://www.acme.com/styles/main.css", + { + "https://example.com/banner2.png": Path( + "/content/css_assets/banner2.png" + ), + }, + """ +.ui-widget-content { + background: #fff url("css_assets/banner2.png") 50% 50% repeat-x; + color: #222; +} +""", + id="complex_1", + ), + pytest.param( + """ +@font-face { + font-display: swap; + font-family: icomoon; + font-style: normal; + font-weight: 400; + src: url(/@style/icons/icomoon.eot?_=ae123bc); + src: url(/@style/icons/icomoon.eot?_=ae123bc#iefix) + format("embedded-opentype"), + url(/@style/icons/icomoon.woff?_=ae123bc) + format("woff"), + url(/@style/icons/icomoon.ttf?_=ae123bc) + format("truetype"), + url(/@style/icons/icomoon.svg?_=ae123bc#icomoon) + format("svg"); +} +""", + "https://www.acme.com/styles/main.css", + { + "https://www.acme.com/@style/icons/icomoon.eot?_=ae123bc": Path( + "/content/css_assets/@style/icons/icomoon.eot" + ), + "https://www.acme.com/@style/icons/icomoon.eot?_=ae123bc#iefix": Path( + "/content/css_assets/@style/icons/icomoon_1.eot" + ), + "https://www.acme.com/@style/icons/icomoon.woff?_=ae123bc": Path( + "/content/css_assets/@style/icons/icomoon.woff" + ), + "https://www.acme.com/@style/icons/icomoon.ttf?_=ae123bc": Path( + "/content/css_assets/@style/icons/icomoon.ttf" + ), + "https://www.acme.com/@style/icons/icomoon.svg?_=ae123bc#icomoon": Path( + "/content/css_assets/@style/icons/icomoon.svg" + ), + }, + """ +@font-face { + font-display: swap; + font-family: icomoon; + font-style: normal; + font-weight: 400; + src: url(css_assets/@style/icons/icomoon.eot); + src: url(css_assets/@style/icons/icomoon_1.eot) + format("embedded-opentype"), + url(css_assets/@style/icons/icomoon.woff) + format("woff"), + 
url(css_assets/@style/icons/icomoon.ttf) + format("truetype"), + url(css_assets/@style/icons/icomoon.svg) + format("svg"); +} +""", + id="complex_2", + ), + pytest.param( + """ +body { + background-image: url('https://example.com/image.jpg'); +} +div { + background-image: url('https://example.com/image.jpg'); +} +""", + "https://www.acme.com/styles/main.css", + {"https://example.com/image.jpg": Path("/content/css_assets/image.jpg")}, + """ +body { + background-image: url("css_assets/image.jpg"); +} +div { + background-image: url("css_assets/image.jpg"); +} +""", + id="duplicate", + ), + pytest.param( + """ +.magicBg { +background-image: url(data:image/gif;base64,R0lGODlhAQBkAPcAAAAAAAEBAQICAgMDAwQEBAUFBQ) +} +""", + "https://www.acme.com/styles/main.css", + {}, + """ +.magicBg { +background-image: url(data:image/gif;base64,R0lGODlhAQBkAPcAAAAAAAEBAQICAgMDAwQEBAUFBQ) +} +""", + id="ignore_data", + ), + ], +) +def test_css_processor_single_doc( + css_document_content: str, + css_document_url: str, + expected_assets: dict[str, Path], + expected_css_rewritten: str, +): + processor = CssProcessor() + result = processor.process(css_document_url, css_document_content.encode()) + assert processor.css_assets == expected_assets + assert result == expected_css_rewritten + + +def test_css_processor_multiple_docs(): + doc1 = """ +body { + background-image: url('https://example.com/image.jpg'), url('https://example.com/image.jpg?_=test1'); +} +""" + doc2 = """ +div { + background-image: url('https://example.com/image.jpg'), url('https://example.com/image.jpg?_=test2'); +} +""" + css_1_url = "https://www.acme.com/styles/main1.css" + css_2_url = "https://www.acme.com/styles/main2.css" + processor = CssProcessor() + + # process a first document + result1 = processor.process(css_original_url=css_1_url, css_content=doc1.encode()) + + assert processor.css_assets == { + "https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), + "https://example.com/image.jpg?_=test1": 
Path( + "/content/css_assets/image_1.jpg" + ), + } + + assert ( + result1 + == """ +body { + background-image: url("css_assets/image.jpg"), url("css_assets/image_1.jpg"); +} +""" + ) + + # process a second document + result2 = processor.process(css_original_url=css_2_url, css_content=doc2.encode()) + + assert processor.css_assets == { + "https://example.com/image.jpg": Path("/content/css_assets/image.jpg"), + "https://example.com/image.jpg?_=test1": Path( + "/content/css_assets/image_1.jpg" + ), + "https://example.com/image.jpg?_=test2": Path( + "/content/css_assets/image_2.jpg" + ), + } + + assert ( + result2 + == """ +div { + background-image: url("css_assets/image.jpg"), url("css_assets/image_2.jpg"); +} +""" + ) diff --git a/zimui/index.html b/zimui/index.html index a888544..4fccf60 100644 --- a/zimui/index.html +++ b/zimui/index.html @@ -1,9 +1,11 @@ - + - - - + + + + + Vite App