diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4205369..d4cefca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base. docker build -t local-libretexts2zim . ``` -Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments). +Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, with only page id 28207 and its children, but you could use any other one of interest for your UI developments). ``` -docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite +docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite ``` Extract interesting ZIM content and move it to `public` folder. 
diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index fdbd0de..97b2ec8 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "types-beautifulsoup4==4.12.0.20240907", "lxml==5.3.0", + "tinycss2==1.3.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py index a08952d..ee4fd22 100644 --- a/scraper/src/libretexts2zim/client.py +++ b/scraper/src/libretexts2zim/client.py @@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception): class LibreTextsHome(BaseModel): welcome_text_paragraphs: list[str] welcome_image_url: str + screen_css_url: str + print_css_url: str LibraryPageId = str @@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome: return LibreTextsHome( welcome_text_paragraphs=_get_welcome_text_from_home(soup), welcome_image_url=_get_welcome_image_url_from_home(soup), + screen_css_url=_get_screen_css_url_from_home(soup), + print_css_url=_get_print_css_url_from_home(soup), ) def get_deki_token(self) -> str: @@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent: if tree["body"][1]["@target"] != "toc": raise LibreTextsParsingError( f"Unexpected second body element of /pages/{page.id}/contents, " - f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' " + f"@target property is '{tree['body'][1]['@target']}' while only 'toc' " "is expected" ) return LibraryPageContent(html_body=tree["body"][0]) @@ -373,3 +377,26 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str: "Failed to retrieve API token to query website API, missing apiToken." 
) return x_deki_token + + +def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str: + """Returns the URL of any media CSS found on home page""" + links = soup.find_all("link", {"rel": "stylesheet", "media": media}) + if len(links) != 1: + raise LibreTextsParsingError( + f"Failed to find {media} CSS URL in home page, {len(links)} link(s) found" + ) + css_url = links[0].get("href", None) + if not css_url: + raise LibreTextsParsingError("screen CSS link has no href") + return css_url + + + def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str: + """Returns the URL of screen CSS found on home page""" + return _get_any_css_url_from_home(soup, "screen") + + + def _get_print_css_url_from_home(soup: BeautifulSoup) -> str: + """Returns the URL of print CSS found on home page""" + return _get_any_css_url_from_home(soup, "print") diff --git a/scraper/src/libretexts2zim/css.py b/scraper/src/libretexts2zim/css.py new file mode 100644 index 0000000..7e63a1c --- /dev/null +++ b/scraper/src/libretexts2zim/css.py @@ -0,0 +1,140 @@ +from collections.abc import Iterable +from pathlib import Path +from urllib.parse import urljoin, urlparse + +from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore +from tinycss2.serializer import serialize_url # pyright: ignore + +OriginalUrl = str +FullZimPath = Path +RelativeCssPath = Path + + +class CssProcessor: + """Utility to process CSS, extract assets and rewrite URLs + + This utility can process multiple CSS documents that will be stored in a ZIM. + It extracts the list of assets (images, fonts) that are used in the CSS documents + and computes appropriate ZIM paths for each of them. 
+ + Arguments: + css_target_path: "folder" where the CSS documents that will be processed will be + stored in the ZIM + css_assets_root_path: "folder" where the CSS assets referenced in the CSS + documents will be stored in the ZIM + """ + + def __init__( + self, + css_target_path: Path = Path("/content"), + css_assets_root_path: Path = Path("/content/css_assets"), + ) -> None: + self.css_target_path = css_target_path + self.css_assets_root_path = css_assets_root_path + self.css_assets: dict[OriginalUrl, FullZimPath] = {} + self.used_paths: list[RelativeCssPath] = [] + + def process(self, css_original_url: str, css_content: bytes) -> str: + rules, _ = parse_stylesheet_bytes( # pyright: ignore[reportUnknownVariableType] + css_content + ) + self._process_list( + css_original_url, + rules, # pyright: ignore[reportUnknownArgumentType] + ) + return serialize(rules) + + def _process_url( + self, css_original_url: str, css_url: str + ) -> RelativeCssPath | None: + original_url = urljoin(css_original_url, css_url) + original_url_parsed = urlparse(original_url) + if original_url_parsed.scheme.lower() not in ["http", "https"]: + return None + if original_url in self.css_assets: + return self.css_assets[original_url].relative_to(self.css_target_path) + original_path = Path(urlparse(original_url).path) + target_parent = Path( + *[ + parent.name + for parent in reversed(original_path.parents) + if parent.name and parent.name != ".." 
+ ] + ) + + index = 0 + while True: + relative_path = ( + target_parent + / f"{original_path.stem}{'_' + str(index) if index else ''}" + f"{original_path.suffix}" + ) + if relative_path not in self.used_paths: + break + index += 1 + + self.used_paths.append(relative_path) + target_path = self.css_assets_root_path / relative_path + self.css_assets[original_url] = target_path + return target_path.relative_to(self.css_target_path) + + def _process_node(self, css_original_url: str, node: ast.Node): + if isinstance( + node, + ast.QualifiedRule + | ast.SquareBracketsBlock + | ast.ParenthesesBlock + | ast.CurlyBracketsBlock, + ): + self._process_list( + css_original_url, + node.content, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType] + ) + elif isinstance(node, ast.FunctionBlock): + if node.lower_name == "url": # pyright: ignore[reportUnknownMemberType] + url_node: ast.Node = node.arguments[0] # pyright: ignore + relative_css_path = self._process_url( + css_original_url, + url_node.value, # pyright: ignore + ) + if not relative_css_path: + return + url_node.value = str(relative_css_path) # pyright: ignore + url_node.representation = ( # pyright: ignore + f'"{serialize_url(str(relative_css_path))}"' + ) + + else: + self._process_list( + css_original_url, + node.arguments, # pyright: ignore + ) + elif isinstance(node, ast.AtRule): + self._process_list( + css_original_url, + node.prelude, # pyright: ignore + ) + self._process_list( + css_original_url, + node.content, # pyright: ignore + ) + elif isinstance(node, ast.Declaration): + self._process_list( + css_original_url, + node.value, # pyright: ignore + ) + elif isinstance(node, ast.URLToken): + relative_css_path = self._process_url( + css_original_url, + node.value, # pyright: ignore + ) + if not relative_css_path: + return + node.value = str(relative_css_path) + node.representation = f"url({serialize_url(str(relative_css_path))})" + + def _process_list(self, css_original_url: str, nodes: 
Iterable[ast.Node] | None): + if not nodes: + return + for node in nodes: + self._process_node(css_original_url, node) diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py index 7d078d6..eafcf26 100644 --- a/scraper/src/libretexts2zim/processor.py +++ b/scraper/src/libretexts2zim/processor.py @@ -5,6 +5,7 @@ from pathlib import Path from pydantic import BaseModel +from requests.exceptions import HTTPError from zimscraperlib.download import ( stream_file, # pyright: ignore[reportUnknownVariableType] ) @@ -21,6 +22,7 @@ LibreTextsMetadata, ) from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger +from libretexts2zim.css import CssProcessor from libretexts2zim.ui import ( ConfigModel, PageContentModel, @@ -261,11 +263,45 @@ def run(self) -> Path: logger.info(" Fetching and storing home page...") home = self.libretexts_client.get_home() + welcome_image = BytesIO() stream_file(home.welcome_image_url, byte_stream=welcome_image) add_item_for(creator, "content/logo.png", content=welcome_image.getvalue()) del welcome_image + css_processor = CssProcessor() + screen_css = BytesIO() + stream_file(home.screen_css_url, byte_stream=screen_css) + result = css_processor.process( + css_original_url=home.screen_css_url, css_content=screen_css.getvalue() + ) + add_item_for(creator, "content/screen.css", content=result) + del screen_css + + print_css = BytesIO() + stream_file(home.print_css_url, byte_stream=print_css) + result = css_processor.process( + css_original_url=home.print_css_url, css_content=print_css.getvalue() + ) + add_item_for(creator, "content/print.css", content=result) + del print_css + + logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...") + for asset_url, asset_path in css_processor.css_assets.items(): + try: + css_asset = BytesIO() + stream_file(asset_url, byte_stream=css_asset) + add_item_for( + creator, str(asset_path)[1:], content=css_asset.getvalue() + ) + 
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM") + del css_asset + except HTTPError as exc: + # would make more sense to be a warning, but this is just too + # verbose, at least on geo.libretexts.org many assets are just + # missing + logger.debug(f"Ignoring {asset_path} due to {exc}") + logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}") for file in self.zimui_dist.rglob("*"): if file.is_dir(): @@ -278,7 +314,8 @@ def run(self) -> Path: creator=creator, path=path, content=index_html_path.read_text(encoding="utf-8").replace( - "