Add API and caching #4

Merged 3 commits on Oct 3, 2024

4 changes: 3 additions & 1 deletion Dockerfile
@@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
# Copy zimui build output
COPY --from=zimui /src/dist /src/zimui

ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui
ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
LIBRETEXTS_OUTPUT=/output \
LIBRETEXTS_TMP=/tmp

CMD ["libretexts2zim", "--help"]
1 change: 0 additions & 1 deletion scraper/pyproject.toml
@@ -47,7 +47,6 @@ dev = [

[project.scripts]
libretexts2zim = "libretexts2zim.__main__:main"
libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main"

[tool.hatch.version]
path = "src/libretexts2zim/__about__.py"
13 changes: 8 additions & 5 deletions scraper/src/libretexts2zim/__main__.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python3
# vim: ai ts=4 sts=4 et sw=4 nu
import tempfile


import sys
from libretexts2zim.entrypoint import main as entrypoint



def main():

with tempfile.TemporaryDirectory() as tmpdir:
entrypoint(tmpdir)


from libretexts2zim.entrypoint import main

if __name__ == "__main__":
sys.exit(main())
main()
144 changes: 136 additions & 8 deletions scraper/src/libretexts2zim/client.py
@@ -1,14 +1,18 @@
import datetime
import json
import re
from collections.abc import Callable
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
from pydantic import BaseModel

from libretexts2zim.constants import logger

HTTP_TIMEOUT_SECONDS = 15
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30


class LibreTextsParsingError(Exception):
@@ -50,48 +54,152 @@
class LibreTextsClient:
"""Utility functions to read data from libretexts."""

def __init__(self, library_slug: str) -> None:
def __init__(self, library_slug: str, cache_folder: Path) -> None:
"""Initializes LibreTextsClient.

Parameters:
library_slug: Slug of the LibreTexts library,
e.g. `geo` for `https://geo.libretexts.org/`.
"""
self.library_slug = library_slug
self.deki_token = None
self.cache_folder = cache_folder


@property
def library_url(self) -> str:
return f"https://{self.library_slug}.libretexts.org/"
return f"https://{self.library_slug}.libretexts.org"


def _get_text(self, url: str) -> str:
@property
def api_url(self) -> str:
return f"{self.library_url}/@api/deki"


def _get_cache_file(self, url_subpath_and_query: str) -> Path:
"""Get location where HTTP result should be cached"""
url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)

if url_subpath_and_query.endswith("/"):
url_subpath_and_query += "index"
return self.cache_folder / url_subpath_and_query


def _get_text(self, url_subpath_and_query: str) -> str:
"""Perform a GET request and return the response as decoded text."""

logger.debug(f"Fetching {url}")
cache_file = self._get_cache_file(f"text{url_subpath_and_query}")

if cache_file.exists():
return cache_file.read_text()
cache_file.parent.mkdir(parents=True, exist_ok=True)


full_url = f"{self.library_url}{url_subpath_and_query}"
logger.debug(f"Fetching {full_url}")


resp = requests.get(
url=url,
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_SECONDS,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
resp.raise_for_status()

cache_file.write_text(resp.text)

return resp.text

def _get_api_resp(
self, api_sub_path_and_query: str, timeout: float
) -> requests.Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = requests.get(

url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
)
resp.raise_for_status()
return resp


def _get_api_json(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}")

if cache_file.exists():
return json.loads(cache_file.read_text())
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(

f"{api_sub_path}?dream.out.format=json", timeout=timeout
)
result = resp.json()
cache_file.write_text(json.dumps(result))
return result


def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> bytes | Any:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")

if cache_file.exists():
return cache_file.read_bytes()
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(api_sub_path, timeout=timeout)
result = resp.content
cache_file.write_bytes(result)
return result


def get_home(self) -> LibreTextsHome:
home_content = self._get_text(self.library_url)
"""Retrieves data about home page by crawling home page"""
home_content = self._get_text("/")


soup = _get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)

return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
)

def get_deki_token(self) -> str:
"""Retrieves the API token to use to query the website API"""
if self.deki_token:
return self.deki_token


home_content = self._get_text("/")


soup = _get_soup(home_content)
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token


def get_all_pages_ids(self):
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)


page_ids: list[str] = []


def _get_page_ids(page_node: Any) -> None:
page_ids.append(page_node["@id"])

if not page_node["subpages"]:
return

if "@id" in page_node["subpages"]["page"]:
_get_page_ids(page_node["subpages"]["page"])

else:
for page in page_node["subpages"]["page"]:
_get_page_ids(page)


_get_page_ids(tree["page"])


return page_ids


def get_root_page_id(self) -> str:
"""Returns the ID the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
return tree["page"]["@id"]



def _get_soup(content: str) -> BeautifulSoup:
"""Return a BeautifulSoup soup from textual content

This is a utility function to ensure the same parser is used across the whole codebase
"""
return BeautifulSoup(content, "lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
if not branding_div:
raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
Expand All @@ -111,6 +219,7 @@


def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
"""Returns the text found on home page"""
content_section = soup.find("section", class_="mt-content-container")
if not content_section or isinstance(content_section, NavigableString):
raise LibreTextsParsingError(
Expand All @@ -121,3 +230,22 @@
if paragraph_text := paragraph.text:
welcome_text.append(paragraph_text)
return welcome_text


def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
global_settings = soup.find("script", id="mt-global-settings")

if not global_settings:
logger.debug("home content:")
logger.debug(soup)
raise Exception(

"Failed to retrieve API token to query website API, missing "
"mt-global-settings script"
)
x_deki_token = json.loads(global_settings.text).get("apiToken", None)

if not x_deki_token:
logger.debug("mt-global-settings script content:")
logger.debug(global_settings.text)
raise Exception(

"Failed to retrieve API token to query website API, missing apiToken."
)
return x_deki_token

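For context, a minimal usage sketch of the caching client added in this PR (the slug and cache path below are illustrative values, not taken from the diff):

from pathlib import Path

from libretexts2zim.client import LibreTextsClient

# Illustrative slug and cache location; any LibreTexts library and writable folder works.
client = LibreTextsClient(library_slug="geo", cache_folder=Path("/tmp/cache"))
client.get_deki_token()  # crawls the home page once to obtain the x-deki-token
root_page_id = client.get_root_page_id()  # tree response is cached under /tmp/cache/api_json/...
page_ids = client.get_all_pages_ids()  # subsequent calls/runs read the cached JSON instead of the network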
34 changes: 29 additions & 5 deletions scraper/src/libretexts2zim/entrypoint.py
@@ -8,6 +8,7 @@
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
RECOMMENDED_MAX_TITLE_LENGTH,
)
from zimscraperlib.zim.filesystem import validate_zimfile_creatable

from libretexts2zim.client import LibreTextsClient
from libretexts2zim.constants import (
@@ -137,7 +138,7 @@
)


def main() -> None:
def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
prog=NAME,
)
@@ -177,6 +178,13 @@
dest="output_folder",
)

parser.add_argument(

"--tmp",
help="Temporary folder for cache, intermediate files, ... Default: tmp",
default=os.getenv("LIBRETEXTS_TMP", tmpdir),
dest="tmp_folder",
)

parser.add_argument(
"--debug", help="Enable verbose output", action="store_true", default=False
)
@@ -191,15 +199,35 @@
default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"),
)

parser.add_argument(

"--keep-cache",
help="Keep cache of website responses",
action="store_true",
default=False,
)

args = parser.parse_args()

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)

output_folder = Path(args.output_folder)
output_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(output_folder, "test.txt")


tmp_folder = Path(args.tmp_folder)
tmp_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(tmp_folder, "test.txt")


try:
zim_config = ZimConfig.of(args)
doc_filter = ContentFilter.of(args)

cache_folder = tmp_folder / "cache"
cache_folder.mkdir()


libretexts_client = LibreTextsClient(
library_slug=args.library_slug,
cache_folder=cache_folder,
)

Processor(
@@ -217,7 +245,3 @@
logger.exception(exc)
logger.error(f"Generation failed with the following error: {exc}")
raise SystemExit(1) from exc


if __name__ == "__main__":
main()
17 changes: 11 additions & 6 deletions scraper/src/libretexts2zim/processor.py
@@ -9,6 +9,7 @@
)
from zimscraperlib.image import resize_image
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
@@ -117,8 +118,6 @@
self.zimui_dist = zimui_dist
self.overwrite_existing_zim = overwrite_existing_zim

self.output_folder.mkdir(exist_ok=True)

self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)
@@ -145,11 +144,17 @@
name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
)
formatted_config = self.zim_config.format(metadata.placeholders())
zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
zim_file_name = f"{formatted_config.file_name_format}.zim"
zim_path = self.output_folder / zim_file_name


if zim_path.exists():
if self.overwrite_existing_zim:
zim_path.unlink()

else:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(2)


if zim_path.exists() and not self.overwrite_existing_zim:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(2)
validate_zimfile_creatable(self.output_folder, zim_file_name)


logger.info(f" Writing to: {zim_path}")

6 changes: 6 additions & 0 deletions scraper/tests-integration/README.md
@@ -0,0 +1,6 @@
This folder contains integration tests checking how the scraper behaves:

- with a real LibreTexts website
- from end to end

They are meant to be run from the scraper Docker image in GitHub workflow(s).
11 changes: 11 additions & 0 deletions scraper/tests-integration/conftest.py
@@ -1,3 +1,8 @@
import tempfile
from collections.abc import Generator
from pathlib import Path
from typing import Any

import pytest


@@ -6,6 +11,12 @@ def libretexts_slug() -> str:
return "geo"


@pytest.fixture(scope="module")
def cache_folder() -> Generator[Path, Any, Any]:
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)


@pytest.fixture(scope="module")
def libretexts_url(libretexts_slug: str) -> str:
return f"https://{libretexts_slug}.libretexts.org"