From a04e45ef5b5067cfbf0c38f5c99e28ece6282741 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Mon, 30 Sep 2024 13:50:44 +0000
Subject: [PATCH] Retrieve list of page IDs and root of the tree from API, and
 introduce caching

---
 scraper/src/libretexts2zim/client.py     | 140 ++++++++++++++++++++++-
 scraper/src/libretexts2zim/entrypoint.py |  36 +++++-
 scraper/src/libretexts2zim/generator.py  |  20 ++--
 scraper/tests-integration/conftest.py    |  11 ++
 scraper/tests-integration/test_client.py |  42 ++++++-
 5 files changed, 230 insertions(+), 19 deletions(-)

diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index a88c8d7..13ebe71 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -1,6 +1,9 @@
 import datetime
+import json
 import re
 from collections.abc import Callable
+from pathlib import Path
+from typing import Any
 
 import requests
 from bs4 import BeautifulSoup, NavigableString
@@ -57,7 +60,7 @@ def placeholders(
 class LibreTextsClient:
     """Utility functions to read data from libretexts."""
 
-    def __init__(self, library_slug: str) -> None:
+    def __init__(self, library_slug: str, cache_folder: Path) -> None:
         """Initializes LibreTextsClient.
 
         Paremters:
@@ -65,41 +68,146 @@ def __init__(self, library_slug: str) -> None:
                 e.g. `https://geo.libretexts.org/`.
         """
         self.library_slug = library_slug
+        self.deki_token = None
+        self.cache_folder = cache_folder
 
     @property
     def library_url(self) -> str:
-        return f"https://{self.library_slug}.libretexts.org/"
+        return f"https://{self.library_slug}.libretexts.org"
 
-    def _get_text(self, url: str) -> str:
+    @property
+    def api_url(self) -> str:
+        return f"{self.library_url}/@api/deki"
+
+    def _get_cache_file(self, url_subpath_and_query: str) -> Path:
+        """Map a URL sub-path (and query) to its file inside the cache folder"""
+        relative = url_subpath_and_query.removeprefix("/")
+        if relative.endswith("/"):
+            # folder-like sub-paths are stored under an "index" file
+            relative += "index"
+        return self.cache_folder / relative
+
+    def _get_text(self, url_subpath_and_query: str) -> str:
         """Perform a GET request and return the response as decoded text."""
 
-        logger.debug(f"Fetching {url}")
+        cache_file = self._get_cache_file(f"text{url_subpath_and_query}")
+        if cache_file.exists():  # cached as UTF-8, whatever the locale
+            return cache_file.read_text(encoding="utf-8")
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+        full_url = f"{self.library_url}{url_subpath_and_query}"
+        logger.debug(f"Fetching {full_url}")
 
         resp = requests.get(
-            url=url,
+            url=full_url,
             allow_redirects=True,
             timeout=HTTP_TIMEOUT_SECONDS,
         )
         resp.raise_for_status()
 
+        cache_file.write_text(resp.text, encoding="utf-8")
         return resp.text
 
+    def _get_api_resp(
+        self, api_sub_path_and_query: str, timeout: float
+    ) -> requests.Response:
+        api_url = f"{self.api_url}{api_sub_path_and_query}"
+        logger.debug(f"Calling API at {api_url}")
+        resp = requests.get(
+            url=api_url,
+            headers={"x-deki-token": self.get_deki_token()},  # fetched if missing
+            timeout=timeout,
+        )
+        resp.raise_for_status()
+        return resp
+
+    def _get_api_json(
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
+    ) -> Any:
+        cache_file = self._get_cache_file(f"api_json{api_sub_path}")
+        if cache_file.exists():  # cache hit: reuse stored JSON, no API call
+            return json.loads(cache_file.read_text())
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+        resp = self._get_api_resp(
+            f"{api_sub_path}?dream.out.format=json", timeout=timeout
+        )
+        result = resp.json()
+        cache_file.write_text(json.dumps(result))  # stored for subsequent calls
+        return result
+
+    def _get_api_content(
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
+    ) -> bytes | Any:
+        cache_file = self._get_cache_file(f"api_content{api_sub_path}")
+        if cache_file.exists():
+            return cache_file.read_bytes()  # cache holds raw bytes, not JSON
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+        resp = self._get_api_resp(api_sub_path, timeout=timeout)
+        result = resp.content
+        cache_file.write_bytes(result)
+        return result
+
     def get_home(self) -> LibreTextsHome:
-        home_content = self._get_text(self.library_url)
+        """Crawl home page for its data; also store the API token found there"""
+        home_content = self._get_text("/")
 
         soup = _get_soup(home_content)
+        self.deki_token = _get_deki_token_from_home(soup)
         return LibreTextsHome(
             welcome_text_paragraphs=_get_welcome_text_from_home(soup),
             welcome_image_url=_get_welcome_image_url_from_home(soup),
             shelves=[],
         )
 
+    def get_deki_token(self) -> str:
+        """Return the website API token, fetching it from home page if needed.
+
+        Once retrieved, the token is kept on the instance for later calls.
+        """
+        if not self.deki_token:
+            # token is embedded in the (possibly cached) home page
+            home_soup = _get_soup(self._get_text("/"))
+            self.deki_token = _get_deki_token_from_home(home_soup)
+        return self.deki_token
+
+    def get_all_pages_ids(self) -> list[str]:
+        """Returns the IDs of all pages on current website, exploring the whole tree"""
+
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
+
+        page_ids: list[str] = []
+
+        def _get_page_ids(page_node: Any) -> None:
+            page_ids.append(page_node["@id"])
+            if not page_node["subpages"]:  # leaf: nothing below this page
+                return
+            if "@id" in page_node["subpages"]["page"]:  # a single child page
+                _get_page_ids(page_node["subpages"]["page"])
+            else:
+                for page in page_node["subpages"]["page"]:
+                    _get_page_ids(page)
+
+        _get_page_ids(tree["page"])
+
+        return page_ids
+
+    def get_root_page_id(self) -> str:
+        """Returns the ID of the root of the tree of pages"""
+
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
+        return tree["page"]["@id"]
+
 
 def _get_soup(content: str) -> BeautifulSoup:
+    """Return a BeautifulSoup soup from textual content
+
+    This is a utility function to ensure same parser is used in the whole codebase
+    """
     return BeautifulSoup(content, "html.parser")
 
 
 def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
+    """Return the URL of the image found on home header"""
     branding_div = soup.find("div", class_="LTBranding")
     if not branding_div:
         raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
@@ -119,6 +227,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
 
 
 def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
+    """Returns the text found on home page"""
     content_section = soup.find("section", class_="mt-content-container")
     if not content_section or isinstance(content_section, NavigableString):
         raise LibreTextsParsingError(
@@ -133,3 +242,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
         if paragraph_text := paragraph.text:
             welcome_text.append(paragraph_text)
     return welcome_text
+
+
+def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
+    global_settings = soup.find("script", id="mt-global-settings")  # JSON blob
+    if not global_settings:
+        logger.debug("home content:")
+        logger.debug(soup)
+        raise LibreTextsParsingError(
+            "Failed to retrieve API token to query website API, missing "
+            "mt-global-settings script"
+        )
+    x_deki_token = json.loads(global_settings.text).get("apiToken", None)
+    if not x_deki_token:
+        logger.debug("mt-global-settings script content:")
+        logger.debug(global_settings.text)
+        raise LibreTextsParsingError(
+            "Failed to retrieve API token to query website API, missing apiToken."
+        )
+    return x_deki_token
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 2ed9c37..17fe82c 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -1,6 +1,9 @@
 import argparse
 import logging
 import os
+from pathlib import Path
+
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 
 from libretexts2zim.client import LibreTextsClient
 from libretexts2zim.constants import (
@@ -46,11 +49,18 @@ def main() -> None:
 
     parser.add_argument(
         "--output",
-        help="Output folder for ZIMs. Default: /output",
-        default="/output",
+        help="Output folder for ZIMs. Default: output",
+        default=os.getenv("LIBRETEXTS_OUTPUT", "output"),
         dest="output_folder",
     )
 
+    parser.add_argument(
+        "--tmp",
+        help="Temporary folder for cache, intermediate files, ... Default: tmp",
+        default=os.getenv("LIBRETEXTS_TMP", "tmp"),
+        dest="tmp_folder",
+    )
+
     parser.add_argument(
         "--zimui-dist",
         type=str,
@@ -84,21 +94,41 @@ def main() -> None:
         required=True,
     )
 
+    parser.add_argument(
+        "--keep-cache",
+        help="Keep cache of website responses",
+        action="store_true",
+        default=False,
+    )
+
     args = parser.parse_args()
 
     logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
 
+    output_folder = Path(args.output_folder)
+    output_folder.mkdir(parents=True, exist_ok=True)
+    validate_zimfile_creatable(output_folder, "test.txt")
+
+    tmp_folder = Path(args.tmp_folder)
+    tmp_folder.mkdir(parents=True, exist_ok=True)
+    validate_zimfile_creatable(tmp_folder, "test.txt")
+
     try:
         zim_config = ZimConfig.of(args)
         doc_filter = ContentFilter.of(args)
+
+        cache_folder = tmp_folder / "cache"
+        cache_folder.mkdir(exist_ok=True)  # may remain from a previous run
+
         libretexts_client = LibreTextsClient(
             library_slug=args.library_slug,
+            cache_folder=cache_folder,
         )
 
         Generator(
             libretexts_client=libretexts_client,
             zim_config=zim_config,
-            output_folder=args.output_folder,
+            output_folder=output_folder,
             zimui_dist=args.zimui_dist,
             content_filter=doc_filter,
             overwrite_existing_zim=args.overwrite,
diff --git a/scraper/src/libretexts2zim/generator.py b/scraper/src/libretexts2zim/generator.py
index 314e03c..e3a4417 100644
--- a/scraper/src/libretexts2zim/generator.py
+++ b/scraper/src/libretexts2zim/generator.py
@@ -1,6 +1,5 @@
 import argparse
 import datetime
-import os
 import re
 from io import BytesIO
 from pathlib import Path
@@ -11,6 +10,7 @@
 )
 from zimscraperlib.image import resize_image
 from zimscraperlib.zim import Creator
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 from zimscraperlib.zim.indexing import IndexData
 
 from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
@@ -107,7 +107,7 @@ def __init__(
         libretexts_client: LibreTextsClient,
         zim_config: ZimConfig,
         content_filter: ContentFilter,
-        output_folder: str,
+        output_folder: Path,
         zimui_dist: str,
         *,
         overwrite_existing_zim: bool,
@@ -129,8 +129,6 @@ def __init__(
         self.zimui_dist = Path(zimui_dist)
         self.overwrite_existing_zim = overwrite_existing_zim
 
-        os.makedirs(self.output_folder, exist_ok=True)
-
         self.zim_illustration_path = self.libretexts_newsite_path(
             "header_logo_mini.png"
         )
@@ -157,11 +155,17 @@ def run(self) -> Path:
             name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
         )
         formatted_config = self.zim_config.format(metadata.placeholders())
-        zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
+        zim_file_name = f"{formatted_config.file_name_format}.zim"
+        zim_path = self.output_folder / zim_file_name
+
+        if zim_path.exists():
+            if self.overwrite_existing_zim:
+                zim_path.unlink()
+            else:
+                logger.error(f"  {zim_path} already exists, aborting.")
+                raise SystemExit(f"ZIM file already exists at {zim_path}")
 
-        if zim_path.exists() and not self.overwrite_existing_zim:
-            logger.error(f"  {zim_path} already exists, aborting.")
-            raise SystemExit(f"ZIM file already exists at {zim_path}")
+        validate_zimfile_creatable(self.output_folder, zim_file_name)
 
         logger.info(f"  Writing to: {zim_path}")
 
diff --git a/scraper/tests-integration/conftest.py b/scraper/tests-integration/conftest.py
index 98d237a..250300d 100644
--- a/scraper/tests-integration/conftest.py
+++ b/scraper/tests-integration/conftest.py
@@ -1,3 +1,8 @@
+import tempfile
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+
 import pytest
 
 
@@ -6,6 +11,12 @@ def libretexts_slug() -> str:
     return "geo"
 
 
+@pytest.fixture(scope="module")
+def cache_folder() -> Generator[Path, Any, Any]:
+    with tempfile.TemporaryDirectory() as tmpdir:  # deleted on fixture teardown
+        yield Path(tmpdir)
+
+
 @pytest.fixture(scope="module")
 def libretexts_url(libretexts_slug: str) -> str:
     return f"https://{libretexts_slug}.libretexts.org"
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index 2e4889c..5cf9ebf 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -1,4 +1,5 @@
 import io
+from pathlib import Path
 
 import pytest
 from zimscraperlib.download import (
@@ -10,8 +11,8 @@
 
 
 @pytest.fixture(scope="module")
-def client(libretexts_slug: str) -> LibreTextsClient:
-    return LibreTextsClient(library_slug=libretexts_slug)
+def client(libretexts_slug: str, cache_folder: Path) -> LibreTextsClient:
+    return LibreTextsClient(library_slug=libretexts_slug, cache_folder=cache_folder)
 
 
 @pytest.fixture(scope="module")
@@ -19,6 +20,43 @@ def home(client: LibreTextsClient) -> LibreTextsHome:
     return client.get_home()
 
 
+@pytest.fixture(scope="module")
+def deki_token(client: LibreTextsClient) -> str:
+    return client.get_deki_token()
+
+
+@pytest.fixture(scope="module")
+def minimum_number_of_pages() -> int:
+    return 8000
+
+
+@pytest.fixture(scope="module")
+def root_page_id() -> str:
+    return "34"
+
+
+def test_get_deki_token(deki_token: str):
+    """Ensures we manage to retrieve a deki_token"""
+    assert deki_token
+
+
+def test_get_all_pages_ids(
+    client: LibreTextsClient,
+    minimum_number_of_pages: int,
+    deki_token: str,  # noqa: ARG001
+):
+    pages_ids = client.get_all_pages_ids()
+    assert len(pages_ids) > minimum_number_of_pages
+
+
+def test_get_root_page_id(
+    client: LibreTextsClient,
+    root_page_id: str,
+    deki_token: str,  # noqa: ARG001
+):
+    assert client.get_root_page_id() == root_page_id
+
+
 def test_get_home_image_url(home: LibreTextsHome):
     """Ensures proper image url is retrieved"""
     assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"