diff --git a/Dockerfile b/Dockerfile
index ba36662..0713679 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
# Copy zimui build output
COPY --from=zimui /src/dist /src/zimui
-ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui
+ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
+ LIBRETEXTS_OUTPUT=/output \
+ LIBRETEXTS_TMP=/tmp
CMD ["libretexts2zim", "--help"]
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..e3ae2cc
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,11 @@
+coverage:
+ status:
+ project:
+ default:
+ informational: true
+ patch:
+ default:
+ informational: true
+ changes:
+ default:
+ informational: true
diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml
index b557333..fdbd0de 100644
--- a/scraper/pyproject.toml
+++ b/scraper/pyproject.toml
@@ -47,7 +47,6 @@ dev = [
[project.scripts]
libretexts2zim = "libretexts2zim.__main__:main"
-libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main"
[tool.hatch.version]
path = "src/libretexts2zim/__about__.py"
diff --git a/scraper/src/libretexts2zim/__main__.py b/scraper/src/libretexts2zim/__main__.py
index 9ee85b7..56d7e47 100644
--- a/scraper/src/libretexts2zim/__main__.py
+++ b/scraper/src/libretexts2zim/__main__.py
@@ -1,9 +1,12 @@
-#!/usr/bin/env python3
-# vim: ai ts=4 sts=4 et sw=4 nu
+import tempfile
-import sys
+from libretexts2zim.entrypoint import main as entrypoint
+
+
+def main():
+ with tempfile.TemporaryDirectory() as tmpdir:
+ entrypoint(tmpdir)
-from libretexts2zim.entrypoint import main
if __name__ == "__main__":
- sys.exit(main())
+ main()
diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index c6ea083..794755d 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -1,6 +1,9 @@
import datetime
+import json
import re
from collections.abc import Callable
+from pathlib import Path
+from typing import Any
import requests
from bs4 import BeautifulSoup, NavigableString
@@ -8,7 +11,8 @@
from libretexts2zim.constants import logger
-HTTP_TIMEOUT_SECONDS = 15
+HTTP_TIMEOUT_NORMAL_SECONDS = 15
+HTTP_TIMEOUT_LONG_SECONDS = 30
class LibreTextsParsingError(Exception):
@@ -50,7 +54,7 @@ def placeholders(
class LibreTextsClient:
"""Utility functions to read data from libretexts."""
- def __init__(self, library_slug: str) -> None:
+ def __init__(self, library_slug: str, cache_folder: Path) -> None:
"""Initializes LibreTextsClient.
Paremters:
@@ -58,40 +62,144 @@ def __init__(self, library_slug: str) -> None:
e.g. `https://geo.libretexts.org/`.
"""
self.library_slug = library_slug
+ self.deki_token = None
+ self.cache_folder = cache_folder
@property
def library_url(self) -> str:
- return f"https://{self.library_slug}.libretexts.org/"
+ return f"https://{self.library_slug}.libretexts.org"
- def _get_text(self, url: str) -> str:
+ @property
+ def api_url(self) -> str:
+ return f"{self.library_url}/@api/deki"
+
+ def _get_cache_file(self, url_subpath_and_query: str) -> Path:
+ """Get location where HTTP result should be cached"""
+ url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)
+ if url_subpath_and_query.endswith("/"):
+ url_subpath_and_query += "index"
+ return self.cache_folder / url_subpath_and_query
+
+ def _get_text(self, url_subpath_and_query: str) -> str:
"""Perform a GET request and return the response as decoded text."""
- logger.debug(f"Fetching {url}")
+ cache_file = self._get_cache_file(f"text{url_subpath_and_query}")
+ if cache_file.exists():
+ return cache_file.read_text()
+ cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+ full_url = f"{self.library_url}{url_subpath_and_query}"
+ logger.debug(f"Fetching {full_url}")
resp = requests.get(
- url=url,
+ url=full_url,
allow_redirects=True,
- timeout=HTTP_TIMEOUT_SECONDS,
+ timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
resp.raise_for_status()
+ cache_file.write_text(resp.text)
return resp.text
+ def _get_api_resp(
+ self, api_sub_path_and_query: str, timeout: float
+ ) -> requests.Response:
+ api_url = f"{self.api_url}{api_sub_path_and_query}"
+ logger.debug(f"Calling API at {api_url}")
+ resp = requests.get(
+ url=api_url,
+ headers={"x-deki-token": self.deki_token},
+ timeout=timeout,
+ )
+ resp.raise_for_status()
+ return resp
+
+ def _get_api_json(
+ self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
+ ) -> Any:
+ cache_file = self._get_cache_file(f"api_json{api_sub_path}")
+ if cache_file.exists():
+ return json.loads(cache_file.read_text())
+ cache_file.parent.mkdir(parents=True, exist_ok=True)
+ resp = self._get_api_resp(
+ f"{api_sub_path}?dream.out.format=json", timeout=timeout
+ )
+ result = resp.json()
+ cache_file.write_text(json.dumps(result))
+ return result
+
+ def _get_api_content(
+ self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
+ ) -> bytes | Any:
+ cache_file = self._get_cache_file(f"api_content{api_sub_path}")
+ if cache_file.exists():
+ return cache_file.read_bytes()
+ cache_file.parent.mkdir(parents=True, exist_ok=True)
+ resp = self._get_api_resp(api_sub_path, timeout=timeout)
+ result = resp.content
+ cache_file.write_bytes(result)
+ return result
+
def get_home(self) -> LibreTextsHome:
- home_content = self._get_text(self.library_url)
+ """Retrieves data about home page by crawling home page"""
+ home_content = self._get_text("/")
soup = _get_soup(home_content)
+ self.deki_token = _get_deki_token_from_home(soup)
return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
)
+ def get_deki_token(self) -> str:
+ """Retrieves the API token to use to query the website API"""
+ if self.deki_token:
+ return self.deki_token
+
+ home_content = self._get_text("/")
+
+ soup = _get_soup(home_content)
+ self.deki_token = _get_deki_token_from_home(soup)
+ return self.deki_token
+
+ def get_all_pages_ids(self):
+ """Returns the IDs of all pages on current website, exploring the whole tree"""
+
+ tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
+
+ page_ids: list[str] = []
+
+ def _get_page_ids(page_node: Any) -> None:
+ page_ids.append(page_node["@id"])
+ if not page_node["subpages"]:
+ return
+ if "@id" in page_node["subpages"]["page"]:
+ _get_page_ids(page_node["subpages"]["page"])
+ else:
+ for page in page_node["subpages"]["page"]:
+ _get_page_ids(page)
+
+ _get_page_ids(tree["page"])
+
+ return page_ids
+
+ def get_root_page_id(self) -> str:
+ """Returns the ID the root of the tree of pages"""
+
+ tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
+ return tree["page"]["@id"]
+
def _get_soup(content: str) -> BeautifulSoup:
+ """Return a BeautifulSoup soup from textual content
+
+ This is a utility function to ensure same parser is used in the whole codebase
+ """
return BeautifulSoup(content, "lxml")
def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
+ """Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
if not branding_div:
        raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
@@ -111,6 +219,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
+ """Returns the text found on home page"""
content_section = soup.find("section", class_="mt-content-container")
if not content_section or isinstance(content_section, NavigableString):
raise LibreTextsParsingError(
@@ -121,3 +230,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
if paragraph_text := paragraph.text:
welcome_text.append(paragraph_text)
return welcome_text
+
+
+def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
+ global_settings = soup.find("script", id="mt-global-settings")
+ if not global_settings:
+ logger.debug("home content:")
+ logger.debug(soup)
+ raise Exception(
+ "Failed to retrieve API token to query website API, missing "
+ "mt-global-settings script"
+ )
+ x_deki_token = json.loads(global_settings.text).get("apiToken", None)
+ if not x_deki_token:
+ logger.debug("mt-global-settings script content:")
+ logger.debug(global_settings.text)
+ raise Exception(
+ "Failed to retrieve API token to query website API, missing apiToken."
+ )
+ return x_deki_token
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 7edcf76..1d8ef52 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -8,6 +8,7 @@
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
RECOMMENDED_MAX_TITLE_LENGTH,
)
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from libretexts2zim.client import LibreTextsClient
from libretexts2zim.constants import (
@@ -137,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
)
-def main() -> None:
+def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
prog=NAME,
)
@@ -177,6 +178,13 @@ def main() -> None:
dest="output_folder",
)
+ parser.add_argument(
+ "--tmp",
+ help="Temporary folder for cache, intermediate files, ... Default: tmp",
+ default=os.getenv("LIBRETEXTS_TMP", tmpdir),
+ dest="tmp_folder",
+ )
+
parser.add_argument(
"--debug", help="Enable verbose output", action="store_true", default=False
)
@@ -191,15 +199,35 @@ def main() -> None:
default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"),
)
+ parser.add_argument(
+ "--keep-cache",
+ help="Keep cache of website responses",
+ action="store_true",
+ default=False,
+ )
+
args = parser.parse_args()
logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
+ output_folder = Path(args.output_folder)
+ output_folder.mkdir(exist_ok=True)
+ validate_zimfile_creatable(output_folder, "test.txt")
+
+ tmp_folder = Path(args.tmp_folder)
+ tmp_folder.mkdir(exist_ok=True)
+ validate_zimfile_creatable(tmp_folder, "test.txt")
+
try:
zim_config = ZimConfig.of(args)
doc_filter = ContentFilter.of(args)
+
+ cache_folder = tmp_folder / "cache"
+ cache_folder.mkdir()
+
libretexts_client = LibreTextsClient(
library_slug=args.library_slug,
+ cache_folder=cache_folder,
)
Processor(
@@ -217,7 +245,3 @@ def main() -> None:
logger.exception(exc)
logger.error(f"Generation failed with the following error: {exc}")
raise SystemExit(1) from exc
-
-
-if __name__ == "__main__":
- main()
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 90bacb6..146259f 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -9,6 +9,7 @@
)
from zimscraperlib.image import resize_image
from zimscraperlib.zim import Creator
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData
from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
@@ -117,8 +118,6 @@ def __init__(
self.zimui_dist = zimui_dist
self.overwrite_existing_zim = overwrite_existing_zim
- self.output_folder.mkdir(exist_ok=True)
-
self.zim_illustration_path = self.libretexts_newsite_path(
"header_logo_mini.png"
)
@@ -145,11 +144,17 @@ def run(self) -> Path:
name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
)
formatted_config = self.zim_config.format(metadata.placeholders())
- zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
+ zim_file_name = f"{formatted_config.file_name_format}.zim"
+ zim_path = self.output_folder / zim_file_name
+
+ if zim_path.exists():
+ if self.overwrite_existing_zim:
+ zim_path.unlink()
+ else:
+ logger.error(f" {zim_path} already exists, aborting.")
+ raise SystemExit(2)
- if zim_path.exists() and not self.overwrite_existing_zim:
- logger.error(f" {zim_path} already exists, aborting.")
- raise SystemExit(2)
+ validate_zimfile_creatable(self.output_folder, zim_file_name)
logger.info(f" Writing to: {zim_path}")
diff --git a/scraper/tests-integration/README.md b/scraper/tests-integration/README.md
new file mode 100644
index 0000000..639565c
--- /dev/null
+++ b/scraper/tests-integration/README.md
@@ -0,0 +1,6 @@
+This folder contains integration tests, testing how the scraper behaves:
+
+- with a real libretexts website
+- from end-to-end
+
+They are targeted at being run from the scraper Docker image from GitHub workflow(s).
diff --git a/scraper/tests-integration/conftest.py b/scraper/tests-integration/conftest.py
index 98d237a..250300d 100644
--- a/scraper/tests-integration/conftest.py
+++ b/scraper/tests-integration/conftest.py
@@ -1,3 +1,8 @@
+import tempfile
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+
import pytest
@@ -6,6 +11,12 @@ def libretexts_slug() -> str:
return "geo"
+@pytest.fixture(scope="module")
+def cache_folder() -> Generator[Path, Any, Any]:
+ with tempfile.TemporaryDirectory() as tmpdir:
+ yield Path(tmpdir)
+
+
@pytest.fixture(scope="module")
def libretexts_url(libretexts_slug: str) -> str:
return f"https://{libretexts_slug}.libretexts.org"
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index 2e4889c..5cf9ebf 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -1,4 +1,5 @@
import io
+from pathlib import Path
import pytest
from zimscraperlib.download import (
@@ -10,8 +11,8 @@
@pytest.fixture(scope="module")
-def client(libretexts_slug: str) -> LibreTextsClient:
- return LibreTextsClient(library_slug=libretexts_slug)
+def client(libretexts_slug: str, cache_folder: Path) -> LibreTextsClient:
+ return LibreTextsClient(library_slug=libretexts_slug, cache_folder=cache_folder)
@pytest.fixture(scope="module")
@@ -19,6 +20,43 @@ def home(client: LibreTextsClient) -> LibreTextsHome:
return client.get_home()
+@pytest.fixture(scope="module")
+def deki_token(client: LibreTextsClient) -> str:
+ return client.get_deki_token()
+
+
+@pytest.fixture(scope="module")
+def minimum_number_of_pages() -> int:
+ return 8000
+
+
+@pytest.fixture(scope="module")
+def root_page_id() -> str:
+ return "34"
+
+
+def test_get_deki_token(deki_token: str):
+ """Ensures we achieve to get a deki_token"""
+ assert deki_token
+
+
+def test_get_all_pages_ids(
+ client: LibreTextsClient,
+ minimum_number_of_pages: int,
+ deki_token: str, # noqa: ARG001
+):
+ pages_ids = client.get_all_pages_ids()
+ assert len(pages_ids) > minimum_number_of_pages
+
+
+def test_get_root_page_id(
+ client: LibreTextsClient,
+ root_page_id: str,
+ deki_token: str, # noqa: ARG001
+):
+ assert client.get_root_page_id() == root_page_id
+
+
def test_get_home_image_url(home: LibreTextsHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"