
Merge pull request #4 from openzim/add_api_and_caching
Add API and caching
benoit74 authored Oct 3, 2024
2 parents 8efbbe5 + 40cb342 commit e7a408c
Showing 10 changed files with 255 additions and 28 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
 # Copy zimui build output
 COPY --from=zimui /src/dist /src/zimui
 
-ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui
+ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
+    LIBRETEXTS_OUTPUT=/output \
+    LIBRETEXTS_TMP=/tmp
 
 CMD ["libretexts2zim", "--help"]
11 changes: 11 additions & 0 deletions codecov.yml
@@ -0,0 +1,11 @@
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+    patch:
+      default:
+        informational: true
+    changes:
+      default:
+        informational: true
1 change: 0 additions & 1 deletion scraper/pyproject.toml
@@ -47,7 +47,6 @@ dev = [
 
 [project.scripts]
 libretexts2zim = "libretexts2zim.__main__:main"
-libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main"
 
 [tool.hatch.version]
 path = "src/libretexts2zim/__about__.py"
13 changes: 8 additions & 5 deletions scraper/src/libretexts2zim/__main__.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 # vim: ai ts=4 sts=4 et sw=4 nu
+import tempfile
 
-import sys
+from libretexts2zim.entrypoint import main as entrypoint
+
+
+def main():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        entrypoint(tmpdir)
 
-from libretexts2zim.entrypoint import main
 
 if __name__ == "__main__":
-    sys.exit(main())
+    main()
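
With this change the scraper always runs inside a managed temporary directory: unless `--tmp` is given, the cache lives in a `tempfile.TemporaryDirectory` that is removed when `entrypoint` returns. A minimal sketch of that lifecycle, with a stand-in for the real entrypoint:

import tempfile
from pathlib import Path


def entrypoint_stub(tmpdir: str) -> None:
    # stands in for libretexts2zim.entrypoint.main
    cache = Path(tmpdir) / "cache"
    cache.mkdir()
    (cache / "index").write_text("cached response")


with tempfile.TemporaryDirectory() as tmpdir:
    entrypoint_stub(tmpdir)
    assert (Path(tmpdir) / "cache" / "index").exists()
# once the with-block exits, tmpdir and the cache under it are gone

This is also why the `--keep-cache` flag added in entrypoint.py only has a lasting effect when `--tmp` points at a persistent folder.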
144 changes: 136 additions & 8 deletions scraper/src/libretexts2zim/client.py
@@ -1,14 +1,18 @@
 import datetime
+import json
+import re
 from collections.abc import Callable
 from pathlib import Path
+from typing import Any
 
 import requests
 from bs4 import BeautifulSoup, NavigableString
 from pydantic import BaseModel
 
 from libretexts2zim.constants import logger
 
-HTTP_TIMEOUT_SECONDS = 15
+HTTP_TIMEOUT_NORMAL_SECONDS = 15
+HTTP_TIMEOUT_LONG_SECONDS = 30
 
 
 class LibreTextsParsingError(Exception):
@@ -50,48 +54,152 @@ def placeholders(
 class LibreTextsClient:
     """Utility functions to read data from libretexts."""
 
-    def __init__(self, library_slug: str) -> None:
+    def __init__(self, library_slug: str, cache_folder: Path) -> None:
         """Initializes LibreTextsClient.
 
         Parameters:
             library_url: Scheme, hostname, and port for the Libretext library
                 e.g. `https://geo.libretexts.org/`.
         """
         self.library_slug = library_slug
+        self.deki_token = None
+        self.cache_folder = cache_folder
 
     @property
     def library_url(self) -> str:
-        return f"https://{self.library_slug}.libretexts.org/"
+        return f"https://{self.library_slug}.libretexts.org"
 
-    def _get_text(self, url: str) -> str:
+    @property
+    def api_url(self) -> str:
+        return f"{self.library_url}/@api/deki"
+
+    def _get_cache_file(self, url_subpath_and_query: str) -> Path:
+        """Get location where HTTP result should be cached"""
+        url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)
+        if url_subpath_and_query.endswith("/"):
+            url_subpath_and_query += "index"
+        return self.cache_folder / url_subpath_and_query
+
+    def _get_text(self, url_subpath_and_query: str) -> str:
         """Perform a GET request and return the response as decoded text."""
 
-        logger.debug(f"Fetching {url}")
+        cache_file = self._get_cache_file(f"text{url_subpath_and_query}")
+        if cache_file.exists():
+            return cache_file.read_text()
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+        full_url = f"{self.library_url}{url_subpath_and_query}"
+        logger.debug(f"Fetching {full_url}")
 
         resp = requests.get(
-            url=url,
+            url=full_url,
             allow_redirects=True,
-            timeout=HTTP_TIMEOUT_SECONDS,
+            timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
         )
         resp.raise_for_status()
+
+        cache_file.write_text(resp.text)
         return resp.text
 
+    def _get_api_resp(
+        self, api_sub_path_and_query: str, timeout: float
+    ) -> requests.Response:
+        api_url = f"{self.api_url}{api_sub_path_and_query}"
+        logger.debug(f"Calling API at {api_url}")
+        resp = requests.get(
+            url=api_url,
+            headers={"x-deki-token": self.deki_token},
+            timeout=timeout,
+        )
+        resp.raise_for_status()
+        return resp
+
+    def _get_api_json(
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
+    ) -> Any:
+        cache_file = self._get_cache_file(f"api_json{api_sub_path}")
+        if cache_file.exists():
+            return json.loads(cache_file.read_text())
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+        resp = self._get_api_resp(
+            f"{api_sub_path}?dream.out.format=json", timeout=timeout
+        )
+        result = resp.json()
+        cache_file.write_text(json.dumps(result))
+        return result
+
+    def _get_api_content(
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
+    ) -> bytes | Any:
+        cache_file = self._get_cache_file(f"api_content{api_sub_path}")
+        if cache_file.exists():
+            return cache_file.read_bytes()
+        cache_file.parent.mkdir(parents=True, exist_ok=True)
+        resp = self._get_api_resp(api_sub_path, timeout=timeout)
+        result = resp.content
+        cache_file.write_bytes(result)
+        return result
+
     def get_home(self) -> LibreTextsHome:
-        home_content = self._get_text(self.library_url)
+        """Retrieves data about home page by crawling home page"""
+        home_content = self._get_text("/")
+
         soup = _get_soup(home_content)
+        self.deki_token = _get_deki_token_from_home(soup)
         return LibreTextsHome(
             welcome_text_paragraphs=_get_welcome_text_from_home(soup),
             welcome_image_url=_get_welcome_image_url_from_home(soup),
         )
 
+    def get_deki_token(self) -> str:
+        """Retrieves the API token to use to query the website API"""
+        if self.deki_token:
+            return self.deki_token
+
+        home_content = self._get_text("/")
+
+        soup = _get_soup(home_content)
+        self.deki_token = _get_deki_token_from_home(soup)
+        return self.deki_token
+
+    def get_all_pages_ids(self):
+        """Returns the IDs of all pages on current website, exploring the whole tree"""
+
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
+
+        page_ids: list[str] = []
+
+        def _get_page_ids(page_node: Any) -> None:
+            page_ids.append(page_node["@id"])
+            if not page_node["subpages"]:
+                return
+            if "@id" in page_node["subpages"]["page"]:
+                _get_page_ids(page_node["subpages"]["page"])
+            else:
+                for page in page_node["subpages"]["page"]:
+                    _get_page_ids(page)
+
+        _get_page_ids(tree["page"])
+
+        return page_ids
+
+    def get_root_page_id(self) -> str:
+        """Returns the ID of the root of the tree of pages"""
+
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
+        return tree["page"]["@id"]
 
 
 def _get_soup(content: str) -> BeautifulSoup:
     """Return a BeautifulSoup soup from textual content
 
     This is a utility function to ensure same parser is used in the whole codebase
     """
     return BeautifulSoup(content, "lxml")
 
 
 def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
     """Return the URL of the image found on home header"""
     branding_div = soup.find("div", class_="LTBranding")
     if not branding_div:
         raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
@@ -111,6 +219,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:


 def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
+    """Returns the text found on home page"""
     content_section = soup.find("section", class_="mt-content-container")
     if not content_section or isinstance(content_section, NavigableString):
         raise LibreTextsParsingError(
@@ -121,3 +230,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
         if paragraph_text := paragraph.text:
             welcome_text.append(paragraph_text)
     return welcome_text
+
+
+def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
+    global_settings = soup.find("script", id="mt-global-settings")
+    if not global_settings:
+        logger.debug("home content:")
+        logger.debug(soup)
+        raise Exception(
+            "Failed to retrieve API token to query website API, missing "
+            "mt-global-settings script"
+        )
+    x_deki_token = json.loads(global_settings.text).get("apiToken", None)
+    if not x_deki_token:
+        logger.debug("mt-global-settings script content:")
+        logger.debug(global_settings.text)
+        raise Exception(
+            "Failed to retrieve API token to query website API, missing apiToken."
+        )
+    return x_deki_token
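
Two details of the new client are easy to miss. `_get_cache_file` maps a request path to its on-disk key, with a trailing slash becoming `/index`: the home page fetched via `_get_text("/")` is cached at `<cache_folder>/text/index`, while the tree query lands at `<cache_folder>/api_json/pages/home/tree`. And the deki tree endpoint returns a single subpage as a JSON object but several subpages as a JSON array, which is why `_get_page_ids` checks for an `@id` key before deciding whether to recurse into one node or iterate over many. A standalone sketch of that traversal over made-up page IDs:

from typing import Any

# Made-up miniature of the `/pages/home/tree` payload: "subpages" holds a
# dict when a page has exactly one child, a list when it has several.
tree: dict[str, Any] = {
    "page": {
        "@id": "34",
        "subpages": {
            "page": [
                {"@id": "35", "subpages": {}},
                {"@id": "36", "subpages": {"page": {"@id": "37", "subpages": {}}}},
            ]
        },
    }
}


def collect_page_ids(page_node: Any, page_ids: list[str]) -> None:
    page_ids.append(page_node["@id"])
    if not page_node["subpages"]:
        return
    node = page_node["subpages"]["page"]
    # a dict exposes its "@id" key; a list of child dicts does not
    children = [node] if "@id" in node else node
    for child in children:
        collect_page_ids(child, page_ids)


page_ids: list[str] = []
collect_page_ids(tree["page"], page_ids)
print(page_ids)  # ['34', '35', '36', '37']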
34 changes: 29 additions & 5 deletions scraper/src/libretexts2zim/entrypoint.py
@@ -8,6 +8,7 @@
     MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
     RECOMMENDED_MAX_TITLE_LENGTH,
 )
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 
 from libretexts2zim.client import LibreTextsClient
 from libretexts2zim.constants import (
@@ -137,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     )
 
 
-def main() -> None:
+def main(tmpdir: str) -> None:
     parser = argparse.ArgumentParser(
         prog=NAME,
     )
@@ -177,6 +178,13 @@ def main() -> None:
         dest="output_folder",
     )
 
+    parser.add_argument(
+        "--tmp",
+        help="Temporary folder for cache, intermediate files, ... Default: tmp",
+        default=os.getenv("LIBRETEXTS_TMP", tmpdir),
+        dest="tmp_folder",
+    )
+
     parser.add_argument(
         "--debug", help="Enable verbose output", action="store_true", default=False
     )
@@ -191,15 +199,35 @@ def main() -> None:
         default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"),
     )
 
+    parser.add_argument(
+        "--keep-cache",
+        help="Keep cache of website responses",
+        action="store_true",
+        default=False,
+    )
+
     args = parser.parse_args()
 
     logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
 
+    output_folder = Path(args.output_folder)
+    output_folder.mkdir(exist_ok=True)
+    validate_zimfile_creatable(output_folder, "test.txt")
+
+    tmp_folder = Path(args.tmp_folder)
+    tmp_folder.mkdir(exist_ok=True)
+    validate_zimfile_creatable(tmp_folder, "test.txt")
+
     try:
         zim_config = ZimConfig.of(args)
         doc_filter = ContentFilter.of(args)
 
+        cache_folder = tmp_folder / "cache"
+        cache_folder.mkdir()
+
         libretexts_client = LibreTextsClient(
             library_slug=args.library_slug,
+            cache_folder=cache_folder,
         )
 
         Processor(
@@ -217,7 +245,3 @@ def main() -> None:
         logger.exception(exc)
         logger.error(f"Generation failed with the following error: {exc}")
         raise SystemExit(1) from exc
-
-
-if __name__ == "__main__":
-    main()
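
Both target folders are now created and checked up front, so a run with an unwritable `--output` or `--tmp` fails immediately rather than after hours of fetching. A sketch of the fail-fast pattern, with a stand-in for zimscraperlib's `validate_zimfile_creatable` (assumed behaviour: raise if a probe file cannot be created in the folder):

from pathlib import Path


def validate_creatable(folder: Path, name: str) -> None:
    # stand-in for zimscraperlib.zim.filesystem.validate_zimfile_creatable
    probe = folder / name
    probe.touch()  # raises OSError on read-only or missing mounts
    probe.unlink()


for folder in (Path("output"), Path("tmp")):
    folder.mkdir(exist_ok=True)
    validate_creatable(folder, "test.txt")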
17 changes: 11 additions & 6 deletions scraper/src/libretexts2zim/processor.py
@@ -9,6 +9,7 @@
 )
 from zimscraperlib.image import resize_image
 from zimscraperlib.zim import Creator
+from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 from zimscraperlib.zim.indexing import IndexData
 
 from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
@@ -117,8 +118,6 @@ def __init__(
         self.zimui_dist = zimui_dist
         self.overwrite_existing_zim = overwrite_existing_zim
 
-        self.output_folder.mkdir(exist_ok=True)
-
         self.zim_illustration_path = self.libretexts_newsite_path(
             "header_logo_mini.png"
         )
@@ -145,11 +144,17 @@ def run(self) -> Path:
             name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
         )
         formatted_config = self.zim_config.format(metadata.placeholders())
-        zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
+        zim_file_name = f"{formatted_config.file_name_format}.zim"
+        zim_path = self.output_folder / zim_file_name
 
+        if zim_path.exists():
+            if self.overwrite_existing_zim:
+                zim_path.unlink()
+            else:
+                logger.error(f" {zim_path} already exists, aborting.")
+                raise SystemExit(2)
+
-        if zim_path.exists() and not self.overwrite_existing_zim:
-            logger.error(f" {zim_path} already exists, aborting.")
-            raise SystemExit(2)
+        validate_zimfile_creatable(self.output_folder, zim_file_name)
 
         logger.info(f" Writing to: {zim_path}")
 
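run() now resolves the fate of the target ZIM before any content is fetched: with `--overwrite` an existing file is deleted, without it the run aborts with exit code 2, and the output location is re-validated either way. A condensed sketch of that guard (the probe stands in for `validate_zimfile_creatable`):

from pathlib import Path


def prepare_zim_path(output_folder: Path, zim_file_name: str, overwrite: bool) -> Path:
    """Sketch of the pre-flight guard run() now performs."""
    zim_path = output_folder / zim_file_name
    if zim_path.exists():
        if overwrite:
            zim_path.unlink()  # start fresh
        else:
            raise SystemExit(2)  # refuse to clobber an existing ZIM
    probe = output_folder / zim_file_name  # stand-in for validate_zimfile_creatable
    probe.touch()
    probe.unlink()
    return zim_path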
6 changes: 6 additions & 0 deletions scraper/tests-integration/README.md
@@ -0,0 +1,6 @@
+This folder contains integration tests, testing how the scraper behaves:
+
+- with a real libretexts website
+- from end-to-end
+
+They are targeted at being run from the scraper Docker image from GitHub workflow(s).
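
A hedged sketch of what such a test can look like, assuming pytest and using `geo` (the example slug from the client docstring) against the live site; the test name is illustrative:

import tempfile
from pathlib import Path

from libretexts2zim.client import LibreTextsClient


def test_root_page_id_is_reachable():
    with tempfile.TemporaryDirectory() as tmpdir:
        client = LibreTextsClient(library_slug="geo", cache_folder=Path(tmpdir))
        client.get_deki_token()  # scrapes the home page for the API token
        assert client.get_root_page_id()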