Skip to content

Commit

Permalink
Retrieve list of page IDs and root of the tree from API, and introduc…
Browse files Browse the repository at this point in the history
…e caching
  • Loading branch information
benoit74 committed Oct 1, 2024
1 parent 49e1090 commit a04e45e
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 19 deletions.
140 changes: 134 additions & 6 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import datetime
import json
import re
from collections.abc import Callable
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString
Expand Down Expand Up @@ -57,49 +60,154 @@ def placeholders(
class LibreTextsClient:
    """Utility functions to read data from libretexts."""

    def __init__(self, library_slug: str, cache_folder: Path) -> None:
        """Initializes LibreTextsClient.

        Parameters:
            library_slug: Subdomain identifying the Libretext library, e.g. `geo`
              for `https://geo.libretexts.org`.
            cache_folder: Folder where HTTP and API responses are cached.
        """
        self.library_slug = library_slug
        # Filled lazily by get_home() / get_deki_token()
        self.deki_token: str | None = None
        self.cache_folder = cache_folder

    @property
    def library_url(self) -> str:
        """Base URL of the library, without trailing slash"""
        return f"https://{self.library_slug}.libretexts.org"

    @property
    def api_url(self) -> str:
        """Base URL of the library API, without trailing slash"""
        return f"{self.library_url}/@api/deki"

    def _get_cache_file(self, url_subpath_and_query: str) -> Path:
        """Get location where HTTP result should be cached"""
        if url_subpath_and_query.startswith("/"):
            url_subpath_and_query = url_subpath_and_query[1:]
        if url_subpath_and_query.endswith("/"):
            # a path ending with a slash would clash with the folder of its children
            url_subpath_and_query += "index"
        return self.cache_folder / url_subpath_and_query

    def _get_text(self, url_subpath_and_query: str) -> str:
        """Perform a GET request and return the response as decoded text.

        The response is served from the cache folder when already present there,
        and stored in it otherwise.
        """
        cache_file = self._get_cache_file(f"text{url_subpath_and_query}")
        if cache_file.exists():
            return cache_file.read_text(encoding="utf-8")
        cache_file.parent.mkdir(parents=True, exist_ok=True)

        full_url = f"{self.library_url}{url_subpath_and_query}"
        logger.debug(f"Fetching {full_url}")

        resp = requests.get(
            url=full_url,
            allow_redirects=True,
            timeout=HTTP_TIMEOUT_SECONDS,
        )
        resp.raise_for_status()

        # explicit encoding so that cache does not depend on platform default
        cache_file.write_text(resp.text, encoding="utf-8")
        return resp.text

    def _get_api_resp(
        self, api_sub_path_and_query: str, timeout: float
    ) -> requests.Response:
        """Call the website API and return the raw HTTP response.

        Raises an HTTP error (via `raise_for_status`) on 4xx / 5xx responses.
        """
        api_url = f"{self.api_url}{api_sub_path_and_query}"
        logger.debug(f"Calling API at {api_url}")
        resp = requests.get(
            url=api_url,
            # go through get_deki_token() so that the token is fetched on first use
            # instead of sending an empty token when get_home() was not called yet
            headers={"x-deki-token": self.get_deki_token()},
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp

    def _get_api_json(
        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
    ) -> Any:
        """Call the website API and return the decoded JSON response.

        The decoded response is cached as JSON in the cache folder.
        """
        cache_file = self._get_cache_file(f"api_json{api_sub_path}")
        if cache_file.exists():
            return json.loads(cache_file.read_text(encoding="utf-8"))
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        resp = self._get_api_resp(
            f"{api_sub_path}?dream.out.format=json", timeout=timeout
        )
        result = resp.json()
        cache_file.write_text(json.dumps(result), encoding="utf-8")
        return result

    def _get_api_content(
        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
    ) -> bytes | Any:
        """Call the website API and return the raw response body.

        The body is cached as-is in the cache folder.
        """
        cache_file = self._get_cache_file(f"api_content{api_sub_path}")
        if cache_file.exists():
            # cache holds the raw bytes written below, not JSON: return them
            # unchanged so that cached and fresh calls give the same result
            return cache_file.read_bytes()
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        resp = self._get_api_resp(api_sub_path, timeout=timeout)
        result = resp.content
        cache_file.write_bytes(result)
        return result

    def get_home(self) -> LibreTextsHome:
        """Retrieves data about home page by crawling home page"""
        home_content = self._get_text("/")

        soup = _get_soup(home_content)
        self.deki_token = _get_deki_token_from_home(soup)
        return LibreTextsHome(
            welcome_text_paragraphs=_get_welcome_text_from_home(soup),
            welcome_image_url=_get_welcome_image_url_from_home(soup),
            shelves=[],
        )

    def get_deki_token(self) -> str:
        """Retrieves the API token to use to query the website API"""
        if self.deki_token:
            return self.deki_token

        home_content = self._get_text("/")

        soup = _get_soup(home_content)
        self.deki_token = _get_deki_token_from_home(soup)
        return self.deki_token

    def _get_page_tree(self) -> Any:
        """Returns the tree of all pages, as returned by the API"""
        return self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)

    def get_all_pages_ids(self) -> list[str]:
        """Returns the IDs of all pages on current website, exploring the whole tree"""

        tree = self._get_page_tree()

        page_ids: list[str] = []

        def _get_page_ids(page_node: Any) -> None:
            page_ids.append(page_node["@id"])
            if not page_node["subpages"]:
                return
            subpages = page_node["subpages"]["page"]
            # "page" is a single node when there is only one subpage, and a list
            # of nodes otherwise
            if isinstance(subpages, dict):
                _get_page_ids(subpages)
            else:
                for page in subpages:
                    _get_page_ids(page)

        _get_page_ids(tree["page"])

        return page_ids

    def get_root_page_id(self) -> str:
        """Returns the ID the root of the tree of pages"""

        tree = self._get_page_tree()
        return tree["page"]["@id"]

Check warning on line 198 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L197-L198

Added lines #L197 - L198 were not covered by tests


def _get_soup(content: str) -> BeautifulSoup:
    """Parse textual content into a BeautifulSoup soup

    Every caller goes through this helper so that the same parser is used in the
    whole codebase.
    """
    parser_name = "html.parser"
    return BeautifulSoup(content, features=parser_name)


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")

Check warning on line 211 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L211

Added line #L211 was not covered by tests
if not branding_div:
raise LibreTextsParsingError("<div> with class 'LTBranding' not found")
Expand All @@ -119,6 +227,7 @@ def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:


def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
"""Returns the text found on home page"""
content_section = soup.find("section", class_="mt-content-container")
if not content_section or isinstance(content_section, NavigableString):
raise LibreTextsParsingError(

Check warning on line 233 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L233

Added line #L233 was not covered by tests
Expand All @@ -133,3 +242,22 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
if paragraph_text := paragraph.text:
welcome_text.append(paragraph_text)
return welcome_text


def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
    """Returns the API token found in the global settings of home page

    The token is read from the `apiToken` key of the JSON held by the
    `mt-global-settings` script tag.

    Raises LibreTextsParsingError (like other home page parsers of this module)
    when the script tag or the token is missing.
    """
    global_settings = soup.find("script", id="mt-global-settings")
    if not global_settings:
        logger.debug("home content:")
        logger.debug(soup)
        raise LibreTextsParsingError(
            "Failed to retrieve API token to query website API, missing "
            "mt-global-settings script"
        )
    x_deki_token = json.loads(global_settings.text).get("apiToken", None)
    if not x_deki_token:
        logger.debug("mt-global-settings script content:")
        logger.debug(global_settings.text)
        raise LibreTextsParsingError(
            "Failed to retrieve API token to query website API, missing apiToken."
        )
    return x_deki_token

Check warning on line 263 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L263

Added line #L263 was not covered by tests
36 changes: 33 additions & 3 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import argparse
import logging
import os
from pathlib import Path

from zimscraperlib.zim.filesystem import validate_zimfile_creatable

from libretexts2zim.client import LibreTextsClient
from libretexts2zim.constants import (
Expand Down Expand Up @@ -46,11 +49,18 @@ def main() -> None:

parser.add_argument(

Check warning on line 50 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L50

Added line #L50 was not covered by tests
"--output",
help="Output folder for ZIMs. Default: /output",
default="/output",
help="Output folder for ZIMs. Default: output",
default=os.getenv("LIBRETEXTS_OUTPUT", "output"),
dest="output_folder",
)

parser.add_argument(

Check warning on line 57 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L57

Added line #L57 was not covered by tests
"--tmp",
help="Temporary folder for cache, intermediate files, ... Default: tmp",
default=os.getenv("LIBRETEXTS_TMP", "tmp"),
dest="tmp_folder",
)

parser.add_argument(

Check warning on line 64 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L64

Added line #L64 was not covered by tests
"--zimui-dist",
type=str,
Expand Down Expand Up @@ -84,21 +94,41 @@ def main() -> None:
required=True,
)

parser.add_argument(

Check warning on line 97 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L97

Added line #L97 was not covered by tests
"--keep-cache",
help="Keep cache of website responses",
action="store_true",
default=False,
)

args = parser.parse_args()

Check warning on line 104 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L104

Added line #L104 was not covered by tests

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)

Check warning on line 106 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L106

Added line #L106 was not covered by tests

output_folder = Path(args.output_folder)
output_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(output_folder, "test.txt")

Check warning on line 110 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L108-L110

Added lines #L108 - L110 were not covered by tests

tmp_folder = Path(args.tmp_folder)
tmp_folder.mkdir(exist_ok=True)
validate_zimfile_creatable(tmp_folder, "test.txt")

Check warning on line 114 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L112-L114

Added lines #L112 - L114 were not covered by tests

try:
zim_config = ZimConfig.of(args)
doc_filter = ContentFilter.of(args)

Check warning on line 118 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L116-L118

Added lines #L116 - L118 were not covered by tests

cache_folder = tmp_folder / "cache"
cache_folder.mkdir()

Check warning on line 121 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L120-L121

Added lines #L120 - L121 were not covered by tests

libretexts_client = LibreTextsClient(

Check warning on line 123 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L123

Added line #L123 was not covered by tests
library_slug=args.library_slug,
cache_folder=cache_folder,
)

Generator(

Check warning on line 128 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L128

Added line #L128 was not covered by tests
libretexts_client=libretexts_client,
zim_config=zim_config,
output_folder=args.output_folder,
output_folder=output_folder,
zimui_dist=args.zimui_dist,
content_filter=doc_filter,
overwrite_existing_zim=args.overwrite,
Expand Down
20 changes: 12 additions & 8 deletions scraper/src/libretexts2zim/generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import argparse
import datetime
import os
import re
from io import BytesIO
from pathlib import Path
Expand All @@ -11,6 +10,7 @@
)
from zimscraperlib.image import resize_image
from zimscraperlib.zim import Creator
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
Expand Down Expand Up @@ -107,7 +107,7 @@ def __init__(
libretexts_client: LibreTextsClient,
zim_config: ZimConfig,
content_filter: ContentFilter,
output_folder: str,
output_folder: Path,
zimui_dist: str,
*,
overwrite_existing_zim: bool,
Expand All @@ -129,8 +129,6 @@ def __init__(
self.zimui_dist = Path(zimui_dist)
self.overwrite_existing_zim = overwrite_existing_zim

Check warning on line 130 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L125-L130

Added lines #L125 - L130 were not covered by tests

os.makedirs(self.output_folder, exist_ok=True)

self.zim_illustration_path = self.libretexts_newsite_path(

Check warning on line 132 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L132

Added line #L132 was not covered by tests
"header_logo_mini.png"
)
Expand All @@ -157,11 +155,17 @@ def run(self) -> Path:
name=self.zim_config.library_name, slug=self.libretexts_client.library_slug
)
formatted_config = self.zim_config.format(metadata.placeholders())
zim_path = Path(self.output_folder, f"{formatted_config.file_name_format}.zim")
zim_file_name = f"{formatted_config.file_name_format}.zim"
zim_path = self.output_folder / zim_file_name

Check warning on line 159 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L157-L159

Added lines #L157 - L159 were not covered by tests

if zim_path.exists():
if self.overwrite_existing_zim:
zim_path.unlink()

Check warning on line 163 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L163

Added line #L163 was not covered by tests
else:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(f"ZIM file already exists at {zim_path}")

Check warning on line 166 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L165-L166

Added lines #L165 - L166 were not covered by tests

if zim_path.exists() and not self.overwrite_existing_zim:
logger.error(f" {zim_path} already exists, aborting.")
raise SystemExit(f"ZIM file already exists at {zim_path}")
validate_zimfile_creatable(self.output_folder, zim_file_name)

Check warning on line 168 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L168

Added line #L168 was not covered by tests

logger.info(f" Writing to: {zim_path}")

Check warning on line 170 in scraper/src/libretexts2zim/generator.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/generator.py#L170

Added line #L170 was not covered by tests

Expand Down
11 changes: 11 additions & 0 deletions scraper/tests-integration/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import tempfile
from collections.abc import Generator
from pathlib import Path
from typing import Any

import pytest


Expand All @@ -6,6 +11,12 @@ def libretexts_slug() -> str:
return "geo"


@pytest.fixture(scope="module")
def cache_folder() -> Generator[Path, Any, Any]:
    """Yield a temporary folder, cleaned up once the fixture is torn down."""
    tmp_dir = tempfile.TemporaryDirectory()
    try:
        yield Path(tmp_dir.name)
    finally:
        tmp_dir.cleanup()


@pytest.fixture(scope="module")
def libretexts_url(libretexts_slug: str) -> str:
    """Return the base URL (no trailing slash) built from the library slug."""
    base_url = f"https://{libretexts_slug}.libretexts.org"
    return base_url
Expand Down
Loading

0 comments on commit a04e45e

Please sign in to comment.