Skip to content

Commit

Permalink
fixup! Retrieve list of page IDs and root of the tree from API, and introduce caching
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 3, 2024
1 parent aca4c7e commit 2def419
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 22 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
# Copy zimui build output
COPY --from=zimui /src/dist /src/zimui

ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui LIBRETEXTS_OUTPUT=/output LIBRETEXTS_TMP=/tmp
ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
LIBRETEXTS_OUTPUT=/output \
LIBRETEXTS_TMP=/tmp

CMD ["libretexts2zim", "--help"]
1 change: 0 additions & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ dev = [

[project.scripts]
libretexts2zim = "libretexts2zim.__main__:main"
libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main"

[tool.hatch.version]
path = "src/libretexts2zim/__about__.py"
Expand Down
13 changes: 8 additions & 5 deletions scraper/src/libretexts2zim/__main__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#!/usr/bin/env python3
# vim: ai ts=4 sts=4 et sw=4 nu
import tempfile

Check warning on line 1 in scraper/src/libretexts2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/__main__.py#L1

Added line #L1 was not covered by tests

import sys
from libretexts2zim.entrypoint import main as entrypoint

Check warning on line 3 in scraper/src/libretexts2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/__main__.py#L3

Added line #L3 was not covered by tests


def main():

Check warning on line 6 in scraper/src/libretexts2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/__main__.py#L6

Added line #L6 was not covered by tests
with tempfile.TemporaryDirectory() as tmpdir:
entrypoint(tmpdir)

Check warning on line 8 in scraper/src/libretexts2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/__main__.py#L8

Added line #L8 was not covered by tests

from libretexts2zim.entrypoint import main

if __name__ == "__main__":
sys.exit(main())
main()
19 changes: 10 additions & 9 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

from libretexts2zim.constants import logger

HTTP_TIMEOUT_SECONDS = 15
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30


class LibreTextsParsingError(Exception):
Expand Down Expand Up @@ -74,8 +75,7 @@ def api_url(self) -> str:

def _get_cache_file(self, url_subpath_and_query: str) -> Path:
"""Get location where HTTP result should be cached"""
if url_subpath_and_query.startswith("/"):
url_subpath_and_query = url_subpath_and_query[1:]
url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)

Check warning on line 78 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L78

Added line #L78 was not covered by tests
if url_subpath_and_query.endswith("/"):
url_subpath_and_query += "index"
return self.cache_folder / url_subpath_and_query

Check warning on line 81 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L80-L81

Added lines #L80 - L81 were not covered by tests
Expand All @@ -94,7 +94,7 @@ def _get_text(self, url_subpath_and_query: str) -> str:
resp = requests.get(
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_SECONDS,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
)
resp.raise_for_status()

Expand All @@ -115,7 +115,7 @@ def _get_api_resp(
return resp

Check warning on line 115 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L114-L115

Added lines #L114 - L115 were not covered by tests

def _get_api_json(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}")

Check warning on line 120 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L120

Added line #L120 was not covered by tests
if cache_file.exists():
Expand All @@ -129,11 +129,11 @@ def _get_api_json(
return result

Check warning on line 129 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L127-L129

Added lines #L127 - L129 were not covered by tests

def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
) -> bytes | Any:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")

Check warning on line 134 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L134

Added line #L134 was not covered by tests
if cache_file.exists():
return json.loads(cache_file.read_text())
return cache_file.read_bytes()
cache_file.parent.mkdir(parents=True, exist_ok=True)
resp = self._get_api_resp(api_sub_path, timeout=timeout)
result = resp.content
Expand Down Expand Up @@ -165,7 +165,7 @@ def get_deki_token(self) -> str:
def get_all_pages_ids(self):
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

Check warning on line 168 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L168

Added line #L168 was not covered by tests

page_ids: list[str] = []

Check warning on line 170 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L170

Added line #L170 was not covered by tests

Expand All @@ -186,7 +186,7 @@ def _get_page_ids(page_node: Any) -> None:
def get_root_page_id(self) -> str:
"""Returns the ID the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
return tree["page"]["@id"]

Check warning on line 190 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L189-L190

Added lines #L189 - L190 were not covered by tests


Expand All @@ -197,6 +197,7 @@ def _get_soup(content: str) -> BeautifulSoup:
"""
return BeautifulSoup(content, "lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
"""Return the URL of the image found on home header"""
branding_div = soup.find("div", class_="LTBranding")
Expand Down
8 changes: 2 additions & 6 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
)


def main() -> None:
def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
prog=NAME,
)
Expand Down Expand Up @@ -181,7 +181,7 @@ def main() -> None:
parser.add_argument(

Check warning on line 181 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L181

Added line #L181 was not covered by tests
"--tmp",
help="Temporary folder for cache, intermediate files, ... Default: tmp",
default=os.getenv("LIBRETEXTS_TMP", "tmp"),
default=os.getenv("LIBRETEXTS_TMP", tmpdir),
dest="tmp_folder",
)

Expand Down Expand Up @@ -245,7 +245,3 @@ def main() -> None:
logger.exception(exc)
logger.error(f"Generation failed with the following error: {exc}")
raise SystemExit(1) from exc


if __name__ == "__main__":
main()

0 comments on commit 2def419

Please sign in to comment.