diff --git a/Dockerfile b/Dockerfile
index df7a428..0713679 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,6 +44,8 @@ RUN pip install --no-cache-dir /src/scraper \
 
 # Copy zimui build output
 COPY --from=zimui /src/dist /src/zimui
-ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui LIBRETEXTS_OUTPUT=/output LIBRETEXTS_TMP=/tmp
+ENV LIBRETEXTS_ZIMUI_DIST=/src/zimui \
+    LIBRETEXTS_OUTPUT=/output \
+    LIBRETEXTS_TMP=/tmp
 
 CMD ["libretexts2zim", "--help"]
diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml
index b557333..fdbd0de 100644
--- a/scraper/pyproject.toml
+++ b/scraper/pyproject.toml
@@ -47,7 +47,6 @@ dev = [
 
 [project.scripts]
 libretexts2zim = "libretexts2zim.__main__:main"
-libretexts2zim-playlists = "libretexts2zim.playlists.__main__:main"
 
 [tool.hatch.version]
 path = "src/libretexts2zim/__about__.py"
diff --git a/scraper/src/libretexts2zim/__main__.py b/scraper/src/libretexts2zim/__main__.py
index 9ee85b7..56d7e47 100644
--- a/scraper/src/libretexts2zim/__main__.py
+++ b/scraper/src/libretexts2zim/__main__.py
@@ -1,9 +1,12 @@
-#!/usr/bin/env python3
-# vim: ai ts=4 sts=4 et sw=4 nu
+import tempfile
 
-import sys
+from libretexts2zim.entrypoint import main as entrypoint
+
+
+def main():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        entrypoint(tmpdir)
 
-from libretexts2zim.entrypoint import main
 
 if __name__ == "__main__":
-    sys.exit(main())
+    main()
diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index c80cf0c..794755d 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -11,7 +11,8 @@
 
 from libretexts2zim.constants import logger
 
-HTTP_TIMEOUT_SECONDS = 15
+HTTP_TIMEOUT_NORMAL_SECONDS = 15
+HTTP_TIMEOUT_LONG_SECONDS = 30
 
 
 class LibreTextsParsingError(Exception):
@@ -74,8 +75,7 @@ def api_url(self) -> str:
 
     def _get_cache_file(self, url_subpath_and_query: str) -> Path:
         """Get location where HTTP result should be cached"""
-        if url_subpath_and_query.startswith("/"):
-            url_subpath_and_query = url_subpath_and_query[1:]
+        url_subpath_and_query = url_subpath_and_query.removeprefix("/")
         if url_subpath_and_query.endswith("/"):
             url_subpath_and_query += "index"
         return self.cache_folder / url_subpath_and_query
@@ -94,7 +94,7 @@ def _get_text(self, url_subpath_and_query: str) -> str:
 
         resp = requests.get(
             url=full_url,
             allow_redirects=True,
-            timeout=HTTP_TIMEOUT_SECONDS,
+            timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
         )
         resp.raise_for_status()
@@ -115,7 +115,7 @@ def _get_api_resp(
         return resp
 
     def _get_api_json(
-        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
     ) -> Any:
         cache_file = self._get_cache_file(f"api_json{api_sub_path}")
         if cache_file.exists():
@@ -129,11 +129,11 @@ def _get_api_json(
         return result
 
     def _get_api_content(
-        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_SECONDS
+        self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
     ) -> bytes | Any:
         cache_file = self._get_cache_file(f"api_content{api_sub_path}")
         if cache_file.exists():
-            return json.loads(cache_file.read_text())
+            return cache_file.read_bytes()
         cache_file.parent.mkdir(parents=True, exist_ok=True)
         resp = self._get_api_resp(api_sub_path, timeout=timeout)
         result = resp.content
@@ -165,7 +165,7 @@ def get_deki_token(self) -> str:
 
     def get_all_pages_ids(self):
         """Returns the IDs of all pages on current website, exploring the whole tree"""
 
-        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
 
         page_ids: list[str] = []
@@ -186,7 +186,7 @@ def _get_page_ids(page_node: Any) -> None:
 
     def get_root_page_id(self) -> str:
         """Returns the ID the root of the tree of pages"""
 
-        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_SECONDS * 2)
+        tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
 
         return tree["page"]["@id"]
@@ -197,6 +197,7 @@ def _get_soup(content: str) -> BeautifulSoup:
     """
     return BeautifulSoup(content, "lxml")
 
+
 def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
     """Return the URL of the image found on home header"""
     branding_div = soup.find("div", class_="LTBranding")
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 2417e1f..1d8ef52 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -138,7 +138,7 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     )
 
 
-def main() -> None:
+def main(tmpdir: str) -> None:
     parser = argparse.ArgumentParser(
         prog=NAME,
     )
@@ -181,7 +181,7 @@ def main() -> None:
 
     parser.add_argument(
         "--tmp",
         help="Temporary folder for cache, intermediate files, ... Default: tmp",
-        default=os.getenv("LIBRETEXTS_TMP", "tmp"),
+        default=os.getenv("LIBRETEXTS_TMP", tmpdir),
         dest="tmp_folder",
     )
@@ -245,7 +245,3 @@ def main() -> None:
         logger.exception(exc)
         logger.error(f"Generation failed with the following error: {exc}")
         raise SystemExit(1) from exc
-
-
-if __name__ == "__main__":
-    main()
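Reviewer note on the `__main__.py` change above: the new `main()` wraps the whole scraper run in `tempfile.TemporaryDirectory()` and hands that folder to `entrypoint()`, which uses it as the fallback for `--tmp` instead of the old hardcoded `tmp`. A minimal standalone sketch of that lifecycle (the `run()` helper is a hypothetical stand-in for `entrypoint()`, not part of this PR):

```python
import pathlib
import tempfile


def run(tmp_folder: str) -> None:
    # Hypothetical stand-in for entrypoint(): the real scraper writes its
    # cache and intermediate files under the configured tmp folder.
    cache = pathlib.Path(tmp_folder) / "cache" / "example.json"
    cache.parent.mkdir(parents=True, exist_ok=True)
    cache.write_text("{}")


with tempfile.TemporaryDirectory() as tmpdir:
    run(tmpdir)  # the folder exists for the duration of the with-block
    print(pathlib.Path(tmpdir).exists())  # True

print(pathlib.Path(tmpdir).exists())  # False: removed when the block exits
```

One behavioral consequence: unless the user passes `--tmp` or sets `LIBRETEXTS_TMP`, cached HTTP responses no longer persist between runs, since the default folder is deleted when the process exits.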