From 0120c2b8e7fdbb68a471d58c50fe54e5ac8ba41f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Tue, 1 Oct 2024 08:49:42 +0000
Subject: [PATCH 1/2] Add ability to retrieve and filter the whole page tree

---
 scraper/src/libretexts2zim/client.py     |  76 ++++++++++
 scraper/src/libretexts2zim/entrypoint.py |  26 +++-
 scraper/src/libretexts2zim/processor.py  |  88 +++++++++--
 scraper/tests-integration/test_client.py |  50 ++++++-
 scraper/tests/test_processor.py          | 182 +++++++++++++++++++++++
 5 files changed, 402 insertions(+), 20 deletions(-)
 create mode 100644 scraper/tests/test_processor.py

diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index 794755d..1689693 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -24,6 +24,49 @@ class LibreTextsHome(BaseModel):
     welcome_image_url: str


+class DekiPage(BaseModel):
+    id: str
+    title: str
+    parent: "DekiPage | None" = None
+    children: list["DekiPage"] = []
+
+    def __repr__(self) -> str:
+        return (
+            f"DekiPage(id='{self.id}', title='{self.title}', "
+            f"parent='{'None' if not self.parent else self.parent.id}', "
+            f"children='{','.join([child.id for child in self.children])}')"
+        )
+
+    @property
+    def self_and_parents(self) -> list["DekiPage"]:
+        result: list[DekiPage] = [self]
+        current = self
+        while current.parent is not None:
+            result.append(current.parent)
+            current = current.parent
+        return result
+
+
+class DekiTree(BaseModel):
+    root: DekiPage
+    pages: dict[str, DekiPage] = {}
+
+    def sub_tree(self, subroot_id: str) -> "DekiTree":
+        """Returns a sub-tree, starting at given page id"""
+        new_root = self.pages[subroot_id]
+        tree = DekiTree(root=new_root)
+        tree.pages[new_root.id] = new_root
+        children_to_explore = [*new_root.children]
+        while len(children_to_explore) > 0:
+            child = children_to_explore[0]
+            children_to_explore.remove(child)
+            if child.id in tree.pages:
+                continue  # safe-guard
+            tree.pages[child.id] = child
+            children_to_explore.extend(child.children)
+        return tree
+
+
 class LibreTextsMetadata(BaseModel):
     """Metadata about a library."""

@@ -189,6 +232,39 @@ def get_root_page_id(self) -> str:
         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
         return tree["page"]["@id"]

+    def get_page_tree(self) -> DekiTree:
+
+        tree_data = self._get_api_json(
+            "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
+        )
+
+        root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"])
+        tree_obj = DekiTree(root=root)
+        tree_obj.pages[root.id] = root
+
+        def _add_page(page_node: Any, parent: DekiPage) -> DekiPage:
+            page = DekiPage(
+                id=page_node["@id"], title=page_node["title"], parent=parent
+            )
+            parent.children.append(page)
+            tree_obj.pages[page.id] = page
+            return page
+
+        def _process_tree_data(page_node: Any, parent: DekiPage) -> None:
+            if not page_node["subpages"]:
+                return
+            if "@id" in page_node["subpages"]["page"]:
+                page = _add_page(page_node["subpages"]["page"], parent=parent)
+                _process_tree_data(page_node["subpages"]["page"], parent=page)
+            else:
+                for subpage_node in page_node["subpages"]["page"]:
+                    page = _add_page(subpage_node, parent=parent)
+                    _process_tree_data(subpage_node, parent=page)
+
+        _process_tree_data(tree_data["page"], parent=root)
+
+        return tree_obj
+

 def _get_soup(content: str) -> BeautifulSoup:
     """Return a BeautifulSoup soup from textual content
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 1d8ef52..04b2ac5 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -126,17 +126,33 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     """Adds flags related to content filtering to the given parser."""

     parser.add_argument(
-        "--shelves-include",
-        help="Includes only shelves matching the given regular expression.",
+        "--page-title-include",
+        help="Includes only pages with title matching the given regular "
+        "expression, and their parent pages for proper navigation. Can be combined"
+        " with --page-id-include (pages with matching title or id will be included"
+        ")",
         metavar="REGEX",
     )

     parser.add_argument(
-        "--shelves-exclude",
-        help="Excludes shelves matching the given regular expression.",
+        "--page-id-include",
+        help="CSV value of page ids to include. Parent pages will be included as "
+        "well for proper navigation. Can be combined with --page-title-include "
+        "(pages with matching title or id will be included)",
+    )
+
+    parser.add_argument(
+        "--page-title-exclude",
+        help="Excludes pages with title matching the given regular expression",
         metavar="REGEX",
     )

+    parser.add_argument(
+        "--root-page-id",
+        help="ID of the root page to include in ZIM. Only this page and its"
+        " subpages will be included in the ZIM",
+    )
+

 def main(tmpdir: str) -> None:
     parser = argparse.ArgumentParser(
@@ -223,7 +239,7 @@ def main(tmpdir: str) -> None:
     doc_filter = ContentFilter.of(args)

     cache_folder = tmp_folder / "cache"
-    cache_folder.mkdir()
+    cache_folder.mkdir(exist_ok=True)

     libretexts_client = LibreTextsClient(
         library_slug=args.library_slug,
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 146259f..51bacec 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -1,5 +1,6 @@
 import argparse
 import datetime
+import re
 from io import BytesIO
 from pathlib import Path

@@ -12,7 +13,12 @@
 from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 from zimscraperlib.zim.indexing import IndexData

-from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
+from libretexts2zim.client import (
+    DekiPage,
+    DekiTree,
+    LibreTextsClient,
+    LibreTextsMetadata,
+)
 from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
 from libretexts2zim.ui import ConfigModel, HomeModel, SharedModel
 from libretexts2zim.zimconfig import ZimConfig
@@ -33,23 +39,69 @@ class MissingDocumentError(Exception):
 class ContentFilter(BaseModel):
     """Supports filtering documents by user provided attributes."""

-    # If specified, only shelves matching the regex are included.
-    shelves_include: str | None
-    # If specified, shelves matching the regex are excluded.
-    shelves_exclude: str | None
+    # If specified, only pages with title matching the regex are included.
+    page_title_include: str | None
+    # If specified, only pages with matching ids are included.
+    page_id_include: str | None
+    # If specified, pages with title matching the regex are excluded.
+    page_title_exclude: str | None
+    # If specified, only this page and its subpages will be included.
+    root_page_id: str | None

     @staticmethod
     def of(namespace: argparse.Namespace) -> "ContentFilter":
         """Parses a namespace to create a new DocFilter."""
         return ContentFilter.model_validate(namespace, from_attributes=True)

-    # TODO: implement filtering of shelves based on configured regex
-    # def filter(self, shelves: list[LibretextsShelve]) -> list[LibretextsShelve]:
-    #     """Filters docs based on the user's choices."""
-    #     selected: list[LibretextsShelve] = []
-    #     for shelve in shelves:
-    #         ....
-    #     return selected
+    def filter(self, page_tree: DekiTree) -> list[DekiPage]:
+        """Filters pages based on the user's choices."""
+
+        if self.root_page_id:
+            page_tree = page_tree.sub_tree(self.root_page_id)
+
+        title_include_re = (
+            re.compile(self.page_title_include, re.IGNORECASE)
+            if self.page_title_include
+            else None
+        )
+        title_exclude_re = (
+            re.compile(self.page_title_exclude, re.IGNORECASE)
+            if self.page_title_exclude
+            else None
+        )
+        id_include = (
+            [page_id.strip() for page_id in self.page_id_include.split(",")]
+            if self.page_id_include
+            else None
+        )
+
+        def is_selected(
+            title_include_re: re.Pattern[str] | None,
+            title_exclude_re: re.Pattern[str] | None,
+            id_include: list[str] | None,
+            page: DekiPage,
+        ) -> bool:
+            return (
+                (
+                    not title_include_re
+                    or title_include_re.search(page.title) is not None
+                )
+                and (not id_include or page.id in id_include)
+                and (
+                    not title_exclude_re or title_exclude_re.search(page.title) is None
+                )
+            )
+
+        # Find selected pages and their parents, and create a set of unique ids
+        selected_ids = {
+            selected_page.id
+            for page in page_tree.pages.values()
+            for selected_page in page.self_and_parents
+            if is_selected(title_include_re, title_exclude_re, id_include, page)
+        }
+
+        # Then transform the set of ids into a list of pages
+        return [page for page in page_tree.pages.values() if page.id in selected_ids]


 def add_item_for(
@@ -113,7 +165,7 @@ def __init__(
         """
         self.libretexts_client = libretexts_client
         self.zim_config = zim_config
-        self.doc_filter = content_filter
+        self.content_filter = content_filter
         self.output_folder = output_folder
         self.zimui_dist = zimui_dist
         self.overwrite_existing_zim = overwrite_existing_zim
@@ -222,7 +274,7 @@ def run(self) -> Path:
             ).model_dump_json(by_alias=True),
         )

-        logger.info(f"Adding files in {self.zimui_dist}")
+        logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
         for file in self.zimui_dist.rglob("*"):
             if file.is_dir():
                 continue
@@ -247,4 +299,12 @@ def run(self) -> Path:
             is_front=False,
         )

+        logger.info("Fetching pages tree")
+        pages_tree = self.libretexts_client.get_page_tree()
+        selected_pages = self.content_filter.filter(pages_tree)
+        logger.info(
+            f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
+            "fetched and pushed to the ZIM"
+        )
+
         return zim_path
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index 5cf9ebf..b4f45f9 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -7,7 +7,7 @@
 )
 from zimscraperlib.image.probing import format_for

-from libretexts2zim.client import LibreTextsClient, LibreTextsHome
+from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome


 @pytest.fixture(scope="module")
@@ -35,6 +35,19 @@ def root_page_id() -> str:
     return "34"


+@pytest.fixture(scope="module")
+def nb_root_children() -> int:
+    return 6
+
+
+@pytest.fixture(scope="module")
+def page_tree(
+    client: LibreTextsClient,
+    deki_token: str,  # noqa: ARG001
+) -> DekiTree:
+    return client.get_page_tree()
+
+
 def test_get_deki_token(deki_token: str):
     """Ensures we achieve to get a deki_token"""
     assert deki_token
@@ -57,6 +70,41 @@ def test_get_root_page_id(
     assert client.get_root_page_id() == root_page_id


+def test_get_page_tree_pages(
+    page_tree: DekiTree,
+    minimum_number_of_pages: int,
+):
+    assert len(page_tree.pages.keys()) > minimum_number_of_pages
+
+
+def test_get_page_tree_root(
+    page_tree: DekiTree,
+    root_page_id: str,
+    nb_root_children: int,
+):
+    assert page_tree.root.id == root_page_id
+    assert len(page_tree.root.children) == nb_root_children
+    assert page_tree.root.title
+    for child in page_tree.root.children:
+        assert child.title
+
+
+def test_get_page_tree_subtree(
+    page_tree: DekiTree,
+):
+
+    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
+    subtree1 = page_tree.sub_tree("28207")
+    # 4 = "1. Understanding Science" + "1.1: What is Science?"
+    # + "1.2: The Scientific Method" + "1.3: The Study of Geology"
+    assert len(subtree1.pages.keys()) == 4
+
+    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
+    subtree2 = page_tree.sub_tree("28196")
+    # 94 is number retrieved in Oct. 2024, might change
+    assert len(subtree2.pages.keys()) == 94
+
+
 def test_get_home_image_url(home: LibreTextsHome):
     """Ensures proper image url is retrieved"""
     assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
new file mode 100644
index 0000000..9953738
--- /dev/null
+++ b/scraper/tests/test_processor.py
@@ -0,0 +1,182 @@
+import pytest
+
+from libretexts2zim.client import DekiPage, DekiTree
+from libretexts2zim.processor import ContentFilter
+
+
+@pytest.fixture(scope="module")
+def deki_tree() -> DekiTree:
+    root = DekiPage(id="24", title="Home page")
+    topic1 = DekiPage(id="25", title="1: First topic", parent=root)
+    root.children.append(topic1)
+    topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1)
+    topic1.children.append(topic1_1)
+    topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1)
+    topic1.children.append(topic1_2)
+    topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1)
+    topic1.children.append(topic1_3)
+    topic2 = DekiPage(id="29", title="2: Second topic", parent=root)
+    root.children.append(topic2)
+    topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2)
+    topic2.children.append(topic2_1)
+    topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2)
+    topic2.children.append(topic2_2)
+    topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2)
+    topic2.children.append(topic2_3)
+    topic3 = DekiPage(id="33", title="3: Third topic", parent=root)
+    root.children.append(topic3)
+    topic3_1 = DekiPage(id="34", title="3.1: Ground", parent=topic3)
+    topic3.children.append(topic3_1)
+    topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3)
+    topic3.children.append(topic3_2)
+    topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3)
+    topic3.children.append(topic3_3)
+    return DekiTree(
+        root=root,
+        pages={
+            root.id: root,
+            topic1.id: topic1,
+            topic1_1.id: topic1_1,
+            topic1_2.id: topic1_2,
+            topic1_3.id: topic1_3,
+            topic2.id: topic2,
+            topic2_1.id: topic2_1,
+            topic2_2.id: topic2_2,
+            topic2_3.id: topic2_3,
+            topic3.id: topic3,
+            topic3_1.id: topic3_1,
+            topic3_2.id: topic3_2,
+            topic3_3.id: topic3_3,
+        },
+    )
+
+
+@pytest.mark.parametrize(
+ "content_filter,expected_ids", + [ + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "27", "28"], + id="include_1", + ), + pytest.param( + ContentFilter( + page_title_include=r"^2\..*", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "29", "30", "31", "32"], + id="include_2", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude=None, + page_id_include="26,27,28", + root_page_id=None, + ), + ["24", "25", "26", "27", "28"], + id="include_3", + ), + pytest.param( + ContentFilter( + page_title_include="ground", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "29", "30", "33", "34"], + id="include_4", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude="Tree", + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_1", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude="Tree", + page_id_include="26,27,28", + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_2", + ), + pytest.param( + ContentFilter( + page_title_include="ground", + page_title_exclude="^2", + page_id_include=None, + root_page_id=None, + ), + ["24", "33", "34"], + id="include_exclude_3", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude="tree", + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_case_insensitive", + ), + pytest.param( + ContentFilter( + page_title_include="tree", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "27"], + id="include_case_insensitive", + ), + pytest.param( + ContentFilter( + page_title_include="^tree", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + [], + id="include_no_match", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26", "27", "28"], + id="root_page_id", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\.1.*", + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26"], + id="root_page_id_and_include", + ), + ], +) +def test_content_filter( + content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree +): + assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids From 35b5448389466efa9d4f36ce9f7ae0586fd71c83 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 3 Oct 2024 15:22:30 +0000 Subject: [PATCH 2/2] fixup! 
Subject: [PATCH 2/2] fixup! Add ability to retrieve and filter the whole page tree

---
 scraper/src/libretexts2zim/client.py     | 51 ++++++++++++++----
 scraper/src/libretexts2zim/entrypoint.py | 13 +++---
 scraper/src/libretexts2zim/processor.py  | 11 ++---
 scraper/tests-integration/test_client.py | 19 +++++----
 scraper/tests/test_processor.py          | 36 ++++++++---------
 5 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index 1689693..99bd7b7 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -24,22 +24,27 @@ class LibreTextsHome(BaseModel):
     welcome_image_url: str


-class DekiPage(BaseModel):
-    id: str
+LibraryPageId = str
+
+
+class LibraryPage(BaseModel):
+    """Class holding information about a given library page on the library tree"""
+
+    id: LibraryPageId
     title: str
-    parent: "DekiPage | None" = None
-    children: list["DekiPage"] = []
+    parent: "LibraryPage | None" = None
+    children: list["LibraryPage"] = []

     def __repr__(self) -> str:
         return (
-            f"DekiPage(id='{self.id}', title='{self.title}', "
+            f"WikiPage(id='{self.id}', title='{self.title}', "
             f"parent='{'None' if not self.parent else self.parent.id}', "
             f"children='{','.join([child.id for child in self.children])}')"
         )

     @property
-    def self_and_parents(self) -> list["DekiPage"]:
-        result: list[DekiPage] = [self]
+    def self_and_parents(self) -> list["LibraryPage"]:
+        result: list[LibraryPage] = [self]
         current = self
         while current.parent is not None:
             result.append(current.parent)
@@ -47,14 +52,16 @@ def self_and_parents(self) -> list["DekiPage"]:
         return result


-class DekiTree(BaseModel):
-    root: DekiPage
-    pages: dict[str, DekiPage] = {}
+class LibraryTree(BaseModel):
+    """Class holding information about the tree of pages on a given library"""

-    def sub_tree(self, subroot_id: str) -> "DekiTree":
+    root: LibraryPage
+    pages: dict[LibraryPageId, LibraryPage] = {}
+
+    def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
         """Returns a sub-tree, starting at given page id"""
         new_root = self.pages[subroot_id]
-        tree = DekiTree(root=new_root)
+        tree = LibraryTree(root=new_root)
         tree.pages[new_root.id] = new_root
         children_to_explore = [*new_root.children]
         while len(children_to_explore) > 0:
@@ -205,12 +212,12 @@ def get_deki_token(self) -> str:
             self.deki_token = _get_deki_token_from_home(soup)
         return self.deki_token

-    def get_all_pages_ids(self):
+    def get_all_pages_ids(self) -> list[LibraryPageId]:
         """Returns the IDs of all pages on current website, exploring the whole tree"""

         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

-        page_ids: list[str] = []
+        page_ids: list[LibraryPageId] = []

         def _get_page_ids(page_node: Any) -> None:
             page_ids.append(page_node["@id"])
@@ -226,31 +233,33 @@ def _get_page_ids(page_node: Any) -> None:

         return page_ids

-    def get_root_page_id(self) -> str:
+    def get_root_page_id(self) -> LibraryPageId:
         """Returns the ID the root of the tree of pages"""

         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
         return tree["page"]["@id"]

-    def get_page_tree(self) -> DekiTree:
+    def get_page_tree(self) -> LibraryTree:

         tree_data = self._get_api_json(
             "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
         )

-        root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"])
-        tree_obj = DekiTree(root=root)
+        root = LibraryPage(
+            id=tree_data["page"]["@id"], title=tree_data["page"]["title"]
+        )
+        tree_obj = LibraryTree(root=root)
         tree_obj.pages[root.id] = root

-        def _add_page(page_node: Any, parent: DekiPage) -> DekiPage:
-            page = DekiPage(
+        def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
+            page = LibraryPage(
                 id=page_node["@id"], title=page_node["title"], parent=parent
             )
             parent.children.append(page)
             tree_obj.pages[page.id] = page
             return page

-        def _process_tree_data(page_node: Any, parent: DekiPage) -> None:
+        def _process_tree_data(page_node: Any, parent: LibraryPage) -> None:
             if not page_node["subpages"]:
                 return
             if "@id" in page_node["subpages"]["page"]:
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 04b2ac5..80a6ca2 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -128,17 +128,18 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--page-title-include",
         help="Includes only pages with title matching the given regular "
-        "expression, and their parent pages for proper navigation. Can be combined"
-        " with --page-id-include (pages with matching title or id will be included"
-        ")",
+        "expression, and their parent pages for proper navigation, up to root (or "
+        "subroot if --root-page-id is set). Can be combined with --page-id-include "
+        "(pages with matching title or id will be included)",
         metavar="REGEX",
     )

     parser.add_argument(
         "--page-id-include",
-        help="CSV value of page ids to include. Parent pages will be included as "
-        "well for proper navigation. Can be combined with --page-title-include "
-        "(pages with matching title or id will be included)",
+        help="CSV of page ids to include. Parent pages will be included as "
+        "well for proper navigation, up to root (or subroot if --root-page-id is set). "
+        "Can be combined with --page-title-include (pages with matching title or id "
+        "will be included)",
     )

     parser.add_argument(
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 51bacec..4862afb 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -14,8 +14,9 @@
 from zimscraperlib.zim.indexing import IndexData

 from libretexts2zim.client import (
-    DekiPage,
-    DekiTree,
+    LibraryPage,
+    LibraryPageId,
+    LibraryTree,
     LibreTextsClient,
     LibreTextsMetadata,
 )
@@ -53,7 +54,7 @@ def of(namespace: argparse.Namespace) -> "ContentFilter":
         """Parses a namespace to create a new DocFilter."""
         return ContentFilter.model_validate(namespace, from_attributes=True)

-    def filter(self, page_tree: DekiTree) -> list[DekiPage]:
+    def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
         """Filters pages based on the user's choices."""

         if self.root_page_id:
@@ -78,8 +79,8 @@ def filter(self, page_tree: DekiTree) -> list[DekiPage]:
         def is_selected(
             title_include_re: re.Pattern[str] | None,
             title_exclude_re: re.Pattern[str] | None,
-            id_include: list[str] | None,
-            page: DekiPage,
+            id_include: list[LibraryPageId] | None,
+            page: LibraryPage,
         ) -> bool:
             return (
                 (
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index b4f45f9..5217470 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -7,7 +7,12 @@
 )
 from zimscraperlib.image.probing import format_for

-from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome
+from libretexts2zim.client import (
+    LibraryPageId,
+    LibraryTree,
+    LibreTextsClient,
+    LibreTextsHome,
+)


 @pytest.fixture(scope="module")
@@ -31,7 +36,7 @@ def minimum_number_of_pages() -> int:


 @pytest.fixture(scope="module")
-def root_page_id() -> str:
+def root_page_id() -> LibraryPageId:
     return "34"


@@ -44,7 +49,7 @@ def nb_root_children() -> int:
 def page_tree(
     client: LibreTextsClient,
     deki_token: str,  # noqa: ARG001
-) -> DekiTree:
+) -> LibraryTree:
     return client.get_page_tree()


@@ -64,21 +69,21 @@ def test_get_all_pages_ids(

 def test_get_root_page_id(
     client: LibreTextsClient,
-    root_page_id: str,
+    root_page_id: LibraryPageId,
     deki_token: str,  # noqa: ARG001
 ):
     assert client.get_root_page_id() == root_page_id


 def test_get_page_tree_pages(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
     minimum_number_of_pages: int,
 ):
     assert len(page_tree.pages.keys()) > minimum_number_of_pages


 def test_get_page_tree_root(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
     root_page_id: str,
     nb_root_children: int,
 ):
@@ -90,7 +95,7 @@ def test_get_page_tree_root(


 def test_get_page_tree_subtree(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
 ):

     # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
index 9953738..50e6c4f 100644
--- a/scraper/tests/test_processor.py
+++ b/scraper/tests/test_processor.py
@@ -1,37 +1,37 @@
 import pytest

-from libretexts2zim.client import DekiPage, DekiTree
+from libretexts2zim.client import LibraryPage, LibraryTree
 from libretexts2zim.processor import ContentFilter


 @pytest.fixture(scope="module")
-def deki_tree() -> DekiTree:
-    root = DekiPage(id="24", title="Home page")
-    topic1 = DekiPage(id="25", title="1: First topic", parent=root)
+def library_tree() -> LibraryTree:
+    root = LibraryPage(id="24", title="Home page")
+    topic1 = LibraryPage(id="25", title="1: First topic", parent=root)
     root.children.append(topic1)
-    topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1)
+    topic1_1 = LibraryPage(id="26", title="1.1: Cloud", parent=topic1)
     topic1.children.append(topic1_1)
-    topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1)
+    topic1_2 = LibraryPage(id="27", title="1.2: Tree", parent=topic1)
     topic1.children.append(topic1_2)
-    topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1)
+    topic1_3 = LibraryPage(id="28", title="1.3: Bees", parent=topic1)
     topic1.children.append(topic1_3)
-    topic2 = DekiPage(id="29", title="2: Second topic", parent=root)
+    topic2 = LibraryPage(id="29", title="2: Second topic", parent=root)
     root.children.append(topic2)
-    topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2)
+    topic2_1 = LibraryPage(id="30", title="2.1: Underground", parent=topic2)
     topic2.children.append(topic2_1)
-    topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2)
+    topic2_2 = LibraryPage(id="31", title="2.2: Lava", parent=topic2)
     topic2.children.append(topic2_2)
-    topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2)
+    topic2_3 = LibraryPage(id="32", title="2.3: Volcano", parent=topic2)
     topic2.children.append(topic2_3)
-    topic3 = DekiPage(id="33", title="3: Third topic", parent=root)
+    topic3 = LibraryPage(id="33", title="3: Third topic", parent=root)
     root.children.append(topic3)
-    topic3_1 = DekiPage(id="34", title="3.1: Ground", parent=topic3)
+    topic3_1 = LibraryPage(id="34", title="3.1: Ground", parent=topic3)
     topic3.children.append(topic3_1)
-    topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3)
+    topic3_2 = LibraryPage(id="35", title="3.2: Earth", parent=topic3)
     topic3.children.append(topic3_2)
-    topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3)
+    topic3_3 = LibraryPage(id="36", title="3.3: Sky", parent=topic3)
     topic3.children.append(topic3_3)
-    return DekiTree(
+    return LibraryTree(
         root=root,
         pages={
             root.id: root,
@@ -177,6 +177,6 @@ def deki_tree() -> DekiTree:
     ],
 )
 def test_content_filter(
-    content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree
+    content_filter: ContentFilter, expected_ids: list[str], library_tree: LibraryTree
 ):
-    assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids
+    assert [page.id for page in content_filter.filter(library_tree)] == expected_ids