diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index 794755d..99bd7b7 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -24,6 +24,56 @@ class LibreTextsHome(BaseModel):
     welcome_image_url: str
 
 
+LibraryPageId = str
+
+
+class LibraryPage(BaseModel):
+    """Class holding information about a given library page on the library tree"""
+
+    id: LibraryPageId
+    title: str
+    parent: "LibraryPage | None" = None
+    children: list["LibraryPage"] = []
+
+    def __repr__(self) -> str:
+        return (
+            f"LibraryPage(id='{self.id}', title='{self.title}', "
+            f"parent='{'None' if not self.parent else self.parent.id}', "
+            f"children='{','.join([child.id for child in self.children])}')"
+        )
+
+    @property
+    def self_and_parents(self) -> list["LibraryPage"]:
+        result: list[LibraryPage] = [self]
+        current = self
+        while current.parent is not None:
+            result.append(current.parent)
+            current = current.parent
+        return result
+
+
+class LibraryTree(BaseModel):
+    """Class holding information about the tree of pages on a given library"""
+
+    root: LibraryPage
+    pages: dict[LibraryPageId, LibraryPage] = {}
+
+    def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
+        """Returns a sub-tree, starting at the given page id"""
+        new_root = self.pages[subroot_id]
+        tree = LibraryTree(root=new_root)
+        tree.pages[new_root.id] = new_root
+        children_to_explore = [*new_root.children]
+        while len(children_to_explore) > 0:
+            child = children_to_explore[0]
+            children_to_explore.remove(child)
+            if child.id in tree.pages:
+                continue  # safe-guard
+            tree.pages[child.id] = child
+            children_to_explore.extend(child.children)
+        return tree
+
+
 class LibreTextsMetadata(BaseModel):
     """Metadata about a library."""
 
@@ -162,12 +212,12 @@ def get_deki_token(self) -> str:
         self.deki_token = _get_deki_token_from_home(soup)
         return self.deki_token
 
-    def get_all_pages_ids(self):
+    def get_all_pages_ids(self) -> list[LibraryPageId]:
         """Returns the IDs of all pages on current website, exploring the whole tree"""
 
         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
 
-        page_ids: list[str] = []
+        page_ids: list[LibraryPageId] = []
 
         def _get_page_ids(page_node: Any) -> None:
             page_ids.append(page_node["@id"])
@@ -183,12 +233,47 @@ def _get_page_ids(page_node: Any) -> None:
         return page_ids
 
-    def get_root_page_id(self) -> str:
+    def get_root_page_id(self) -> LibraryPageId:
         """Returns the ID the root of the tree of pages"""
 
         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
 
         return tree["page"]["@id"]
 
+    def get_page_tree(self) -> LibraryTree:
+
+        tree_data = self._get_api_json(
+            "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
+        )
+
+        root = LibraryPage(
+            id=tree_data["page"]["@id"], title=tree_data["page"]["title"]
+        )
+        tree_obj = LibraryTree(root=root)
+        tree_obj.pages[root.id] = root
+
+        def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
+            page = LibraryPage(
+                id=page_node["@id"], title=page_node["title"], parent=parent
+            )
+            parent.children.append(page)
+            tree_obj.pages[page.id] = page
+            return page
+
+        def _process_tree_data(page_node: Any, parent: LibraryPage) -> None:
+            if not page_node["subpages"]:
+                return
+            if "@id" in page_node["subpages"]["page"]:
+                page = _add_page(page_node["subpages"]["page"], parent=parent)
+                _process_tree_data(page_node["subpages"]["page"], parent=page)
+            else:
+                for subpage_node in page_node["subpages"]["page"]:
+                    page = _add_page(subpage_node, parent=parent)
+                    _process_tree_data(subpage_node, parent=page)
+
+        _process_tree_data(tree_data["page"], parent=root)
+
+        return tree_obj
+
 
 def _get_soup(content: str) -> BeautifulSoup:
     """Return a BeautifulSoup soup from textual content
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 1d8ef52..80a6ca2 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -126,17 +126,34 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     """Adds flags related to content filtering to the given parser."""
 
     parser.add_argument(
-        "--shelves-include",
-        help="Includes only shelves matching the given regular expression.",
+        "--page-title-include",
+        help="Includes only pages with title matching the given regular "
+        "expression, and their parent pages for proper navigation, up to root (or "
+        "subroot if --root-page-id is set). Can be combined with --page-id-include "
+        "(pages with matching title or id will be included)",
         metavar="REGEX",
     )
 
     parser.add_argument(
-        "--shelves-exclude",
-        help="Excludes shelves matching the given regular expression.",
+        "--page-id-include",
+        help="CSV of page ids to include. Parent pages will be included as "
+        "well for proper navigation, up to root (or subroot if --root-page-id is set). "
+        "Can be combined with --page-title-include (pages with matching title or id "
+        "will be included)",
+    )
+
+    parser.add_argument(
+        "--page-title-exclude",
+        help="Excludes pages with title matching the given regular expression",
         metavar="REGEX",
     )
 
+    parser.add_argument(
+        "--root-page-id",
+        help="ID of the root page to include in ZIM. Only this page and its"
+        " subpages will be included in the ZIM",
+    )
+
 
 def main(tmpdir: str) -> None:
     parser = argparse.ArgumentParser(
@@ -223,7 +240,7 @@ def main(tmpdir: str) -> None:
     doc_filter = ContentFilter.of(args)
 
     cache_folder = tmp_folder / "cache"
-    cache_folder.mkdir()
+    cache_folder.mkdir(exist_ok=True)
 
     libretexts_client = LibreTextsClient(
         library_slug=args.library_slug,
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 146259f..4862afb 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -1,5 +1,6 @@
 import argparse
 import datetime
+import re
 from io import BytesIO
 from pathlib import Path
 
@@ -12,7 +13,13 @@ from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 from zimscraperlib.zim.indexing import IndexData
 
-from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
+from libretexts2zim.client import (
+    LibraryPage,
+    LibraryPageId,
+    LibraryTree,
+    LibreTextsClient,
+    LibreTextsMetadata,
+)
 from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
 from libretexts2zim.ui import ConfigModel, HomeModel, SharedModel
 from libretexts2zim.zimconfig import ZimConfig
 
@@ -33,23 +40,69 @@ class MissingDocumentError(Exception):
 class ContentFilter(BaseModel):
     """Supports filtering documents by user provided attributes."""
 
-    # If specified, only shelves matching the regex are included.
-    shelves_include: str | None
-    # If specified, shelves matching the regex are excluded.
-    shelves_exclude: str | None
+    # If specified, only pages with title matching the regex are included.
+    page_title_include: str | None
+    # If specified, only pages with matching ids are included.
+    page_id_include: str | None
+    # If specified, pages with title matching the regex are excluded.
+    page_title_exclude: str | None
+    # If specified, only this page and its subpages will be included.
+    root_page_id: str | None
 
     @staticmethod
     def of(namespace: argparse.Namespace) -> "ContentFilter":
         """Parses a namespace to create a new DocFilter."""
         return ContentFilter.model_validate(namespace, from_attributes=True)
 
-    # TODO: implement filtering of shelves based on configured regex
-    # def filter(self, shelves: list[LibretextsShelve]) -> list[LibretextsShelve]:
-    #     """Filters docs based on the user's choices."""
-    #     selected: list[LibretextsShelve] = []
-    #     for shelve in shelves:
-    #         ....
-    #     return selected
+    def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
+        """Filters pages based on the user's choices."""
+
+        if self.root_page_id:
+            page_tree = page_tree.sub_tree(self.root_page_id)
+
+        title_include_re = (
+            re.compile(self.page_title_include, re.IGNORECASE)
+            if self.page_title_include
+            else None
+        )
+        title_exclude_re = (
+            re.compile(self.page_title_exclude, re.IGNORECASE)
+            if self.page_title_exclude
+            else None
+        )
+        id_include = (
+            [page_id.strip() for page_id in self.page_id_include.split(",")]
+            if self.page_id_include
+            else None
+        )
+
+        def is_selected(
+            title_include_re: re.Pattern[str] | None,
+            title_exclude_re: re.Pattern[str] | None,
+            id_include: list[LibraryPageId] | None,
+            page: LibraryPage,
+        ) -> bool:
+            return (
+                (
+                    not title_include_re
+                    or title_include_re.search(page.title) is not None
+                )
+                and (not id_include or page.id in id_include)
+                and (
+                    not title_exclude_re or title_exclude_re.search(page.title) is None
+                )
+            )
+
+        # Find selected pages and their parents, and create a set of unique ids
+        selected_ids = {
+            selected_page.id
+            for page in page_tree.pages.values()
+            for selected_page in page.self_and_parents
+            if is_selected(title_include_re, title_exclude_re, id_include, page)
+        }
+
+        # Then transform the set of ids into a list of pages
+        return [page for page in page_tree.pages.values() if page.id in selected_ids]
 
 
 def add_item_for(
@@ -113,7 +166,7 @@ def __init__(
         """
         self.libretexts_client = libretexts_client
         self.zim_config = zim_config
-        self.doc_filter = content_filter
+        self.content_filter = content_filter
         self.output_folder = output_folder
         self.zimui_dist = zimui_dist
         self.overwrite_existing_zim = overwrite_existing_zim
@@ -222,7 +275,7 @@ def run(self) -> Path:
             ).model_dump_json(by_alias=True),
         )
 
-        logger.info(f"Adding files in {self.zimui_dist}")
+        logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
         for file in self.zimui_dist.rglob("*"):
             if file.is_dir():
                 continue
@@ -247,4 +300,12 @@ def run(self) -> Path:
             is_front=False,
         )
 
+        logger.info("Fetching pages tree")
+        pages_tree = self.libretexts_client.get_page_tree()
+        selected_pages = self.content_filter.filter(pages_tree)
+        logger.info(
+            f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
+            "fetched and pushed to the ZIM"
+        )
+
         return zim_path
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index 5cf9ebf..5217470 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -7,7 +7,12 @@
 )
 from zimscraperlib.image.probing import format_for
 
-from libretexts2zim.client import LibreTextsClient, LibreTextsHome
+from libretexts2zim.client import (
+    LibraryPageId,
+    LibraryTree,
+    LibreTextsClient,
+    LibreTextsHome,
+)
 
 
 @pytest.fixture(scope="module")
@@ -31,10 +36,23 @@ def minimum_number_of_pages() -> int:
 
 
 @pytest.fixture(scope="module")
-def root_page_id() -> str:
+def root_page_id() -> LibraryPageId:
     return "34"
 
 
+@pytest.fixture(scope="module")
+def nb_root_children() -> int:
+    return 6
+
+
+@pytest.fixture(scope="module")
+def page_tree(
+    client: LibreTextsClient,
+    deki_token: str,  # noqa: ARG001
+) -> LibraryTree:
+    return client.get_page_tree()
+
+
 def test_get_deki_token(deki_token: str):
     """Ensures we achieve to get a deki_token"""
     assert deki_token
@@ -51,12 +69,47 @@ def test_get_all_pages_ids(
 
 def test_get_root_page_id(
     client: LibreTextsClient,
-    root_page_id: str,
+    root_page_id: LibraryPageId,
     deki_token: str,  # noqa: ARG001
 ):
     assert client.get_root_page_id() == root_page_id
 
 
+def test_get_page_tree_pages(
+    page_tree: LibraryTree,
+    minimum_number_of_pages: int,
+):
+    assert len(page_tree.pages.keys()) > minimum_number_of_pages
+
+
+def test_get_page_tree_root(
+    page_tree: LibraryTree,
+    root_page_id: str,
+    nb_root_children: int,
+):
+    assert page_tree.root.id == root_page_id
+    assert len(page_tree.root.children) == nb_root_children
+    assert page_tree.root.title
+    for child in page_tree.root.children:
+        assert child.title
+
+
+def test_get_page_tree_subtree(
+    page_tree: LibraryTree,
+):
+
+    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
+    subtree1 = page_tree.sub_tree("28207")
+    # 4 = "1. Understanding Science" + "1.1: What is Science?"
+    #  + "1.2: The Scientific Method" + "1.3: The Study of Geology"
+    assert len(subtree1.pages.keys()) == 4
+
+    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
+    subtree2 = page_tree.sub_tree("28196")
+    # 94 is the number retrieved in Oct. 2024, might change
+    assert len(subtree2.pages.keys()) == 94
+
+
 def test_get_home_image_url(home: LibreTextsHome):
     """Ensures proper image url is retrieved"""
     assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
new file mode 100644
index 0000000..50e6c4f
--- /dev/null
+++ b/scraper/tests/test_processor.py
@@ -0,0 +1,182 @@
+import pytest
+
+from libretexts2zim.client import LibraryPage, LibraryTree
+from libretexts2zim.processor import ContentFilter
+
+
+@pytest.fixture(scope="module")
+def library_tree() -> LibraryTree:
+    root = LibraryPage(id="24", title="Home page")
+    topic1 = LibraryPage(id="25", title="1: First topic", parent=root)
+    root.children.append(topic1)
+    topic1_1 = LibraryPage(id="26", title="1.1: Cloud", parent=topic1)
+    topic1.children.append(topic1_1)
+    topic1_2 = LibraryPage(id="27", title="1.2: Tree", parent=topic1)
+    topic1.children.append(topic1_2)
+    topic1_3 = LibraryPage(id="28", title="1.3: Bees", parent=topic1)
+    topic1.children.append(topic1_3)
+    topic2 = LibraryPage(id="29", title="2: Second topic", parent=root)
+    root.children.append(topic2)
+    topic2_1 = LibraryPage(id="30", title="2.1: Underground", parent=topic2)
+    topic2.children.append(topic2_1)
+    topic2_2 = LibraryPage(id="31", title="2.2: Lava", parent=topic2)
+    topic2.children.append(topic2_2)
+    topic2_3 = LibraryPage(id="32", title="2.3: Volcano", parent=topic2)
+    topic2.children.append(topic2_3)
+    topic3 = LibraryPage(id="33", title="3: Third topic", parent=root)
+    root.children.append(topic3)
+    topic3_1 = LibraryPage(id="34", title="3.1: Ground", parent=topic3)
+    topic3.children.append(topic3_1)
+    topic3_2 = LibraryPage(id="35", title="3.2: Earth", parent=topic3)
+    topic3.children.append(topic3_2)
+    topic3_3 = LibraryPage(id="36", title="3.3: Sky", parent=topic3)
+    topic3.children.append(topic3_3)
+    return LibraryTree(
+        root=root,
+        pages={
+            root.id: root,
+            topic1.id: topic1,
+            topic1_1.id: topic1_1,
+            topic1_2.id: topic1_2,
+            topic1_3.id: topic1_3,
+            topic2.id: topic2,
+            topic2_1.id: topic2_1,
+            topic2_2.id: topic2_2,
+            topic2_3.id: topic2_3,
+            topic3.id: topic3,
+            topic3_1.id: topic3_1,
+            topic3_2.id: topic3_2,
+            topic3_3.id: topic3_3,
+        },
+    )
+
+
+@pytest.mark.parametrize(
+    "content_filter,expected_ids",
+    [
+        pytest.param(
+            ContentFilter(
+                page_title_include=r"^1\..*",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "25", "26", "27", "28"],
+            id="include_1",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=r"^2\..*",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "29", "30", "31", "32"],
+            id="include_2",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=None,
+                page_title_exclude=None,
+                page_id_include="26,27,28",
+                root_page_id=None,
+            ),
+            ["24", "25", "26", "27", "28"],
+            id="include_3",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include="ground",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "29", "30", "33", "34"],
+            id="include_4",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=r"^1\..*",
+                page_title_exclude="Tree",
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "25", "26", "28"],
+            id="include_exclude_1",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=None,
+                page_title_exclude="Tree",
+                page_id_include="26,27,28",
+                root_page_id=None,
+            ),
+            ["24", "25", "26", "28"],
+            id="include_exclude_2",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include="ground",
+                page_title_exclude="^2",
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "33", "34"],
+            id="include_exclude_3",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=r"^1\..*",
+                page_title_exclude="tree",
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "25", "26", "28"],
+            id="include_exclude_case_insensitive",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include="tree",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            ["24", "25", "27"],
+            id="include_case_insensitive",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include="^tree",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id=None,
+            ),
+            [],
+            id="include_no_match",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=None,
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id="25",
+            ),
+            ["25", "26", "27", "28"],
+            id="root_page_id",
+        ),
+        pytest.param(
+            ContentFilter(
+                page_title_include=r"^1\.1.*",
+                page_title_exclude=None,
+                page_id_include=None,
+                root_page_id="25",
+            ),
+            ["25", "26"],
+            id="root_page_id_and_include",
+        ),
+    ],
+)
+def test_content_filter(
+    content_filter: ContentFilter, expected_ids: list[str], library_tree: LibraryTree
+):
+    assert [page.id for page in content_filter.filter(library_tree)] == expected_ids