From 0120c2b8e7fdbb68a471d58c50fe54e5ac8ba41f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Tue, 1 Oct 2024 08:49:42 +0000
Subject: [PATCH 1/2] Add ability to retrieve and filter the whole page tree

---
 scraper/src/libretexts2zim/client.py     |  76 ++++++++++
 scraper/src/libretexts2zim/entrypoint.py |  26 +++-
 scraper/src/libretexts2zim/processor.py  |  88 +++++++++--
 scraper/tests-integration/test_client.py |  50 ++++++-
 scraper/tests/test_processor.py          | 182 +++++++++++++++++++++++
 5 files changed, 402 insertions(+), 20 deletions(-)
 create mode 100644 scraper/tests/test_processor.py

diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index 794755d..1689693 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -24,6 +24,49 @@ class LibreTextsHome(BaseModel):
     welcome_image_url: str


+class DekiPage(BaseModel):
+    id: str
+    title: str
+    parent: "DekiPage | None" = None
+    children: list["DekiPage"] = []
+
+    def __repr__(self) -> str:
+        return (
+            f"DekiPage(id='{self.id}', title='{self.title}', "
+            f"parent='{'None' if not self.parent else self.parent.id}', "
+            f"children='{','.join([child.id for child in self.children])}')"
+        )
+
+    @property
+    def self_and_parents(self) -> list["DekiPage"]:
+        result: list[DekiPage] = [self]
+        current = self
+        while current.parent is not None:
+            result.append(current.parent)
+            current = current.parent
+        return result
+
+
+class DekiTree(BaseModel):
+    root: DekiPage
+    pages: dict[str, DekiPage] = {}
+
+    def sub_tree(self, subroot_id: str) -> "DekiTree":
+        """Returns a sub-tree, starting at given page id"""
+        new_root = self.pages[subroot_id]
+        tree = DekiTree(root=new_root)
+        tree.pages[new_root.id] = new_root
+        children_to_explore = [*new_root.children]
+        while len(children_to_explore) > 0:
+            child = children_to_explore[0]
+            children_to_explore.remove(child)
+            if child.id in tree.pages:
+                continue  # safe-guard
+            tree.pages[child.id] = child
+            children_to_explore.extend(child.children)
+        return tree
+
+
 class LibreTextsMetadata(BaseModel):
     """Metadata about a library."""

@@ -189,6 +232,39 @@ def get_root_page_id(self) -> str:
         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
         return tree["page"]["@id"]

+    def get_page_tree(self) -> DekiTree:
+
+        tree_data = self._get_api_json(
+            "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
+        )
+
+        root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"])
+        tree_obj = DekiTree(root=root)
+        tree_obj.pages[root.id] = root
+
+        def _add_page(page_node: Any, parent: DekiPage) -> DekiPage:
+            page = DekiPage(
+                id=page_node["@id"], title=page_node["title"], parent=parent
+            )
+            parent.children.append(page)
+            tree_obj.pages[page.id] = page
+            return page
+
+        def _process_tree_data(page_node: Any, parent: DekiPage) -> None:
+            if not page_node["subpages"]:
+                return
+            if "@id" in page_node["subpages"]["page"]:
+                page = _add_page(page_node["subpages"]["page"], parent=parent)
+                _process_tree_data(page_node["subpages"]["page"], parent=page)
+            else:
+                for subpage_node in page_node["subpages"]["page"]:
+                    page = _add_page(subpage_node, parent=parent)
+                    _process_tree_data(subpage_node, parent=page)
+
+        _process_tree_data(tree_data["page"], parent=root)
+
+        return tree_obj
+

 def _get_soup(content: str) -> BeautifulSoup:
     """Return a BeautifulSoup soup from textual content
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 1d8ef52..04b2ac5 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -126,17 +126,33 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     """Adds flags related to content filtering to the given parser."""

     parser.add_argument(
-        "--shelves-include",
-        help="Includes only shelves matching the given regular expression.",
+        "--page-title-include",
+        help="Includes only pages with title matching the given regular "
+        "expression, and their parent pages for proper navigation. Can be combined"
+        " with --page-id-include (pages with matching title or id will be included"
+        ")",
         metavar="REGEX",
     )

     parser.add_argument(
-        "--shelves-exclude",
-        help="Excludes shelves matching the given regular expression.",
+        "--page-id-include",
+        help="CSV value of page ids to include. Parent pages will be included as "
+        "well for proper navigation. Can be combined with --page-title-include "
+        "(pages with matching title or id will be included)",
+    )
+
+    parser.add_argument(
+        "--page-title-exclude",
+        help="Excludes pages with title matching the given regular expression",
         metavar="REGEX",
     )

+    parser.add_argument(
+        "--root-page-id",
+        help="ID of the root page to include in ZIM. Only this page and its"
+        " subpages will be included in the ZIM",
+    )
+

 def main(tmpdir: str) -> None:
     parser = argparse.ArgumentParser(
@@ -223,7 +239,7 @@ def main(tmpdir: str) -> None:
     doc_filter = ContentFilter.of(args)

     cache_folder = tmp_folder / "cache"
-    cache_folder.mkdir()
+    cache_folder.mkdir(exist_ok=True)

     libretexts_client = LibreTextsClient(
         library_slug=args.library_slug,
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 146259f..51bacec 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -1,5 +1,6 @@
 import argparse
 import datetime
+import re
 from io import BytesIO
 from pathlib import Path

@@ -12,7 +13,12 @@
 from zimscraperlib.zim.filesystem import validate_zimfile_creatable
 from zimscraperlib.zim.indexing import IndexData

-from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
+from libretexts2zim.client import (
+    DekiPage,
+    DekiTree,
+    LibreTextsClient,
+    LibreTextsMetadata,
+)
 from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
 from libretexts2zim.ui import ConfigModel, HomeModel, SharedModel
 from libretexts2zim.zimconfig import ZimConfig
@@ -33,23 +39,69 @@ class MissingDocumentError(Exception):
 class ContentFilter(BaseModel):
     """Supports filtering documents by user provided attributes."""

-    # If specified, only shelves matching the regex are included.
-    shelves_include: str | None
-    # If specified, shelves matching the regex are excluded.
-    shelves_exclude: str | None
+    # If specified, only pages with title matching the regex are included.
+    page_title_include: str | None
+    # If specified, only pages with matching ids are included.
+    page_id_include: str | None
+    # If specified, pages with title matching the regex are excluded.
+    page_title_exclude: str | None
+    # If specified, only this page and its subpages will be included.
+    root_page_id: str | None

     @staticmethod
     def of(namespace: argparse.Namespace) -> "ContentFilter":
         """Parses a namespace to create a new DocFilter."""
         return ContentFilter.model_validate(namespace, from_attributes=True)

-    # TODO: implement filtering of shelves based on configured regex
-    # def filter(self, shelves: list[LibretextsShelve]) -> list[LibretextsShelve]:
-    #     """Filters docs based on the user's choices."""
-    #     selected: list[LibretextsShelve] = []
-    #     for shelve in shelves:
-    #         ....
-    #     return selected
+    def filter(self, page_tree: DekiTree) -> list[DekiPage]:
+        """Filters pages based on the user's choices."""
+
+        if self.root_page_id:
+            page_tree = page_tree.sub_tree(self.root_page_id)
+
+        title_include_re = (
+            re.compile(self.page_title_include, re.IGNORECASE)
+            if self.page_title_include
+            else None
+        )
+        title_exclude_re = (
+            re.compile(self.page_title_exclude, re.IGNORECASE)
+            if self.page_title_exclude
+            else None
+        )
+        id_include = (
+            [page_id.strip() for page_id in self.page_id_include.split(",")]
+            if self.page_id_include
+            else None
+        )
+
+        def is_selected(
+            title_include_re: re.Pattern[str] | None,
+            title_exclude_re: re.Pattern[str] | None,
+            id_include: list[str] | None,
+            page: DekiPage,
+        ) -> bool:
+            return (
+                (
+                    not title_include_re
+                    or title_include_re.search(page.title) is not None
+                )
+                and (not id_include or page.id in id_include)
+                and (
+                    not title_exclude_re or title_exclude_re.search(page.title) is None
+                )
+            )
+
+        # Find selected pages and their parents, and create a set of unique ids
+        selected_ids = {
+            selected_page.id
+            for page in page_tree.pages.values()
+            for selected_page in page.self_and_parents
+            if is_selected(title_include_re, title_exclude_re, id_include, page)
+        }
+
+        # Then transform the set of ids into a list of pages
+        return [page for page in page_tree.pages.values() if page.id in selected_ids]


 def add_item_for(
@@ -113,7 +165,7 @@ def __init__(
         """
         self.libretexts_client = libretexts_client
         self.zim_config = zim_config
-        self.doc_filter = content_filter
+        self.content_filter = content_filter
         self.output_folder = output_folder
         self.zimui_dist = zimui_dist
         self.overwrite_existing_zim = overwrite_existing_zim
@@ -222,7 +274,7 @@ def run(self) -> Path:
             ).model_dump_json(by_alias=True),
         )

-        logger.info(f"Adding files in {self.zimui_dist}")
+        logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
         for file in self.zimui_dist.rglob("*"):
             if file.is_dir():
                 continue
@@ -247,4 +299,12 @@ def run(self) -> Path:
             is_front=False,
         )

+        logger.info("Fetching pages tree")
+        pages_tree = self.libretexts_client.get_page_tree()
+        selected_pages = self.content_filter.filter(pages_tree)
+        logger.info(
+            f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
+            "fetched and pushed to the ZIM"
+        )
+
         return zim_path
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index 5cf9ebf..b4f45f9 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -7,7 +7,7 @@
 )
 from zimscraperlib.image.probing import format_for

-from libretexts2zim.client import LibreTextsClient, LibreTextsHome
+from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome


 @pytest.fixture(scope="module")
@@ -35,6 +35,19 @@ def root_page_id() -> str:
     return "34"


+@pytest.fixture(scope="module")
+def nb_root_children() -> int:
+    return 6
+
+
+@pytest.fixture(scope="module")
+def page_tree(
+    client: LibreTextsClient,
+    deki_token: str,  # noqa: ARG001
+) -> DekiTree:
+    return client.get_page_tree()
+
+
 def test_get_deki_token(deki_token: str):
     """Ensures we achieve to get a deki_token"""
     assert deki_token
@@ -57,6 +70,41 @@ def test_get_root_page_id(
     assert client.get_root_page_id() == root_page_id


+def test_get_page_tree_pages(
+    page_tree: DekiTree,
+    minimum_number_of_pages: int,
+):
+    assert len(page_tree.pages.keys()) > minimum_number_of_pages
+
+
+def test_get_page_tree_root(
+    page_tree: DekiTree,
+    root_page_id: str,
+    nb_root_children: int,
+):
+    assert page_tree.root.id == root_page_id
+    assert len(page_tree.root.children) == nb_root_children
+    assert page_tree.root.title
+    for child in page_tree.root.children:
+        assert child.title
+
+
+def test_get_page_tree_subtree(
+    page_tree: DekiTree,
+):
+
+    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
+    subtree1 = page_tree.sub_tree("28207")
+    # 4 = "1. Understanding Science" + "1.1: What is Science?"
+    # + "1.2: The Scientific Method" + "1.3: The Study of Geology"
+    assert len(subtree1.pages.keys()) == 4
+
+    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
+    subtree2 = page_tree.sub_tree("28196")
+    # 94 is number retrieved in Oct. 2024, might change
+    assert len(subtree2.pages.keys()) == 94
+
+
 def test_get_home_image_url(home: LibreTextsHome):
     """Ensures proper image url is retrieved"""
     assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
new file mode 100644
index 0000000..9953738
--- /dev/null
+++ b/scraper/tests/test_processor.py
@@ -0,0 +1,182 @@
+import pytest
+
+from libretexts2zim.client import DekiPage, DekiTree
+from libretexts2zim.processor import ContentFilter
+
+
+@pytest.fixture(scope="module")
+def deki_tree() -> DekiTree:
+    root = DekiPage(id="24", title="Home page")
+    topic1 = DekiPage(id="25", title="1: First topic", parent=root)
+    root.children.append(topic1)
+    topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1)
+    topic1.children.append(topic1_1)
+    topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1)
+    topic1.children.append(topic1_2)
+    topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1)
+    topic1.children.append(topic1_3)
+    topic2 = DekiPage(id="29", title="2: Second topic", parent=root)
+    root.children.append(topic2)
+    topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2)
+    topic2.children.append(topic2_1)
+    topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2)
+    topic2.children.append(topic2_2)
+    topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2)
+    topic2.children.append(topic2_3)
+    topic3 = DekiPage(id="33", title="3: Third topic", parent=root)
+    root.children.append(topic3)
+    topic3_1 = DekiPage(id="34", title="3.1: Ground", parent=topic3)
+    topic3.children.append(topic3_1)
+    topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3)
+    topic3.children.append(topic3_2)
+    topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3)
+    topic3.children.append(topic3_3)
+    return DekiTree(
+        root=root,
+        pages={
+            root.id: root,
+            topic1.id: topic1,
+            topic1_1.id: topic1_1,
+            topic1_2.id: topic1_2,
+            topic1_3.id: topic1_3,
+            topic2.id: topic2,
+            topic2_1.id: topic2_1,
+            topic2_2.id: topic2_2,
+            topic2_3.id: topic2_3,
+            topic3.id: topic3,
+            topic3_1.id: topic3_1,
+            topic3_2.id: topic3_2,
+            topic3_3.id: topic3_3,
+        },
+    )
+
+
+@pytest.mark.parametrize(
+ "content_filter,expected_ids", + [ + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "27", "28"], + id="include_1", + ), + pytest.param( + ContentFilter( + page_title_include=r"^2\..*", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "29", "30", "31", "32"], + id="include_2", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude=None, + page_id_include="26,27,28", + root_page_id=None, + ), + ["24", "25", "26", "27", "28"], + id="include_3", + ), + pytest.param( + ContentFilter( + page_title_include="ground", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "29", "30", "33", "34"], + id="include_4", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude="Tree", + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_1", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude="Tree", + page_id_include="26,27,28", + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_2", + ), + pytest.param( + ContentFilter( + page_title_include="ground", + page_title_exclude="^2", + page_id_include=None, + root_page_id=None, + ), + ["24", "33", "34"], + id="include_exclude_3", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\..*", + page_title_exclude="tree", + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "26", "28"], + id="include_exclude_case_insensitive", + ), + pytest.param( + ContentFilter( + page_title_include="tree", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + ["24", "25", "27"], + id="include_case_insensitive", + ), + pytest.param( + ContentFilter( + page_title_include="^tree", + page_title_exclude=None, + page_id_include=None, + root_page_id=None, + ), + [], + id="include_no_match", + ), + pytest.param( + ContentFilter( + page_title_include=None, + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26", "27", "28"], + id="root_page_id", + ), + pytest.param( + ContentFilter( + page_title_include=r"^1\.1.*", + page_title_exclude=None, + page_id_include=None, + root_page_id="25", + ), + ["25", "26"], + id="root_page_id_and_include", + ), + ], +) +def test_content_filter( + content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree +): + assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids From 35b5448389466efa9d4f36ce9f7ae0586fd71c83 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 3 Oct 2024 15:22:30 +0000 Subject: [PATCH 2/2] fixup! 
Subject: [PATCH 2/2] fixup! Add ability to retrieve and filter the whole page tree

---
 scraper/src/libretexts2zim/client.py     | 51 ++++++++++++++----
 scraper/src/libretexts2zim/entrypoint.py | 13 +++---
 scraper/src/libretexts2zim/processor.py  | 11 ++---
 scraper/tests-integration/test_client.py | 19 +++++----
 scraper/tests/test_processor.py          | 36 ++++++++---------
 5 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/scraper/src/libretexts2zim/client.py b/scraper/src/libretexts2zim/client.py
index 1689693..99bd7b7 100644
--- a/scraper/src/libretexts2zim/client.py
+++ b/scraper/src/libretexts2zim/client.py
@@ -24,22 +24,27 @@ class LibreTextsHome(BaseModel):
     welcome_image_url: str


-class DekiPage(BaseModel):
-    id: str
+LibraryPageId = str
+
+
+class LibraryPage(BaseModel):
+    """Class holding information about a given library page on the library tree"""
+
+    id: LibraryPageId
     title: str
-    parent: "DekiPage | None" = None
-    children: list["DekiPage"] = []
+    parent: "LibraryPage | None" = None
+    children: list["LibraryPage"] = []

     def __repr__(self) -> str:
         return (
-            f"DekiPage(id='{self.id}', title='{self.title}', "
+            f"WikiPage(id='{self.id}', title='{self.title}', "
             f"parent='{'None' if not self.parent else self.parent.id}', "
             f"children='{','.join([child.id for child in self.children])}')"
         )

     @property
-    def self_and_parents(self) -> list["DekiPage"]:
-        result: list[DekiPage] = [self]
+    def self_and_parents(self) -> list["LibraryPage"]:
+        result: list[LibraryPage] = [self]
         current = self
         while current.parent is not None:
             result.append(current.parent)
@@ -47,14 +52,16 @@ def self_and_parents(self) -> list["DekiPage"]:
         return result


-class DekiTree(BaseModel):
-    root: DekiPage
-    pages: dict[str, DekiPage] = {}
+class LibraryTree(BaseModel):
+    """Class holding information about the tree of pages on a given library"""

-    def sub_tree(self, subroot_id: str) -> "DekiTree":
+    root: LibraryPage
+    pages: dict[LibraryPageId, LibraryPage] = {}
+
+    def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
         """Returns a sub-tree, starting at given page id"""
         new_root = self.pages[subroot_id]
-        tree = DekiTree(root=new_root)
+        tree = LibraryTree(root=new_root)
         tree.pages[new_root.id] = new_root
         children_to_explore = [*new_root.children]
         while len(children_to_explore) > 0:
@@ -205,12 +212,12 @@ def get_deki_token(self) -> str:
             self.deki_token = _get_deki_token_from_home(soup)
         return self.deki_token

-    def get_all_pages_ids(self):
+    def get_all_pages_ids(self) -> list[LibraryPageId]:
         """Returns the IDs of all pages on current website, exploring the whole tree"""

         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

-        page_ids: list[str] = []
+        page_ids: list[LibraryPageId] = []

         def _get_page_ids(page_node: Any) -> None:
             page_ids.append(page_node["@id"])
@@ -226,31 +233,33 @@ def _get_page_ids(page_node: Any) -> None:

         return page_ids

-    def get_root_page_id(self) -> str:
+    def get_root_page_id(self) -> LibraryPageId:
         """Returns the ID the root of the tree of pages"""

         tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
         return tree["page"]["@id"]

-    def get_page_tree(self) -> DekiTree:
+    def get_page_tree(self) -> LibraryTree:

         tree_data = self._get_api_json(
             "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
         )

-        root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"])
-        tree_obj = DekiTree(root=root)
+        root = LibraryPage(
+            id=tree_data["page"]["@id"], title=tree_data["page"]["title"]
+        )
+        tree_obj = LibraryTree(root=root)
         tree_obj.pages[root.id] = root

-        def _add_page(page_node: Any, parent: DekiPage) -> DekiPage:
-            page = DekiPage(
+        def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
+            page = LibraryPage(
                 id=page_node["@id"], title=page_node["title"], parent=parent
             )
             parent.children.append(page)
             tree_obj.pages[page.id] = page
             return page

-        def _process_tree_data(page_node: Any, parent: DekiPage) -> None:
+        def _process_tree_data(page_node: Any, parent: LibraryPage) -> None:
             if not page_node["subpages"]:
                 return
             if "@id" in page_node["subpages"]["page"]:
diff --git a/scraper/src/libretexts2zim/entrypoint.py b/scraper/src/libretexts2zim/entrypoint.py
index 04b2ac5..80a6ca2 100644
--- a/scraper/src/libretexts2zim/entrypoint.py
+++ b/scraper/src/libretexts2zim/entrypoint.py
@@ -128,17 +128,18 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--page-title-include",
         help="Includes only pages with title matching the given regular "
-        "expression, and their parent pages for proper navigation. Can be combined"
-        " with --page-id-include (pages with matching title or id will be included"
-        ")",
+        "expression, and their parent pages for proper navigation, up to root (or "
+        "subroot if --root-page-id is set). Can be combined with --page-id-include "
+        "(pages with matching title or id will be included)",
         metavar="REGEX",
     )

     parser.add_argument(
         "--page-id-include",
-        help="CSV value of page ids to include. Parent pages will be included as "
-        "well for proper navigation. Can be combined with --page-title-include "
-        "(pages with matching title or id will be included)",
+        help="CSV of page ids to include. Parent pages will be included as "
+        "well for proper navigation, up to root (or subroot if --root-page-id is set). "
+        "Can be combined with --page-title-include (pages with matching title or id "
+        "will be included)",
     )

     parser.add_argument(
diff --git a/scraper/src/libretexts2zim/processor.py b/scraper/src/libretexts2zim/processor.py
index 51bacec..4862afb 100644
--- a/scraper/src/libretexts2zim/processor.py
+++ b/scraper/src/libretexts2zim/processor.py
@@ -14,8 +14,9 @@
 from zimscraperlib.zim.indexing import IndexData

 from libretexts2zim.client import (
-    DekiPage,
-    DekiTree,
+    LibraryPage,
+    LibraryPageId,
+    LibraryTree,
     LibreTextsClient,
     LibreTextsMetadata,
 )
@@ -53,7 +54,7 @@ def of(namespace: argparse.Namespace) -> "ContentFilter":
         """Parses a namespace to create a new DocFilter."""
         return ContentFilter.model_validate(namespace, from_attributes=True)

-    def filter(self, page_tree: DekiTree) -> list[DekiPage]:
+    def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
         """Filters pages based on the user's choices."""

         if self.root_page_id:
@@ -78,8 +79,8 @@ def filter(self, page_tree: DekiTree) -> list[DekiPage]:
         def is_selected(
             title_include_re: re.Pattern[str] | None,
             title_exclude_re: re.Pattern[str] | None,
-            id_include: list[str] | None,
-            page: DekiPage,
+            id_include: list[LibraryPageId] | None,
+            page: LibraryPage,
         ) -> bool:
             return (
                 (
diff --git a/scraper/tests-integration/test_client.py b/scraper/tests-integration/test_client.py
index b4f45f9..5217470 100644
--- a/scraper/tests-integration/test_client.py
+++ b/scraper/tests-integration/test_client.py
@@ -7,7 +7,12 @@
 )
 from zimscraperlib.image.probing import format_for

-from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome
+from libretexts2zim.client import (
+    LibraryPageId,
+    LibraryTree,
+    LibreTextsClient,
+    LibreTextsHome,
+)


 @pytest.fixture(scope="module")
@@ -31,7 +36,7 @@ def minimum_number_of_pages() -> int:


 @pytest.fixture(scope="module")
-def root_page_id() -> str:
+def root_page_id() -> LibraryPageId:
     return "34"


@@ -44,7 +49,7 @@ def nb_root_children() -> int:
 def page_tree(
     client: LibreTextsClient,
     deki_token: str,  # noqa: ARG001
-) -> DekiTree:
+) -> LibraryTree:
     return client.get_page_tree()


@@ -64,21 +69,21 @@ def test_get_all_pages_ids(

 def test_get_root_page_id(
     client: LibreTextsClient,
-    root_page_id: str,
+    root_page_id: LibraryPageId,
     deki_token: str,  # noqa: ARG001
 ):
     assert client.get_root_page_id() == root_page_id


 def test_get_page_tree_pages(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
     minimum_number_of_pages: int,
 ):
     assert len(page_tree.pages.keys()) > minimum_number_of_pages


 def test_get_page_tree_root(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
     root_page_id: str,
     nb_root_children: int,
 ):
@@ -90,7 +95,7 @@ def test_get_page_tree_root(


 def test_get_page_tree_subtree(
-    page_tree: DekiTree,
+    page_tree: LibraryTree,
 ):

     # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py
index 9953738..50e6c4f 100644
--- a/scraper/tests/test_processor.py
+++ b/scraper/tests/test_processor.py
@@ -1,37 +1,37 @@
 import pytest

-from libretexts2zim.client import DekiPage, DekiTree
+from libretexts2zim.client import LibraryPage, LibraryTree
 from libretexts2zim.processor import ContentFilter


 @pytest.fixture(scope="module")
-def deki_tree() -> DekiTree:
-    root = DekiPage(id="24", title="Home page")
-    topic1 = DekiPage(id="25", title="1: First topic", parent=root)
+def library_tree() -> LibraryTree:
+    root = LibraryPage(id="24", title="Home page")
+    topic1 = LibraryPage(id="25", title="1: First topic", parent=root)
     root.children.append(topic1)
-    topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1)
+    topic1_1 = LibraryPage(id="26", title="1.1: Cloud", parent=topic1)
     topic1.children.append(topic1_1)
-    topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1)
+    topic1_2 = LibraryPage(id="27", title="1.2: Tree", parent=topic1)
     topic1.children.append(topic1_2)
-    topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1)
+    topic1_3 = LibraryPage(id="28", title="1.3: Bees", parent=topic1)
     topic1.children.append(topic1_3)
-    topic2 = DekiPage(id="29", title="2: Second topic", parent=root)
+    topic2 = LibraryPage(id="29", title="2: Second topic", parent=root)
     root.children.append(topic2)
-    topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2)
+    topic2_1 = LibraryPage(id="30", title="2.1: Underground", parent=topic2)
     topic2.children.append(topic2_1)
-    topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2)
+    topic2_2 = LibraryPage(id="31", title="2.2: Lava", parent=topic2)
     topic2.children.append(topic2_2)
-    topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2)
+    topic2_3 = LibraryPage(id="32", title="2.3: Volcano", parent=topic2)
     topic2.children.append(topic2_3)
-    topic3 = DekiPage(id="33", title="3: Third topic", parent=root)
+    topic3 = LibraryPage(id="33", title="3: Third topic", parent=root)
     root.children.append(topic3)
-    topic3_1 = DekiPage(id="34", title="3.1: Ground", parent=topic3)
+    topic3_1 = LibraryPage(id="34", title="3.1: Ground", parent=topic3)
     topic3.children.append(topic3_1)
-    topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3)
+    topic3_2 = LibraryPage(id="35", title="3.2: Earth", parent=topic3)
     topic3.children.append(topic3_2)
-    topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3)
+    topic3_3 = LibraryPage(id="36", title="3.3: Sky", parent=topic3)
     topic3.children.append(topic3_3)
-    return DekiTree(
+    return LibraryTree(
         root=root,
         pages={
             root.id: root,
@@ -177,6 +177,6 @@ def deki_tree() -> DekiTree:
     ],
 )
 def test_content_filter(
-    content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree
+    content_filter: ContentFilter, expected_ids: list[str], library_tree: LibraryTree
 ):
-    assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids
+    assert [page.id for page in content_filter.filter(library_tree)] == expected_ids