Skip to content

Commit

Permalink
fixup! Add ability to retrieve and filter the whole page tree
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 3, 2024
1 parent 0120c2b commit 35b5448
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 57 deletions.
51 changes: 30 additions & 21 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,37 +24,44 @@ class LibreTextsHome(BaseModel):
welcome_image_url: str


class DekiPage(BaseModel):
id: str
LibraryPageId = str


class LibraryPage(BaseModel):
"""Class holding information about a given library page on the library tree"""

id: LibraryPageId
title: str
parent: "DekiPage | None" = None
children: list["DekiPage"] = []
parent: "LibraryPage | None" = None
children: list["LibraryPage"] = []

def __repr__(self) -> str:
return (

Check warning on line 39 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L39

Added line #L39 was not covered by tests
f"DekiPage(id='{self.id}', title='{self.title}', "
f"WikiPage(id='{self.id}', title='{self.title}', "
f"parent='{'None' if not self.parent else self.parent.id}', "
f"children='{','.join([child.id for child in self.children])}')"
)

@property
def self_and_parents(self) -> list["DekiPage"]:
result: list[DekiPage] = [self]
def self_and_parents(self) -> list["LibraryPage"]:
result: list[LibraryPage] = [self]
current = self
while current.parent is not None:
result.append(current.parent)
current = current.parent
return result


class DekiTree(BaseModel):
root: DekiPage
pages: dict[str, DekiPage] = {}
class LibraryTree(BaseModel):
"""Class holding information about the tree of pages on a given library"""

def sub_tree(self, subroot_id: str) -> "DekiTree":
root: LibraryPage
pages: dict[LibraryPageId, LibraryPage] = {}

def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
"""Returns a sub-tree, starting at the given page id"""
new_root = self.pages[subroot_id]
tree = DekiTree(root=new_root)
tree = LibraryTree(root=new_root)
tree.pages[new_root.id] = new_root
children_to_explore = [*new_root.children]
while len(children_to_explore) > 0:
Expand Down Expand Up @@ -205,12 +212,12 @@ def get_deki_token(self) -> str:
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

def get_all_pages_ids(self):
def get_all_pages_ids(self) -> list[LibraryPageId]:
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

page_ids: list[str] = []
page_ids: list[LibraryPageId] = []

Check warning on line 220 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L220

Added line #L220 was not covered by tests

def _get_page_ids(page_node: Any) -> None:
page_ids.append(page_node["@id"])
Expand All @@ -226,31 +233,33 @@ def _get_page_ids(page_node: Any) -> None:

return page_ids

def get_root_page_id(self) -> str:
def get_root_page_id(self) -> LibraryPageId:
"""Returns the ID of the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
return tree["page"]["@id"]

def get_page_tree(self) -> DekiTree:
def get_page_tree(self) -> LibraryTree:

tree_data = self._get_api_json(

Check warning on line 244 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L244

Added line #L244 was not covered by tests
"/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
)

root = DekiPage(id=tree_data["page"]["@id"], title=tree_data["page"]["title"])
tree_obj = DekiTree(root=root)
root = LibraryPage(

Check warning on line 248 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L248

Added line #L248 was not covered by tests
id=tree_data["page"]["@id"], title=tree_data["page"]["title"]
)
tree_obj = LibraryTree(root=root)
tree_obj.pages[root.id] = root

Check warning on line 252 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L251-L252

Added lines #L251 - L252 were not covered by tests

def _add_page(page_node: Any, parent: DekiPage) -> DekiPage:
page = DekiPage(
def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
page = LibraryPage(

Check warning on line 255 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L254-L255

Added lines #L254 - L255 were not covered by tests
id=page_node["@id"], title=page_node["title"], parent=parent
)
parent.children.append(page)
tree_obj.pages[page.id] = page
return page

Check warning on line 260 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L258-L260

Added lines #L258 - L260 were not covered by tests

def _process_tree_data(page_node: Any, parent: DekiPage) -> None:
def _process_tree_data(page_node: Any, parent: LibraryPage) -> None:

Check warning on line 262 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L262

Added line #L262 was not covered by tests
if not page_node["subpages"]:
return

Check warning on line 264 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L264

Added line #L264 was not covered by tests
if "@id" in page_node["subpages"]["page"]:
Expand Down
13 changes: 7 additions & 6 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,18 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
parser.add_argument(
"--page-title-include",
help="Includes only pages with title matching the given regular "
"expression, and their parent pages for proper navigation. Can be combined"
" with --page-id-include (pages with matching title or id will be included"
")",
"expression, and their parent pages for proper navigation, up to root (or "
"subroot if --root-page-id is set). Can be combined with --page-id-include "
"(pages with matching title or id will be included)",
metavar="REGEX",
)

parser.add_argument(
"--page-id-include",
help="CSV value of page ids to include. Parent pages will be included as "
"well for proper navigation. Can be combined with --page-title-include "
"(pages with matching title or id will be included)",
help="CSV of page ids to include. Parent pages will be included as "
"well for proper navigation, up to root (or subroot if --root-page-id is set). "
"Can be combined with --page-title-include (pages with matching title or id "
"will be included)",
)

parser.add_argument(

Check warning on line 145 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L145

Added line #L145 was not covered by tests
Expand Down
11 changes: 6 additions & 5 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import (
DekiPage,
DekiTree,
LibraryPage,
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsMetadata,
)
Expand Down Expand Up @@ -53,7 +54,7 @@ def of(namespace: argparse.Namespace) -> "ContentFilter":
"""Parses a namespace to create a new ContentFilter."""
return ContentFilter.model_validate(namespace, from_attributes=True)

def filter(self, page_tree: DekiTree) -> list[DekiPage]:
def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
"""Filters pages based on the user's choices."""

if self.root_page_id:
Expand All @@ -78,8 +79,8 @@ def filter(self, page_tree: DekiTree) -> list[DekiPage]:
def is_selected(
title_include_re: re.Pattern[str] | None,
title_exclude_re: re.Pattern[str] | None,
id_include: list[str] | None,
page: DekiPage,
id_include: list[LibraryPageId] | None,
page: LibraryPage,
) -> bool:
return (
(
Expand Down
19 changes: 12 additions & 7 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
)
from zimscraperlib.image.probing import format_for

from libretexts2zim.client import DekiTree, LibreTextsClient, LibreTextsHome
from libretexts2zim.client import (
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsHome,
)


@pytest.fixture(scope="module")
Expand All @@ -31,7 +36,7 @@ def minimum_number_of_pages() -> int:


@pytest.fixture(scope="module")
def root_page_id() -> str:
def root_page_id() -> LibraryPageId:
return "34"


Expand All @@ -44,7 +49,7 @@ def nb_root_children() -> int:
def page_tree(
client: LibreTextsClient,
deki_token: str, # noqa: ARG001
) -> DekiTree:
) -> LibraryTree:
return client.get_page_tree()


Expand All @@ -64,21 +69,21 @@ def test_get_all_pages_ids(

def test_get_root_page_id(
client: LibreTextsClient,
root_page_id: str,
root_page_id: LibraryPageId,
deki_token: str, # noqa: ARG001
):
assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
page_tree: DekiTree,
page_tree: LibraryTree,
minimum_number_of_pages: int,
):
assert len(page_tree.pages.keys()) > minimum_number_of_pages


def test_get_page_tree_root(
page_tree: DekiTree,
page_tree: LibraryTree,
root_page_id: str,
nb_root_children: int,
):
Expand All @@ -90,7 +95,7 @@ def test_get_page_tree_root(


def test_get_page_tree_subtree(
page_tree: DekiTree,
page_tree: LibraryTree,
):

# 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
Expand Down
36 changes: 18 additions & 18 deletions scraper/tests/test_processor.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
import pytest

from libretexts2zim.client import DekiPage, DekiTree
from libretexts2zim.client import LibraryPage, LibraryTree
from libretexts2zim.processor import ContentFilter


@pytest.fixture(scope="module")
def deki_tree() -> DekiTree:
root = DekiPage(id="24", title="Home page")
topic1 = DekiPage(id="25", title="1: First topic", parent=root)
def library_tree() -> LibraryTree:
root = LibraryPage(id="24", title="Home page")
topic1 = LibraryPage(id="25", title="1: First topic", parent=root)
root.children.append(topic1)
topic1_1 = DekiPage(id="26", title="1.1: Cloud", parent=topic1)
topic1_1 = LibraryPage(id="26", title="1.1: Cloud", parent=topic1)
topic1.children.append(topic1_1)
topic1_2 = DekiPage(id="27", title="1.2: Tree", parent=topic1)
topic1_2 = LibraryPage(id="27", title="1.2: Tree", parent=topic1)
topic1.children.append(topic1_2)
topic1_3 = DekiPage(id="28", title="1.3: Bees", parent=topic1)
topic1_3 = LibraryPage(id="28", title="1.3: Bees", parent=topic1)
topic1.children.append(topic1_3)
topic2 = DekiPage(id="29", title="2: Second topic", parent=root)
topic2 = LibraryPage(id="29", title="2: Second topic", parent=root)
root.children.append(topic2)
topic2_1 = DekiPage(id="30", title="2.1: Underground", parent=topic2)
topic2_1 = LibraryPage(id="30", title="2.1: Underground", parent=topic2)
topic2.children.append(topic2_1)
topic2_2 = DekiPage(id="31", title="2.2: Lava", parent=topic2)
topic2_2 = LibraryPage(id="31", title="2.2: Lava", parent=topic2)
topic2.children.append(topic2_2)
topic2_3 = DekiPage(id="32", title="2.3: Volcano", parent=topic2)
topic2_3 = LibraryPage(id="32", title="2.3: Volcano", parent=topic2)
topic2.children.append(topic2_3)
topic3 = DekiPage(id="33", title="3: Third topic", parent=root)
topic3 = LibraryPage(id="33", title="3: Third topic", parent=root)
root.children.append(topic3)
topic3_1 = DekiPage(id="34", title="3.1: Ground", parent=topic3)
topic3_1 = LibraryPage(id="34", title="3.1: Ground", parent=topic3)
topic3.children.append(topic3_1)
topic3_2 = DekiPage(id="35", title="3.2: Earth", parent=topic3)
topic3_2 = LibraryPage(id="35", title="3.2: Earth", parent=topic3)
topic3.children.append(topic3_2)
topic3_3 = DekiPage(id="36", title="3.3: Sky", parent=topic3)
topic3_3 = LibraryPage(id="36", title="3.3: Sky", parent=topic3)
topic3.children.append(topic3_3)
return DekiTree(
return LibraryTree(
root=root,
pages={
root.id: root,
Expand Down Expand Up @@ -177,6 +177,6 @@ def deki_tree() -> DekiTree:
],
)
def test_content_filter(
content_filter: ContentFilter, expected_ids: list[str], deki_tree: DekiTree
content_filter: ContentFilter, expected_ids: list[str], library_tree: LibraryTree
):
assert [page.id for page in content_filter.filter(deki_tree)] == expected_ids
assert [page.id for page in content_filter.filter(library_tree)] == expected_ids

0 comments on commit 35b5448

Please sign in to comment.