Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get the whole page tree and process options to filter it #5

Merged
merged 2 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 88 additions & 3 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,56 @@
welcome_image_url: str


LibraryPageId = str


class LibraryPage(BaseModel):
    """Class holding information about a given library page on the library tree"""

    # unique page identifier, as returned by the website API ("@id" field)
    id: LibraryPageId
    # page title
    title: str
    # parent page on the tree; None for the root page
    parent: "LibraryPage | None" = None
    # direct children of this page on the tree
    # (mutable default is safe: pydantic copies defaults per instance)
    children: list["LibraryPage"] = []

    def __repr__(self) -> str:
        # Fixed: repr previously claimed `WikiPage`, which does not match the
        # class name and was misleading when debugging / reading logs.
        return (
            f"LibraryPage(id='{self.id}', title='{self.title}', "
            f"parent='{'None' if not self.parent else self.parent.id}', "
            f"children='{','.join([child.id for child in self.children])}')"
        )

    @property
    def self_and_parents(self) -> list["LibraryPage"]:
        """This page followed by all its ancestors, up to the tree root."""
        result: list[LibraryPage] = [self]
        current = self
        while current.parent is not None:
            result.append(current.parent)
            current = current.parent
        return result


class LibraryTree(BaseModel):
    """Class holding information about the tree of pages on a given library"""

    # root page of the library
    root: LibraryPage
    # all pages of the tree, indexed by page id (root included)
    pages: dict[LibraryPageId, LibraryPage] = {}

    def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
        """Returns a sub-tree, starting at given page id

        Raises:
            KeyError: if subroot_id is not a known page id of this tree.
        """
        new_root = self.pages[subroot_id]
        tree = LibraryTree(root=new_root)
        tree.pages[new_root.id] = new_root
        # Breadth-first walk of the descendants of the new root.
        children_to_explore = [*new_root.children]
        while children_to_explore:
            # pop(0) removes by position; the previous `[0]` + `remove(child)`
            # scanned the list twice and removed the first *equal* element,
            # which for value-compared pydantic models could in theory drop a
            # different-but-equal page.
            child = children_to_explore.pop(0)
            if child.id in tree.pages:
                continue  # safe-guard against duplicates / cycles
            tree.pages[child.id] = child
            children_to_explore.extend(child.children)
        return tree


class LibreTextsMetadata(BaseModel):
"""Metadata about a library."""

Expand Down Expand Up @@ -162,12 +212,12 @@
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

def get_all_pages_ids(self):
def get_all_pages_ids(self) -> list[LibraryPageId]:
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

page_ids: list[str] = []
page_ids: list[LibraryPageId] = []

Check warning on line 220 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L220

Added line #L220 was not covered by tests

def _get_page_ids(page_node: Any) -> None:
page_ids.append(page_node["@id"])
Expand All @@ -183,12 +233,47 @@

return page_ids

def get_root_page_id(self) -> str:
def get_root_page_id(self) -> LibraryPageId:
    """Returns the ID of the root of the tree of pages

    Fetches the `/pages/home/tree` API and reads the root node's "@id".
    """

    tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
    return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:
    """Fetch the whole library page tree and return it as a LibraryTree.

    Makes a single call to the `/pages/home/tree` API endpoint and converts
    the nested JSON into linked LibraryPage objects, all indexed by page id
    on the returned tree.
    """

    tree_data = self._get_api_json(
        "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
    )

    # The API wraps the root node in a "page" key; the root has no parent.
    root = LibraryPage(
        id=tree_data["page"]["@id"], title=tree_data["page"]["title"]
    )
    tree_obj = LibraryTree(root=root)
    tree_obj.pages[root.id] = root

    def _add_page(page_node: Any, parent: LibraryPage) -> LibraryPage:
        # Build one page from its JSON node, link it both ways
        # (parent <-> child) and register it in the tree's id index.
        page = LibraryPage(
            id=page_node["@id"], title=page_node["title"], parent=parent
        )
        parent.children.append(page)
        tree_obj.pages[page.id] = page
        return page

    def _process_tree_data(page_node: Any, parent: LibraryPage) -> None:
        # Recursively walk "subpages". The "@id" probe below suggests the API
        # returns a single dict when there is one subpage and a list when
        # there are several — presumably an XML-to-JSON conversion artifact;
        # TODO confirm against the API documentation.
        if not page_node["subpages"]:
            return
        if "@id" in page_node["subpages"]["page"]:
            page = _add_page(page_node["subpages"]["page"], parent=parent)
            _process_tree_data(page_node["subpages"]["page"], parent=page)
        else:
            for subpage_node in page_node["subpages"]["page"]:
                page = _add_page(subpage_node, parent=parent)
                _process_tree_data(subpage_node, parent=page)

    _process_tree_data(tree_data["page"], parent=root)

    return tree_obj


def _get_soup(content: str) -> BeautifulSoup:
"""Return a BeautifulSoup soup from textual content
Expand Down
27 changes: 22 additions & 5 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,34 @@
"""Adds flags related to content filtering to the given parser."""

parser.add_argument(
"--shelves-include",
help="Includes only shelves matching the given regular expression.",
"--page-title-include",
help="Includes only pages with title matching the given regular "
"expression, and their parent pages for proper navigation, up to root (or "
"subroot if --root-page-id is set). Can be combined with --page-id-include "
"(pages with matching title or id will be included)",
metavar="REGEX",
)

parser.add_argument(
"--shelves-exclude",
help="Excludes shelves matching the given regular expression.",
"--page-id-include",
help="CSV of page ids to include. Parent pages will be included as "
"well for proper navigation, up to root (or subroot if --root-page-id is set). "
"Can be combined with --page-title-include (pages with matching title or id "
"will be included)",
)

parser.add_argument(

Check warning on line 145 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L145

Added line #L145 was not covered by tests
"--page-title-exclude",
help="Excludes pages with title matching the given regular expression",
metavar="REGEX",
)

parser.add_argument(

Check warning on line 151 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L151

Added line #L151 was not covered by tests
"--root-page-id",
help="ID of the root page to include in ZIM. Only this page and its"
" subpages will be included in the ZIM",
)


def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -223,7 +240,7 @@
doc_filter = ContentFilter.of(args)

cache_folder = tmp_folder / "cache"
cache_folder.mkdir()
cache_folder.mkdir(exist_ok=True)

Check warning on line 243 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L243

Added line #L243 was not covered by tests

libretexts_client = LibreTextsClient(
library_slug=args.library_slug,
Expand Down
89 changes: 75 additions & 14 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import datetime
import re
from io import BytesIO
from pathlib import Path

Expand All @@ -12,7 +13,13 @@
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
from libretexts2zim.client import (
LibraryPage,
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.ui import ConfigModel, HomeModel, SharedModel
from libretexts2zim.zimconfig import ZimConfig
Expand All @@ -33,23 +40,69 @@
class ContentFilter(BaseModel):
    """Supports filtering documents by user provided attributes."""

    # If specified, only pages with title matching the regex are included.
    page_title_include: str | None
    # If specified, only pages with matching ids are included.
    page_id_include: str | None
    # If specified, pages with title matching the regex are excluded.
    page_title_exclude: str | None
    # If specified, only this page and its subpages will be included.
    root_page_id: str | None

    @staticmethod
    def of(namespace: argparse.Namespace) -> "ContentFilter":
        """Parses a namespace to create a new ContentFilter."""
        return ContentFilter.model_validate(namespace, from_attributes=True)

    def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
        """Filters pages based on the user's choices.

        Returns the selected pages plus all their ancestors (so navigation
        up to the (sub)root keeps working), deduplicated by page id.
        """

        if self.root_page_id:
            page_tree = page_tree.sub_tree(self.root_page_id)

        title_include_re = (
            re.compile(self.page_title_include, re.IGNORECASE)
            if self.page_title_include
            else None
        )
        title_exclude_re = (
            re.compile(self.page_title_exclude, re.IGNORECASE)
            if self.page_title_exclude
            else None
        )
        # set (not list) for O(1) membership tests below
        id_include = (
            {page_id.strip() for page_id in self.page_id_include.split(",")}
            if self.page_id_include
            else None
        )

        def is_selected(page: LibraryPage) -> bool:
            # Closes over the compiled patterns / id set above instead of
            # re-passing them as parameters on every call.
            return (
                (
                    not title_include_re
                    or title_include_re.search(page.title) is not None
                )
                and (not id_include or page.id in id_include)
                and (
                    not title_exclude_re or title_exclude_re.search(page.title) is None
                )
            )

        # Find selected pages and their parents, as a set of unique ids;
        # the `if` is placed before the inner loop so that parents of
        # unselected pages are never walked.
        selected_ids = {
            selected_page.id
            for page in page_tree.pages.values()
            if is_selected(page)
            for selected_page in page.self_and_parents
        }

        # Then transform set of ids into list of pages
        return [page for page in page_tree.pages.values() if page.id in selected_ids]


def add_item_for(
Expand Down Expand Up @@ -113,7 +166,7 @@
"""
self.libretexts_client = libretexts_client
self.zim_config = zim_config
self.doc_filter = content_filter
self.content_filter = content_filter

Check warning on line 169 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L169

Added line #L169 was not covered by tests
self.output_folder = output_folder
self.zimui_dist = zimui_dist
self.overwrite_existing_zim = overwrite_existing_zim
Expand Down Expand Up @@ -222,7 +275,7 @@
).model_dump_json(by_alias=True),
)

logger.info(f"Adding files in {self.zimui_dist}")
logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")

Check warning on line 278 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L278

Added line #L278 was not covered by tests
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
continue
Expand All @@ -247,4 +300,12 @@
is_front=False,
)

logger.info("Fetching pages tree")
pages_tree = self.libretexts_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(

Check warning on line 306 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L303-L306

Added lines #L303 - L306 were not covered by tests
f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
"fetched and pushed to the ZIM"
)

return zim_path
59 changes: 56 additions & 3 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
)
from zimscraperlib.image.probing import format_for

from libretexts2zim.client import LibreTextsClient, LibreTextsHome
from libretexts2zim.client import (
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsHome,
)


@pytest.fixture(scope="module")
Expand All @@ -31,10 +36,23 @@ def minimum_number_of_pages() -> int:


@pytest.fixture(scope="module")
def root_page_id() -> LibraryPageId:
    # Expected id of the root page of the live test library — presumably
    # stable for that library; update if the library ever changes.
    return "34"


@pytest.fixture(scope="module")
def nb_root_children() -> int:
    # Expected number of direct children of the root page, as observed on
    # the live test library — may need updating if its content changes.
    return 6


@pytest.fixture(scope="module")
def page_tree(
    client: LibreTextsClient,
    deki_token: str,  # noqa: ARG001
) -> LibraryTree:
    # deki_token is requested (but unused) so the token has been fetched
    # before retrieving the tree — presumably required by the client;
    # same pattern as the other tests in this module.
    return client.get_page_tree()


def test_get_deki_token(deki_token: str):
    """Ensures we manage to get a non-empty deki_token"""
    assert deki_token
Expand All @@ -51,12 +69,47 @@ def test_get_all_pages_ids(

def test_get_root_page_id(
    client: LibreTextsClient,
    root_page_id: LibraryPageId,
    deki_token: str,  # noqa: ARG001
):
    """Ensures the root page id matches the known, expected value"""
    assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
    page_tree: LibraryTree,
    minimum_number_of_pages: int,
):
    """Ensures the tree contains at least the expected number of pages"""
    # len() of the dict directly; no need to materialize .keys()
    assert len(page_tree.pages) > minimum_number_of_pages


def test_get_page_tree_root(
    page_tree: LibraryTree,
    root_page_id: str,
    nb_root_children: int,
):
    """Checks id, children count and titles of the tree root"""
    root = page_tree.root
    assert root.id == root_page_id
    assert len(root.children) == nb_root_children
    assert root.title
    assert all(child.title for child in root.children)


def test_get_page_tree_subtree(
    page_tree: LibraryTree,
):
    """Checks sub_tree() page counts on two known sub-trees"""

    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
    chapter_subtree = page_tree.sub_tree("28207")
    # 4 = "1. Understanding Science" + "1.1: What is Science?"
    # + "1.2: The Scientific Method" + "1.3: The Study of Geology"
    assert len(chapter_subtree.pages) == 4

    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
    course_subtree = page_tree.sub_tree("28196")
    # 94 is number retrieved in Oct. 2024, might change
    assert len(course_subtree.pages) == 94


def test_get_home_image_url(home: LibreTextsHome):
    """Ensures proper image url is retrieved"""
    expected_url = "https://cdn.libretexts.net/Logos/geo_full.png"
    assert home.welcome_image_url == expected_url
Expand Down
Loading