Skip to content

Commit

Permalink
Merge pull request #5 from openzim/add_filtering
Browse files Browse the repository at this point in the history
Get the whole page tree and process options to filter it
  • Loading branch information
benoit74 authored Oct 4, 2024
2 parents e7a408c + 35b5448 commit 18d81f3
Show file tree
Hide file tree
Showing 5 changed files with 423 additions and 25 deletions.
91 changes: 88 additions & 3 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,56 @@ class LibreTextsHome(BaseModel):
welcome_image_url: str


LibraryPageId = str


class LibraryPage(BaseModel):
    """Class holding information about a given library page on the library tree"""

    id: LibraryPageId
    title: str
    # Parent page in the tree, None when the page has no parent (e.g. the root)
    parent: "LibraryPage | None" = None
    # Direct sub-pages of this page. NOTE(review): a literal `[]` default is only
    # safe because pydantic copies field defaults per instance - confirm if this
    # class ever stops being a pydantic model.
    children: list["LibraryPage"] = []

    def __repr__(self) -> str:
        """Return a compact representation of the page

        Parent and children are rendered through their ids only, not through
        their own representation, so the output stays short and never walks
        the parent/children links.
        """
        parent_id = "None" if self.parent is None else self.parent.id
        children_ids = ",".join(child.id for child in self.children)
        return (
            f"{type(self).__name__}(id='{self.id}', title='{self.title}', "
            f"parent='{parent_id}', children='{children_ids}')"
        )

    @property
    def self_and_parents(self) -> list["LibraryPage"]:
        """Return this page followed by all its ancestors, closest parent first"""
        lineage: list[LibraryPage] = [self]
        ancestor = self.parent
        while ancestor is not None:
            lineage.append(ancestor)
            ancestor = ancestor.parent
        return lineage


class LibraryTree(BaseModel):
    """Class holding information about the tree of pages on a given library"""

    root: LibraryPage
    # Pages of the tree (root included), indexed by page id
    pages: dict[LibraryPageId, LibraryPage] = {}

    def sub_tree(self, subroot_id: LibraryPageId) -> "LibraryTree":
        """Returns a sub-tree, starting at given page id

        The sub-tree is made of the page with id `subroot_id` and all its
        descendants, registered in breadth-first order.

        Raises:
            KeyError: if `subroot_id` is not the id of a page of this tree.
        """
        # Local import: keeps this block self-contained whatever the module
        # already imports.
        from collections import deque

        new_root = self.pages[subroot_id]
        tree = LibraryTree(root=new_root)
        tree.pages[new_root.id] = new_root
        # A deque gives O(1) removal at the head of the FIFO; popping the head of
        # a list shifts every remaining item on each iteration.
        to_explore = deque(new_root.children)
        while to_explore:
            child = to_explore.popleft()
            if child.id in tree.pages:
                continue  # safe-guard: never register (and expand) a page twice
            tree.pages[child.id] = child
            to_explore.extend(child.children)
        return tree


class LibreTextsMetadata(BaseModel):
"""Metadata about a library."""

Expand Down Expand Up @@ -162,12 +212,12 @@ def get_deki_token(self) -> str:
self.deki_token = _get_deki_token_from_home(soup)
return self.deki_token

def get_all_pages_ids(self):
def get_all_pages_ids(self) -> list[LibraryPageId]:
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)

page_ids: list[str] = []
page_ids: list[LibraryPageId] = []

def _get_page_ids(page_node: Any) -> None:
page_ids.append(page_node["@id"])
Expand All @@ -183,12 +233,47 @@ def _get_page_ids(page_node: Any) -> None:

return page_ids

def get_root_page_id(self) -> str:
def get_root_page_id(self) -> LibraryPageId:
"""Returns the ID the root of the tree of pages"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:
    """Returns the tree of all pages on current website

    The tree is fetched from the "/pages/home/tree" API endpoint. Every node
    found becomes a LibraryPage linked to its parent and children, and is
    registered in the `pages` index of the returned LibraryTree.
    """

    tree_data = self._get_api_json(
        "/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
    )

    root_node = tree_data["page"]
    root = LibraryPage(id=root_node["@id"], title=root_node["title"])
    tree_obj = LibraryTree(root=root)
    tree_obj.pages[root.id] = root

    def _attach_subpages(page_node: Any, parent: LibraryPage) -> None:
        """Create pages for all descendants of `page_node`, below `parent`"""
        subpages = page_node["subpages"]
        if not subpages:
            return
        nodes = subpages["page"]
        # A lone sub-page is given as the node itself (it has an "@id" key),
        # several sub-pages are given as a list of nodes: always walk a list.
        if "@id" in nodes:
            nodes = [nodes]
        for node in nodes:
            page = LibraryPage(id=node["@id"], title=node["title"], parent=parent)
            parent.children.append(page)
            tree_obj.pages[page.id] = page
            _attach_subpages(node, parent=page)

    _attach_subpages(root_node, parent=root)

    return tree_obj


def _get_soup(content: str) -> BeautifulSoup:
"""Return a BeautifulSoup soup from textual content
Expand Down
27 changes: 22 additions & 5 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,34 @@ def add_content_filter_flags(parser: argparse.ArgumentParser):
"""Adds flags related to content filtering to the given parser."""

parser.add_argument(
"--shelves-include",
help="Includes only shelves matching the given regular expression.",
"--page-title-include",
help="Includes only pages with title matching the given regular "
"expression, and their parent pages for proper navigation, up to root (or "
"subroot if --root-page-id is set). Can be combined with --page-id-include "
"(pages with matching title or id will be included)",
metavar="REGEX",
)

parser.add_argument(
"--shelves-exclude",
help="Excludes shelves matching the given regular expression.",
"--page-id-include",
help="CSV of page ids to include. Parent pages will be included as "
"well for proper navigation, up to root (or subroot if --root-page-id is set). "
"Can be combined with --page-title-include (pages with matching title or id "
"will be included)",
)

parser.add_argument(
"--page-title-exclude",
help="Excludes pages with title matching the given regular expression",
metavar="REGEX",
)

parser.add_argument(
"--root-page-id",
help="ID of the root page to include in ZIM. Only this page and its"
" subpages will be included in the ZIM",
)


def main(tmpdir: str) -> None:
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -223,7 +240,7 @@ def main(tmpdir: str) -> None:
doc_filter = ContentFilter.of(args)

cache_folder = tmp_folder / "cache"
cache_folder.mkdir()
cache_folder.mkdir(exist_ok=True)

libretexts_client = LibreTextsClient(
library_slug=args.library_slug,
Expand Down
89 changes: 75 additions & 14 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import datetime
import re
from io import BytesIO
from pathlib import Path

Expand All @@ -12,7 +13,13 @@
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.indexing import IndexData

from libretexts2zim.client import LibreTextsClient, LibreTextsMetadata
from libretexts2zim.client import (
LibraryPage,
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.ui import ConfigModel, HomeModel, SharedModel
from libretexts2zim.zimconfig import ZimConfig
Expand All @@ -33,23 +40,69 @@ class MissingDocumentError(Exception):
class ContentFilter(BaseModel):
"""Supports filtering documents by user provided attributes."""

# If specified, only shelves matching the regex are included.
shelves_include: str | None
# If specified, shelves matching the regex are excluded.
shelves_exclude: str | None
# If specified, only pages with title matching the regex are included.
page_title_include: str | None
# If specified, only page with matching ids are included.
page_id_include: str | None
# If specified, page with title matching the regex are excluded.
page_title_exclude: str | None
# If specified, only this page and its subpages will be included.
root_page_id: str | None

@staticmethod
def of(namespace: argparse.Namespace) -> "ContentFilter":
"""Parses a namespace to create a new DocFilter."""
return ContentFilter.model_validate(namespace, from_attributes=True)

# TODO: implement filtering of shelves based on configured regex
# def filter(self, shelves: list[LibretextsShelve]) -> list[LibretextsShelve]:
# """Filters docs based on the user's choices."""
# selected: list[LibretextsShelve] = []
# for shelve in shelves:
# ....
# return selected
def filter(self, page_tree: LibraryTree) -> list[LibraryPage]:
    """Filters pages based on the user's choices.

    Rules applied, in this order:
    - when `root_page_id` is set, only the sub-tree starting at this page is
      considered;
    - a page whose title matches `page_title_exclude` is never selected;
    - when `page_title_include` and/or `page_id_include` are set, a page is
      selected when its title matches the regex OR its id is one of the listed
      ids; when none is set, every non-excluded page is selected;
    - all parents of a selected page are included as well (for navigation), even
      when they are not selected themselves, as long as they belong to the
      considered (sub-)tree.

    Title regular expressions are searched case-insensitively. `page_id_include`
    is a comma-separated list of ids; blanks around ids are ignored.

    Returns the retained pages, in the iteration order of the tree `pages`.
    """

    if self.root_page_id:
        page_tree = page_tree.sub_tree(self.root_page_id)

    title_include_re = (
        re.compile(self.page_title_include, re.IGNORECASE)
        if self.page_title_include
        else None
    )
    title_exclude_re = (
        re.compile(self.page_title_exclude, re.IGNORECASE)
        if self.page_title_exclude
        else None
    )
    # A set, since membership is tested once per page of the tree
    id_include = (
        {page_id.strip() for page_id in self.page_id_include.split(",")}
        if self.page_id_include
        else None
    )
    has_include_filter = title_include_re is not None or id_include is not None

    def is_selected(page: LibraryPage) -> bool:
        """Tell whether the page itself matches the user's choices"""
        if title_exclude_re is not None and title_exclude_re.search(page.title):
            return False
        if not has_include_filter:
            return True
        # Include filters are alternatives: matching the title OR the id is
        # enough, they must not be AND-ed.
        if title_include_re is not None and title_include_re.search(page.title):
            return True
        return id_include is not None and page.id in id_include

    # Find selected pages and their parents, and create a set of unique ids
    selected_ids = {
        kept_page.id
        for page in page_tree.pages.values()
        if is_selected(page)
        for kept_page in page.self_and_parents
    }

    # Then transform set of ids into list of pages; parents located above the
    # sub-tree root are dropped here since they are not in `page_tree.pages`
    return [page for page in page_tree.pages.values() if page.id in selected_ids]


def add_item_for(
Expand Down Expand Up @@ -113,7 +166,7 @@ def __init__(
"""
self.libretexts_client = libretexts_client
self.zim_config = zim_config
self.doc_filter = content_filter
self.content_filter = content_filter
self.output_folder = output_folder
self.zimui_dist = zimui_dist
self.overwrite_existing_zim = overwrite_existing_zim
Expand Down Expand Up @@ -222,7 +275,7 @@ def run(self) -> Path:
).model_dump_json(by_alias=True),
)

logger.info(f"Adding files in {self.zimui_dist}")
logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
continue
Expand All @@ -247,4 +300,12 @@ def run(self) -> Path:
is_front=False,
)

logger.info("Fetching pages tree")
pages_tree = self.libretexts_client.get_page_tree()
selected_pages = self.content_filter.filter(pages_tree)
logger.info(
f"{len(selected_pages)} pages (out of {len(pages_tree.pages)}) will be "
"fetched and pushed to the ZIM"
)

return zim_path
59 changes: 56 additions & 3 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
)
from zimscraperlib.image.probing import format_for

from libretexts2zim.client import LibreTextsClient, LibreTextsHome
from libretexts2zim.client import (
LibraryPageId,
LibraryTree,
LibreTextsClient,
LibreTextsHome,
)


@pytest.fixture(scope="module")
Expand All @@ -31,10 +36,23 @@ def minimum_number_of_pages() -> int:


@pytest.fixture(scope="module")
def root_page_id() -> str:
def root_page_id() -> LibraryPageId:
return "34"


@pytest.fixture(scope="module")
def nb_root_children() -> int:
    """Number of direct children expected below the root page of the tree"""
    return 6


@pytest.fixture(scope="module")
def page_tree(
    client: LibreTextsClient,
    deki_token: str,  # noqa: ARG001
) -> LibraryTree:
    """Page tree as returned by the client, built once for the whole module

    `deki_token` is requested but not used in the body (hence the noqa);
    presumably it ensures the token has been retrieved before the tree is
    fetched - TODO confirm against the `deki_token` fixture.
    """
    return client.get_page_tree()


def test_get_deki_token(deki_token: str):
"""Ensures we achieve to get a deki_token"""
assert deki_token
Expand All @@ -51,12 +69,47 @@ def test_get_all_pages_ids(

def test_get_root_page_id(
client: LibreTextsClient,
root_page_id: str,
root_page_id: LibraryPageId,
deki_token: str, # noqa: ARG001
):
assert client.get_root_page_id() == root_page_id


def test_get_page_tree_pages(
    page_tree: LibraryTree,
    minimum_number_of_pages: int,
):
    """Ensures the page tree holds more pages than the expected minimum"""
    nb_pages = len(page_tree.pages)
    assert nb_pages > minimum_number_of_pages


def test_get_page_tree_root(
    page_tree: LibraryTree,
    root_page_id: LibraryPageId,
    nb_root_children: int,
):
    """Ensures the tree root has the expected id and children, all with a title"""
    root = page_tree.root
    assert root.id == root_page_id
    assert len(root.children) == nb_root_children
    assert root.title
    for child in root.children:
        assert child.title


def test_get_page_tree_subtree(
    page_tree: LibraryTree,
):
    """Ensures sub-trees hold the expected number of pages"""

    # 28207 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College/01%3A_Understanding_Science
    chapter_subtree = page_tree.sub_tree("28207")
    # 4 = "1. Understanding Science" + "1.1: What is Science?"
    # + "1.2: The Scientific Method" + "1.3: The Study of Geology"
    assert len(chapter_subtree.pages) == 4

    # 28196 = https://geo.libretexts.org/Courses/Coastline_College/An_Introduction_To_Geology_-_Coastline_College
    course_subtree = page_tree.sub_tree("28196")
    # 94 is number retrieved in Oct. 2024, might change
    assert len(course_subtree.pages) == 94


def test_get_home_image_url(home: LibreTextsHome):
"""Ensures proper image url is retrieved"""
assert home.welcome_image_url == "https://cdn.libretexts.net/Logos/geo_full.png"
Expand Down
Loading

0 comments on commit 18d81f3

Please sign in to comment.