Skip to content

Commit

Permalink
fixup! Minimal scraper working from end-to-end
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 3, 2024
1 parent 4f4c5ba commit 46e6072
Show file tree
Hide file tree
Showing 19 changed files with 186 additions and 222 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/Tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,4 @@ jobs:
run: docker run -v $PWD/output:/output libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo"

- name: Run integration test suite
run: docker run -v $PWD/scraper/tests-integration:/src/scraper/tests-integration -v $PWD/output:/output libretexts2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration"
run: docker run -v $PWD/scraper/tests-integration:/src/scraper/tests-integration -v $PWD/output:/output -e ZIM_FILE_PATH=/output/tests_en_libretexts-geo.zim libretexts2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration"
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -312,5 +312,6 @@ pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/node,python


output
.vscode
output
tmp
7 changes: 0 additions & 7 deletions .vscode/extensions.json

This file was deleted.

22 changes: 0 additions & 22 deletions .vscode/settings.json

This file was deleted.

6 changes: 4 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ To simplify this, it is possible to:
- extract assets from generated files and place them in a directory where ZIM UI will find them
- iterate on ZIM UI code

This needs to be done every time you make significant changes to the scraper (Python code) that have an impact on files used by the Vue.JS UI.

To achieve this, first build the Docker image based on current code base.

```
Expand All @@ -31,7 +33,7 @@ docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim
Extract interesting ZIM content and move it to `public` folder.

```
find zimui/public/content -mindepth 1 -delete
rm -rf zimui/public/content
docker run -it --rm -v $(pwd)/output:/data ghcr.io/openzim/zim-tools:latest zimdump dump --dir=/data/tests_en_libretexts-geo /data/tests_en_libretexts-geo.zim
sudo chown -R $(id -u -n):$(id -g -n) output/tests_en_libretexts-geo
mkdir -p zimui/public/content
Expand All @@ -46,7 +48,7 @@ cd zimui
yarn dev
```

Do not forget to cleanup `public` folder before building the docker image again, otherwise all assets will be pushed to the ZIM.
Do not forget to clean up the `public/content` folder before building the Docker image again; otherwise, all assets will be pushed to the ZIM.

```
rm -rf zimui/public/content
Expand Down
3 changes: 2 additions & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "hatchling.build"
[project]
name = "libretexts2zim"
requires-python = ">=3.12,<3.13"
description = "Make ZIM file from LibreTexts courses"
description = "Make ZIM file from LibreTexts libraries"
readme = "../README.md"
dependencies = [
"yt-dlp", # youtube-dl should be updated as frequently as possible
Expand All @@ -19,6 +19,7 @@ dependencies = [
"schedule==1.2.2",
"beautifulsoup4==4.12.3",
"types-beautifulsoup4==4.12.0.20240907",
"lxml==5.3.0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
22 changes: 5 additions & 17 deletions scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,18 @@ class LibreTextsParsingError(Exception):
pass


class LibreTextsShelve(BaseModel):
title: str
content_url: str
image_url: str


class LibreTextsHome(BaseModel):
welcome_text_paragraphs: list[str]
shelves: list[LibreTextsShelve]
welcome_image_url: str


class LibreTextsMetadata(BaseModel):
"""Metadata about a course."""
"""Metadata about a library."""

# Human readable name for the course.
# Human readable name for the library.
name: str

# URL prefix for the course, e.g. for Geosciences which is at
# URL prefix for the library, e.g. for Geosciences which is at
# https://geo.libretexts.org/, the slug is `geo`
slug: str

Expand Down Expand Up @@ -91,12 +84,11 @@ def get_home(self) -> LibreTextsHome:
return LibreTextsHome(

Check warning on line 84 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L83-L84

Added lines #L83 - L84 were not covered by tests
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
shelves=[],
)


def _get_soup(content: str) -> BeautifulSoup:
return BeautifulSoup(content, "html.parser")
return BeautifulSoup(content, "lxml")


def _get_welcome_image_url_from_home(soup: BeautifulSoup) -> str:
Expand Down Expand Up @@ -125,11 +117,7 @@ def _get_welcome_text_from_home(soup: BeautifulSoup) -> list[str]:
"<section> with class 'mt-content-container' not found"
)
welcome_text: list[str] = []
for paragraph in content_section.find_all("p"):
if paragraph.find("div", class_="mt-category-container"):
# once we found a mt-category-container div, we are not in the welcome text
# anymore
break
for paragraph in content_section.find_all("p", recursive=False):
if paragraph_text := paragraph.text:
welcome_text.append(paragraph_text)
return welcome_text
2 changes: 1 addition & 1 deletion scraper/src/libretexts2zim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
VERSION = __version__
ROOT_DIR = pathlib.Path(__file__).parent

# As of 2024-09-24, all courses appears to be in English.
# As of 2024-09-24, all libraries appear to be in English.
LANGUAGE_ISO_639_3 = "eng"

logger = getLogger(NAME, level=logging.DEBUG)
170 changes: 140 additions & 30 deletions scraper/src/libretexts2zim/entrypoint.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
import argparse
import logging
import os
from pathlib import Path

from zimscraperlib.constants import (
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
RECOMMENDED_MAX_TITLE_LENGTH,
)

from libretexts2zim.client import LibreTextsClient
from libretexts2zim.constants import (
NAME,
VERSION,
logger,
)
from libretexts2zim.generator import ContentFilter, Generator
from libretexts2zim.processor import ContentFilter, Processor
from libretexts2zim.zimconfig import ZimConfig


Expand All @@ -28,13 +35,111 @@ def zim_defaults() -> ZimConfig:
)


def main() -> None:
parser = argparse.ArgumentParser(
prog=NAME,
def add_zim_config_flags(
    parser: argparse.ArgumentParser, defaults: "ZimConfig"
) -> None:
    """Add the flags related to ZIM configuration to ``parser``.

    ``--library-name`` is required. Every other flag is optional and falls
    back to the matching attribute of ``defaults``; that fallback value is
    also echoed in the flag's help text.

    Args:
        parser: Argument parser that receives the flags.
        defaults: Object providing the default value of each optional flag
            (``creator``, ``publisher``, ``file_name_format``, ``name_format``,
            ``title_format``, ``description_format``,
            ``long_description_format``, ``tags`` and ``secondary_color``).
    """

    parser.add_argument(
        "--library-name",
        help="Display name for the library, e.g. Geosciences",
        required=True,
    )

    parser.add_argument(
        "--creator",
        help=f"Name of content creator. Default: {defaults.creator!r}",
        default=defaults.creator,
    )

    parser.add_argument(
        "--publisher",
        help=f"Custom publisher name. Default: {defaults.publisher!r}",
        default=defaults.publisher,
    )

    parser.add_argument(
        "--file-name-format",
        help="Custom file name format for individual ZIMs. "
        f"Default: {defaults.file_name_format!r}",
        default=defaults.file_name_format,
        metavar="FORMAT",
    )

    parser.add_argument(
        "--name-format",
        help="Custom name format for individual ZIMs. "
        f"Default: {defaults.name_format!r}",
        default=defaults.name_format,
        metavar="FORMAT",
    )

    parser.add_argument(
        "--title-format",
        help="Custom title format for individual ZIMs. Final value must not be "
        f"longer than {RECOMMENDED_MAX_TITLE_LENGTH} chars. "
        f"Default: {defaults.title_format!r}",
        default=defaults.title_format,
        metavar="FORMAT",
    )

    parser.add_argument(
        "--description-format",
        help="Custom description format for individual ZIMs. Final value must not "
        f"be longer than {MAXIMUM_DESCRIPTION_METADATA_LENGTH} chars. "
        # The help must advertise the description default (the value actually
        # passed as `default=` below), not the title default.
        f"Default: {defaults.description_format!r}",
        default=defaults.description_format,
        metavar="FORMAT",
    )

    parser.add_argument(
        "--long-description-format",
        help="Custom long description format for your ZIM. Final value must not be "
        f"longer than {MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH} chars. "
        f"Default: {defaults.long_description_format!r}",
        default=defaults.long_description_format,
        metavar="FORMAT",
    )

    # Due to https://github.com/python/cpython/issues/60603 defaulting an array in
    # argparse doesn't work so we expose the underlying semicolon delimited string.
    parser.add_argument(
        "--tags",
        # Trailing space after "ZIM." keeps the concatenated sentences separated.
        help="A semicolon (;) delimited list of tags to add to the ZIM. "
        "Formatting is supported. "
        f"Default: {defaults.tags!r}",
        default=defaults.tags,
    )

    parser.add_argument(
        "--secondary-color",
        help="Secondary (background) color of ZIM UI. Default: "
        f"{defaults.secondary_color!r}",
        default=defaults.secondary_color,
    )


def add_content_filter_flags(parser: argparse.ArgumentParser):
    """Register the shelf include/exclude regular-expression flags on ``parser``.

    Both flags are optional, take a single ``REGEX`` value and have no default.
    """
    # (flag, help text) pairs, registered in this order.
    shelf_flags = (
        (
            "--shelves-include",
            "Includes only shelves matching the given regular expression.",
        ),
        (
            "--shelves-exclude",
            "Excludes shelves matching the given regular expression.",
        ),
    )

    for flag, description in shelf_flags:
        parser.add_argument(flag, help=description, metavar="REGEX")


def main() -> None:
parser = argparse.ArgumentParser(

Check warning on line 141 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L141

Added line #L141 was not covered by tests
prog=NAME,
)

parser.add_argument(

Check warning on line 145 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L145

Added line #L145 was not covered by tests
Expand All @@ -44,39 +149,41 @@ def main() -> None:
version=VERSION,
)

# Client configuration flags
parser.add_argument(

Check warning on line 153 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L153

Added line #L153 was not covered by tests
"--library-slug",
help="URL prefix for the library, e.g. for Geosciences which is at "
"https://geo.libretexts.org/, the slug is `geo`",
required=True,
)

# ZIM configuration flags
add_zim_config_flags(parser, zim_defaults())

Check warning on line 161 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L161

Added line #L161 was not covered by tests

# Document selection flags
add_content_filter_flags(parser)

Check warning on line 164 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L164

Added line #L164 was not covered by tests

parser.add_argument(

Check warning on line 166 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L166

Added line #L166 was not covered by tests
"--output",
help="Output folder for ZIMs. Default: /output",
default="/output",
dest="output_folder",
)

parser.add_argument(

Check warning on line 173 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L173

Added line #L173 was not covered by tests
"--debug", help="Enable verbose output", action="store_true", default=False
)

parser.add_argument(

Check warning on line 177 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L177

Added line #L177 was not covered by tests
"--zimui-dist",
type=str,
help=(
"Directory containing Vite build output from the Zim UI Vue.JS application"
"Dev option to customize directory containing Vite build output from the "
"ZIM UI Vue.JS application"
),
default=os.getenv("LIBRETEXTS_ZIMUI_DIST", "../zimui/dist"),
)

# ZIM configuration flags
ZimConfig.add_flags(
parser,
zim_defaults(),
)

# Document selection flags
ContentFilter.add_flags(parser)

# Client configuration flags
parser.add_argument(
"--library-slug",
help="URL prefix for the course, e.g. for Geosciences which is at "
"https://geo.libretexts.org/, the slug is `geo`",
required=True,
)

args = parser.parse_args()

Check warning on line 187 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L187

Added line #L187 was not covered by tests

logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)

Check warning on line 189 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L189

Added line #L189 was not covered by tests
Expand All @@ -88,17 +195,20 @@ def main() -> None:
library_slug=args.library_slug,
)

Generator(
Processor(

Check warning on line 198 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L198

Added line #L198 was not covered by tests
libretexts_client=libretexts_client,
zim_config=zim_config,
output_folder=args.output_folder,
zimui_dist=args.zimui_dist,
output_folder=Path(args.output_folder),
zimui_dist=Path(args.zimui_dist),
content_filter=doc_filter,
).run()
except Exception as e:
logger.exception(e)
logger.error(f"Generation failed with the following error: {e}")
raise SystemExit(1) from e
except SystemExit:
logger.error("Generation failed, exiting")
raise
except Exception as exc:
logger.exception(exc)
logger.error(f"Generation failed with the following error: {exc}")
raise SystemExit(1) from exc

Check warning on line 211 in scraper/src/libretexts2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/entrypoint.py#L206-L211

Added lines #L206 - L211 were not covered by tests


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 46e6072

Please sign in to comment.