Skip to content

Commit

Permalink
Apply proper CSS for proper page display - step 1
Browse files Browse the repository at this point in the history
This first step takes care of CSS stylesheets which are in an external
file (two indeed, one for screen and one for print).

It does not handle inline CSS, which is also needed and will be handled in
step 2.
  • Loading branch information
benoit74 committed Oct 10, 2024
1 parent 733c35a commit 4749161
Show file tree
Hide file tree
Showing 10 changed files with 614 additions and 7 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ To achieve this, first build the Docker image based on current code base.
docker build -t local-libretexts2zim .
```

Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, but you could use any other one of interest for your UI developments).
Scrape a library (here we use the [Geosciences](https://geo.libretexts.org) library, restricted to page id 28207 and its children, but you could use any other library or page of interest for your UI developments).

```
docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --overwrite
docker run --rm -it -v "$PWD/output":/output local-libretexts2zim libretexts2zim --library-slug geo --library-name Geosciences --file-name-format "tests_en_libretexts-geo" --root-page-id 28207 --overwrite
```

Extract interesting ZIM content and move it to `public` folder.
Expand Down
1 change: 1 addition & 0 deletions scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"beautifulsoup4==4.12.3",
"types-beautifulsoup4==4.12.0.20240907",
"lxml==5.3.0",
"tinycss2==1.3.0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
34 changes: 33 additions & 1 deletion scraper/src/libretexts2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class LibreTextsParsingError(Exception):
class LibreTextsHome(BaseModel):
    """Data extracted from the home page of a library, as returned by `get_home`"""

    # paragraphs of the welcome text found on the home page
    welcome_text_paragraphs: list[str]
    # URL of the welcome image found on the home page
    welcome_image_url: str
    # URL of the stylesheet linked with media="screen" on the home page
    screen_css_url: str
    # URL of the stylesheet linked with media="print" on the home page
    print_css_url: str


LibraryPageId = str
Expand Down Expand Up @@ -206,6 +208,8 @@ def get_home(self) -> LibreTextsHome:
return LibreTextsHome(
welcome_text_paragraphs=_get_welcome_text_from_home(soup),
welcome_image_url=_get_welcome_image_url_from_home(soup),
screen_css_url=_get_screen_css_url_from_home(soup),
print_css_url=_get_print_css_url_from_home(soup),
)

def get_deki_token(self) -> str:
Expand Down Expand Up @@ -308,7 +312,7 @@ def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
if tree["body"][1]["@target"] != "toc":
raise LibreTextsParsingError(
f"Unexpected second body element of /pages/{page.id}/contents, "
f"@target property is '{tree["body"][1]["@target"]}' while only 'toc' "
f"@target property is '{tree['body'][1]['@target']}' while only 'toc' "
"is expected"
)
return LibraryPageContent(html_body=tree["body"][0])
Expand Down Expand Up @@ -373,3 +377,31 @@ def _get_deki_token_from_home(soup: BeautifulSoup) -> str:
"Failed to retrieve API token to query website API, missing apiToken."
)
return x_deki_token


def _get_any_css_url_from_home(soup: BeautifulSoup, media: str) -> str:
    """Returns the URL of the stylesheet declared for a given media on home page

    This function expects there is exactly one <link rel="stylesheet" /> tag with the
    requested media attribute in the page and returns the href of this tag. This is
    the case on libretexts.org as of October 2024, might be a bit fragile.

    Arguments:
      soup: parsed home page
      media: value of the media attribute to look for (e.g. "screen" or "print")

    Raises:
      LibreTextsParsingError: when zero or more than one matching link is found, or
        when the matching link has no (or an empty) href
    """
    links = soup.find_all("link", {"rel": "stylesheet", "media": media})
    if len(links) != 1:
        raise LibreTextsParsingError(
            f"Failed to find {media} CSS URL in home page, {len(links)} link(s) found"
        )
    css_url = links[0].get("href", None)
    if not css_url:
        # report the media which has really been requested: this helper is used for
        # both screen and print stylesheets
        raise LibreTextsParsingError(f"{media} CSS link has no href")
    return css_url

Check warning on line 397 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L396-L397

Added lines #L396 - L397 were not covered by tests


def _get_screen_css_url_from_home(soup: BeautifulSoup) -> str:
    """Returns the URL of the stylesheet declared for "screen" media on home page"""
    screen_css_url = _get_any_css_url_from_home(soup, media="screen")
    return screen_css_url

Check warning on line 402 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L402

Added line #L402 was not covered by tests


def _get_print_css_url_from_home(soup: BeautifulSoup) -> str:
    """Returns the URL of the stylesheet declared for "print" media on home page"""
    print_css_url = _get_any_css_url_from_home(soup, media="print")
    return print_css_url

Check warning on line 407 in scraper/src/libretexts2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/client.py#L407

Added line #L407 was not covered by tests
137 changes: 137 additions & 0 deletions scraper/src/libretexts2zim/css.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from collections.abc import Iterable
from pathlib import Path
from urllib.parse import urljoin, urlparse

from tinycss2 import ast, parse_stylesheet_bytes, serialize # pyright: ignore
from tinycss2.serializer import serialize_url # pyright: ignore

from libretexts2zim.utils import get_asset_path_from_url

# absolute online URL of an asset (CSS reference resolved against the CSS own URL)
OriginalUrl = str
# full path of an asset inside the ZIM (assets root folder + relative asset path)
FullZimPath = Path
# relative path: either relative to the CSS target folder (value written back in the
# CSS) or relative to the assets root folder (entries of `used_paths`)
RelativeCssPath = Path


class CssProcessor:
    """Utility to process CSS, extract assets and rewrite URLs

    This utility can process multiple CSS documents that will be stored in a ZIM.
    It extracts the list of assets (images, fonts) that are used in the CSS documents
    and computes appropriate ZIM paths for each of them.

    Arguments:
      css_target_path: "folder" where the CSS documents that will be processed will be
        stored in the ZIM
      css_assets_root_path: "folder" where the CSS assets referenced in the CSS
        documents will be stored in the ZIM; it must be located inside
        css_target_path since rewritten URLs are relative to css_target_path

    Attributes:
      css_assets: maps the online URL of every asset found so far to its full path
        inside the ZIM
      used_paths: asset paths (relative to css_assets_root_path) already assigned
    """

    def __init__(
        self,
        css_target_path: Path = Path("/content"),
        css_assets_root_path: Path = Path("/content/css_assets"),
    ) -> None:
        self.css_target_path = css_target_path
        self.css_assets_root_path = css_assets_root_path
        self.css_assets: dict[OriginalUrl, FullZimPath] = {}
        self.used_paths: list[RelativeCssPath] = []

    def process(self, css_original_url: str, css_content: bytes) -> str:
        """Rewrite CSS rules and update list of assets to fetch

        This function updates the CSS rules to target assets path inside the ZIM.
        It also updates the list of `css_assets` which is the list of online resources
        referenced inside the CSS and which should be fetched and stored inside the ZIM
        for proper CSS operation.

        Arguments:
          css_original_url: online URL of the CSS document, used to resolve relative
            URLs found in the document
          css_content: raw bytes of the CSS document

        Returns the rewritten CSS document, serialized as a string.
        """
        rules, _ = parse_stylesheet_bytes(  # pyright: ignore[reportUnknownVariableType]
            css_content
        )
        self._process_list(
            css_original_url,
            rules,  # pyright: ignore[reportUnknownArgumentType]
        )
        return serialize(rules)

    def _process_url(
        self, css_original_url: str, css_url: str
    ) -> RelativeCssPath | None:
        """Process a URL which has been found in CSS rules

        - Transforms the URL into a ZIM path
        - Updates the list of assets to retrieve

        Returns the path to write back in the CSS (relative to css_target_path), or
        None when the URL must be left untouched in the CSS document.
        """
        reference = css_url.strip()
        if not reference or reference.startswith("#"):
            # empty and fragment-only references (e.g. `url(#filter)`) resolve to the
            # stylesheet itself once joined; they are not assets to fetch
            return None
        original_url = urljoin(css_original_url, css_url)
        original_url_parsed = urlparse(original_url)
        if original_url_parsed.scheme.lower() not in ["http", "https"]:
            # e.g. `data:` URLs, nothing to fetch and nothing to rewrite
            return None
        if original_url in self.css_assets:
            # asset already known, reuse the path computed on first occurrence
            return self.css_assets[original_url].relative_to(self.css_target_path)
        relative_path = get_asset_path_from_url(original_url, self.used_paths)
        self.used_paths.append(relative_path)
        target_path = self.css_assets_root_path / relative_path
        self.css_assets[original_url] = target_path
        return target_path.relative_to(self.css_target_path)

    def _process_node(self, css_original_url: str, node: ast.Node):
        """Process one single CSS node

        Nodes holding a URL are rewritten in-place; container nodes are traversed
        recursively so that nested URLs are found as well.
        """
        if isinstance(
            node,
            ast.QualifiedRule
            | ast.SquareBracketsBlock
            | ast.ParenthesesBlock
            | ast.CurlyBracketsBlock,
        ):
            self._process_list(
                css_original_url,
                node.content,  # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType]
            )
        elif isinstance(node, ast.FunctionBlock):
            if node.lower_name == "url":  # pyright: ignore[reportUnknownMemberType]
                # quoted form, e.g. `url("foo.png")`; arguments may also contain
                # whitespace tokens (`url( "foo.png" )`) so we look for the string
                # token instead of blindly using the first argument
                url_node = next(
                    (
                        argument
                        for argument in node.arguments  # pyright: ignore
                        if isinstance(argument, ast.StringToken)
                    ),
                    None,
                )
                if url_node is None:
                    return
                relative_css_path = self._process_url(
                    css_original_url,
                    url_node.value,  # pyright: ignore
                )
                if not relative_css_path:
                    return
                url_node.value = str(relative_css_path)  # pyright: ignore
                url_node.representation = (  # pyright: ignore
                    f'"{serialize_url(str(relative_css_path))}"'
                )
            else:
                self._process_list(
                    css_original_url,
                    node.arguments,  # pyright: ignore
                )
        elif isinstance(node, ast.AtRule):
            self._process_list(
                css_original_url,
                node.prelude,  # pyright: ignore
            )
            self._process_list(
                css_original_url,
                node.content,  # pyright: ignore
            )
        elif isinstance(node, ast.Declaration):
            self._process_list(
                css_original_url,
                node.value,  # pyright: ignore
            )
        elif isinstance(node, ast.URLToken):
            # unquoted form, e.g. `url(foo.png)`
            relative_css_path = self._process_url(
                css_original_url,
                node.value,  # pyright: ignore
            )
            if not relative_css_path:
                return
            node.value = str(relative_css_path)
            node.representation = f"url({serialize_url(str(relative_css_path))})"

    def _process_list(self, css_original_url: str, nodes: Iterable[ast.Node] | None):
        """Process a list of CSS nodes (None and empty lists are silently ignored)"""
        if not nodes:
            return
        for node in nodes:
            self._process_node(css_original_url, node)
36 changes: 36 additions & 0 deletions scraper/src/libretexts2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path

from pydantic import BaseModel
from requests.exceptions import HTTPError
from zimscraperlib.download import (
stream_file, # pyright: ignore[reportUnknownVariableType]
)
Expand All @@ -21,6 +22,7 @@
LibreTextsMetadata,
)
from libretexts2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger
from libretexts2zim.css import CssProcessor
from libretexts2zim.ui import (
ConfigModel,
PageContentModel,
Expand Down Expand Up @@ -261,11 +263,45 @@ def run(self) -> Path:

logger.info(" Fetching and storing home page...")
home = self.libretexts_client.get_home()

welcome_image = BytesIO()
stream_file(home.welcome_image_url, byte_stream=welcome_image)
add_item_for(creator, "content/logo.png", content=welcome_image.getvalue())
del welcome_image

css_processor = CssProcessor()
screen_css = BytesIO()
stream_file(home.screen_css_url, byte_stream=screen_css)
result = css_processor.process(

Check warning on line 275 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L272-L275

Added lines #L272 - L275 were not covered by tests
css_original_url=home.screen_css_url, css_content=screen_css.getvalue()
)
add_item_for(creator, "content/screen.css", content=result)
del screen_css

Check warning on line 279 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L278-L279

Added lines #L278 - L279 were not covered by tests

print_css = BytesIO()
stream_file(home.print_css_url, byte_stream=print_css)
result = css_processor.process(

Check warning on line 283 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L281-L283

Added lines #L281 - L283 were not covered by tests
css_original_url=home.print_css_url, css_content=print_css.getvalue()
)
add_item_for(creator, "content/print.css", content=result)
del print_css

Check warning on line 287 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L286-L287

Added lines #L286 - L287 were not covered by tests

logger.info(f" Retrieving {len(css_processor.css_assets)} CSS assets...")

Check warning on line 289 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L289

Added line #L289 was not covered by tests
for asset_url, asset_path in css_processor.css_assets.items():
try:
css_asset = BytesIO()
stream_file(asset_url, byte_stream=css_asset)
add_item_for(

Check warning on line 294 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L291-L294

Added lines #L291 - L294 were not covered by tests
creator, str(asset_path)[1:], content=css_asset.getvalue()
)
logger.debug(f"Adding {asset_url} to {asset_path} in the ZIM")
del css_asset
except HTTPError as exc:

Check warning on line 299 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L297-L299

Added lines #L297 - L299 were not covered by tests
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path} due to {exc}")

Check warning on line 303 in scraper/src/libretexts2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/libretexts2zim/processor.py#L303

Added line #L303 was not covered by tests

logger.info(f"Adding Vue.JS UI files in {self.zimui_dist}")
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
Expand Down
30 changes: 30 additions & 0 deletions scraper/src/libretexts2zim/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pathlib import Path
from urllib.parse import urlparse


def get_asset_path_from_url(online_url: str, already_used_paths: list[Path]) -> Path:
    """Computes the path where an asset should be stored, based on its online URL

    This function tries to:
    - preserve the online path as much as possible
    - simplify the filename (e.g. dropping the querystring) to simplify the ZIM path

    When the computed path is already present in `already_used_paths`, a numeric
    suffix (`_1`, `_2`, ...) is inserted before the file extension until an unused
    path is found. `already_used_paths` is only read, never modified.
    """
    url_path = Path(urlparse(online_url).path)

    # rebuild the chain of folders from outermost to innermost, dropping the root
    # (which has an empty name) and any ".." segment
    folders = [
        ancestor.name
        for ancestor in list(url_path.parents)[::-1]
        if ancestor.name not in ("", "..")
    ]
    directory = Path(*folders)

    stem, extension = url_path.stem, url_path.suffix
    candidate = directory / f"{stem}{extension}"
    counter = 0
    while candidate in already_used_paths:
        counter += 1
        candidate = directory / f"{stem}_{counter}{extension}"
    return candidate
16 changes: 16 additions & 0 deletions scraper/tests-integration/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,19 @@ def test_get_home_welcome_text_paragraphs(
def test_get_home_page_content(client: LibreTextsClient, page_tree: LibraryTree):
    """Ensures the content of the root page can be retrieved and is not empty"""
    root_content = client.get_page_content(page_tree.root)
    assert root_content.html_body


def test_get_home_screen_css_url(home: LibreTextsHome):
    """Ensures the screen CSS URL found on home page is the expected one"""
    expected_url = "https://a.mtstatic.com/@cache/layout/anonymous.css?_=715eca8811db7abb8e6f0555936e020d_Z2VvLmxpYnJldGV4dHMub3Jn:site_4038"
    assert home.screen_css_url == expected_url


def test_get_home_print_css_url(home: LibreTextsHome):
    """Ensures the print CSS URL found on home page is the expected one"""
    expected_url = "https://a.mtstatic.com/@cache/layout/print.css?_=99d83fb44eaebe60981933ec554d138d:site_4038"
    assert home.print_css_url == expected_url
Loading

0 comments on commit 4749161

Please sign in to comment.