diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 89ff4e6..cbe097b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,8 +23,14 @@ jobs: pip install poetry poetry install + - name: Lint check + run: poetry run ruff check + - name: Check formatting run: poetry run black --check . - - name: Test with pytest + - name: Check types + run: poetry run mypy . + + - name: Run tests run: poetry run pytest -v diff --git a/marctable/__init__.py b/marctable/__init__.py index 94144da..483a929 100644 --- a/marctable/__init__.py +++ b/marctable/__init__.py @@ -1,4 +1,6 @@ from collections.abc import Callable +from io import IOBase +from typing import BinaryIO, TextIO import click @@ -38,7 +40,7 @@ def rule_params(f: Callable) -> Callable: @cli.command() @io_params @rule_params -def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None: """ Convert MARC to CSV. """ @@ -48,7 +50,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non @cli.command() @io_params @rule_params -def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def parquet(infile: BinaryIO, outfile: IOBase, rules: list, batch: int) -> None: """ Convert MARC to Parquet. 
""" @@ -58,7 +60,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> @cli.command() @io_params @rule_params -def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None: """ Convert MARC to JSON Lines (JSONL) """ @@ -67,9 +69,10 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N @cli.command() @click.argument("outfile", type=click.File("w"), default="-") -def avram(outfile: click.File) -> None: +def avram(outfile: TextIO) -> None: """ - Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website. + Generate Avram (YAML) from scraping the Library of Congress MARC + bibliographic web. """ marctable.marc.crawl(outfile=outfile) diff --git a/marctable/marc.py b/marctable/marc.py index 42fd48b..e5c9c00 100644 --- a/marctable/marc.py +++ b/marctable/marc.py @@ -14,11 +14,11 @@ import re import sys from functools import cache -from typing import IO, Generator +from typing import IO, Generator, List, Optional, Type from urllib.parse import urljoin import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag class Subfield: @@ -28,8 +28,8 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None: self.repeatable = repeatable @classmethod - def from_dict(_, d: dict): - return Subfield(d.get("code"), d.get("label"), d.get("repeatable")) + def from_dict(cls: Type["Subfield"], d: dict) -> "Subfield": + return Subfield(d["code"], d["label"], d["repeatable"]) def to_dict(self) -> dict: return {"code": self.code, "label": self.label, "repeatable": self.repeatable} @@ -37,7 +37,12 @@ def to_dict(self) -> dict: class Field: def __init__( - self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None + self, + tag: str, + label: str, + subfields: list[Subfield], + repeatable: bool = False, + url: Optional[str] = None, ) -> 
None: self.tag = tag self.label = label @@ -47,7 +52,7 @@ def __init__( def __str__(self) -> str: if len(self.subfields) > 0: - subfields = ": " + (",".join(self.subfields.keys())) + subfields = ": " + (",".join([sf.code for sf in self.subfields])) else: subfields = "" return ( @@ -55,63 +60,58 @@ def __str__(self) -> str: ) @classmethod - def from_dict(klass, d: dict): + def from_dict(cls: Type["Field"], d: dict) -> "Field": return Field( - tag=d.get("tag"), - label=d.get("label"), - repeatable=d.get("repeatable"), + tag=d["tag"], + label=d["label"], + repeatable=d["repeatable"], url=d.get("url"), subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()], ) def to_dict(self) -> dict: - return { + d = { "tag": self.tag, "label": self.label, "repeatable": self.repeatable, "url": self.url, - "subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()}, } - def to_avram(self) -> dict: - d = self.to_dict() - if len(d["subfields"]) == 0: - del d["subfields"] + if self.subfields is not None: + d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields} + return d def get_subfield(self, code: str) -> Subfield: for sf in self.subfields: if sf.code == code: return sf - return None + raise SchemaSubfieldError(f"{code} is not a valid subfield in field {self.tag}") class MARC: def __init__(self) -> None: - self.fields = [] + self.fields: List[Field] = [] @cache def get_field(self, tag: str) -> Field: for field in self.fields: if field.tag == tag: return field - return None + raise SchemaFieldError(f"{tag} is not a defined field tag in Avram schema") @cache def get_subfield(self, tag: str, code: str) -> Subfield: field = self.get_field(tag) - if field: - return field.get_subfield(code) - else: - return None + return field.get_subfield(code) @property - def avram_file(self): + def avram_file(self) -> pathlib.Path: return pathlib.Path(__file__).parent / "marc.json" @classmethod @cache - def from_avram(cls, avram_file: IO = None) -> dict: + def 
from_avram(cls: Type["MARC"], avram_file: Optional[IO] = None) -> "MARC": marc = MARC() if avram_file is None: @@ -122,7 +122,7 @@ def from_avram(cls, avram_file: IO = None) -> dict: return marc - def write_avram(self, avram_file: IO = None) -> None: + def to_avram(self, avram_file: Optional[IO] = None) -> None: if avram_file is None: avram_file = self.avram_file.open("w") @@ -131,11 +131,19 @@ def write_avram(self, avram_file: IO = None) -> None: "url": "https://www.loc.gov/marc/bibliographic/", "family": "marc", "language": "en", - "fields": {f.tag: f.to_avram() for f in self.fields}, + "fields": {f.tag: f.to_dict() for f in self.fields}, } json.dump(d, avram_file, indent=2) +class SchemaFieldError(Exception): + pass + + +class SchemaSubfieldError(Exception): + pass + + def fields() -> Generator[Field, None, None]: toc_url = "https://www.loc.gov/marc/bibliographic/" toc_doc = _soup(toc_url) @@ -150,19 +158,21 @@ def fields() -> Generator[Field, None, None]: yield field -def make_field(url: str) -> Field: +def make_field(url: str) -> Optional[Field]: soup = _soup(url) - h1 = soup.select_one("h1", first=True).text.strip() - if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1): + h1: Optional[Tag] = soup.select_one("h1") + if h1 is None: + raise Exception(f"Expecting h1 element in {url}") + + h1_text: str = h1.text.strip() + if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text): tag, label, repeatable = m1.groups() # most pages put the subfield info in a list - subfields = {} + subfields = [] for el in soup.select("table.subfields li"): if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text): - subfields[m2.group(1)] = Subfield( - m2.group(1), m2.group(2), m2.group(3) == "R" - ) + subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R")) # some pages use a different layout, of course if len(subfields) == 0: @@ -170,10 +180,12 @@ def make_field(url: str) -> Field: for text in el.text.split("$"): text = text.strip() if m2 := re.match(r"^(.)
- (.+) \((.+)\)$", text): - subfields[m2.group(1)] = Subfield( - code=m2.group(1), - label=m2.group(2), - repeatable=m2.group(3) == "R", + subfields.append( + Subfield( + code=m2.group(1), + label=m2.group(2), + repeatable=m2.group(3) == "R", + ) ) return Field( @@ -184,6 +196,8 @@ def make_field(url: str) -> Field: subfields=subfields, ) + return None + # scrape the loc website for the marc fields def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None: @@ -194,7 +208,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None: print(f) if n != 0 and len(marc.fields) >= n: break - marc.write_avram(outfile) + marc.to_avram(outfile) def _soup(url: str) -> BeautifulSoup: diff --git a/marctable/utils.py b/marctable/utils.py index 5991fbc..42a15c9 100644 --- a/marctable/utils.py +++ b/marctable/utils.py @@ -1,6 +1,6 @@ import json -import typing -from typing import Generator +from io import IOBase +from typing import BinaryIO, Dict, Generator, List, TextIO, Tuple, Union import pyarrow import pymarc @@ -9,8 +9,10 @@ from .marc import MARC +ListOrString = Union[str, List[str]] -def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame: + +def to_dataframe(marc_input: BinaryIO, rules: list = []) -> DataFrame: """ Return a single DataFrame for the entire dataset. 
""" @@ -18,8 +20,8 @@ def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame: def to_csv( - marc_input: typing.BinaryIO, - csv_output: typing.TextIO, + marc_input: BinaryIO, + csv_output: TextIO, rules: list = [], batch: int = 1000, ) -> None: @@ -32,8 +34,8 @@ def to_csv( def to_jsonl( - marc_input: typing.BinaryIO, - jsonl_output: typing.BinaryIO, + marc_input: BinaryIO, + jsonl_output: BinaryIO, rules: list = [], batch: int = 1000, ) -> None: @@ -46,8 +48,8 @@ def to_jsonl( def to_parquet( - marc_input: typing.BinaryIO, - parquet_output: typing.BinaryIO, + marc_input: BinaryIO, + parquet_output: IOBase, rules: list = [], batch: int = 1000, ) -> None: @@ -55,7 +57,7 @@ def to_parquet( Convert MARC to Parquet. """ schema = _make_parquet_schema(rules) - writer = ParquetWriter(parquet_output, schema, compression="gzip") + writer = ParquetWriter(parquet_output, schema, compression="SNAPPY") for records_batch in records_iter(marc_input, rules=rules, batch=batch): table = pyarrow.Table.from_pylist(records_batch, schema) writer.write_table(table) @@ -64,7 +66,7 @@ def to_parquet( def dataframe_iter( - marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000 + marc_input: BinaryIO, rules: list = [], batch: int = 1000 ) -> Generator[DataFrame, None, None]: columns = _columns(_mapping(rules)) for records_batch in records_iter(marc_input, rules, batch): @@ -72,8 +74,8 @@ def dataframe_iter( def records_iter( - marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000 -) -> Generator[DataFrame, None, None]: + marc_input: BinaryIO, rules: list = [], batch: int = 1000 +) -> Generator[List[Dict], None, None]: """ Read MARC input and generate a list of dictionaries, where each list element represents a MARC record. @@ -85,9 +87,10 @@ def records_iter( for record in pymarc.MARCReader(marc_input): # if pymarc can't make sense of a record it returns None if record is None: + # TODO: log this? 
continue - r = {} + r: Dict[str, ListOrString] = {} for field in record.fields: if field.tag not in mapping: continue @@ -98,12 +101,15 @@ def records_iter( if subfields is None: key = f"F{field.tag}" if marc.get_field(field.tag).repeatable: - value = r.get(key, []) - value.append(_stringify_field(field)) + lst = r.get(key, []) + assert isinstance( + lst, list + ), "Repeatable field contains a string instead of a list" + lst.append(_stringify_field(field)) + r[key] = lst else: - value = _stringify_field(field) - - r[key] = value + s = _stringify_field(field) + r[key] = s # otherwise only add the subfields that were requested in the mapping else: @@ -113,7 +119,10 @@ def records_iter( key = f"F{field.tag}{sf.code}" if marc.get_subfield(field.tag, sf.code).repeatable: - value = r.get(key, []) + value: ListOrString = r.get(key, []) + assert isinstance( + value, list + ), "Repeatable field contains a string instead of list" value.append(sf.value) else: value = sf.value @@ -180,7 +189,7 @@ def _columns(mapping: dict) -> list: return cols -def _make_pandas_schema(rules: list) -> pyarrow.Schema: +def _make_pandas_schema(rules: list) -> Dict[str, str]: marc = MARC.from_avram() mapping = _mapping(rules) schema = {} @@ -200,19 +209,22 @@ def _make_pandas_schema(rules: list) -> pyarrow.Schema: def _make_parquet_schema(rules: list) -> pyarrow.Schema: marc = MARC.from_avram() mapping = _mapping(rules) - cols = [] + + pyarrow_str = pyarrow.string() + pyarrow_list_of_str = pyarrow.list_(pyarrow.string()) + + cols: List[Tuple[str, pyarrow.DataType]] = [] for field_tag, subfields in mapping.items(): if subfields is None: if marc.get_field(field_tag).repeatable: - typ = pyarrow.list_(pyarrow.string()) + cols.append((f"F{field_tag}", pyarrow_list_of_str)) else: - typ = pyarrow.string() - cols.append((f"F{field_tag}", typ)) + cols.append((f"F{field_tag}", pyarrow_str)) else: - for sf in subfields: - if marc.get_subfield(field_tag, sf).repeatable: - typ = 
pyarrow.list_(pyarrow.string()) + for sf_code in subfields: + sf = marc.get_subfield(field_tag, sf_code) + if sf is not None and sf.repeatable: + cols.append((f"F{field_tag}{sf_code}", pyarrow_list_of_str)) else: - typ = pyarrow.string() - cols.append((f"F{field_tag}{sf}", typ)) - return pyarrow.schema(cols) + cols.append((f"F{field_tag}{sf_code}", pyarrow_str)) + return pyarrow.schema(cols) # type: ignore[arg-type] diff --git a/pyproject.toml b/pyproject.toml index eb7e41b..e9c0b2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,11 @@ click = "^8.1.7" [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" black = "^23.12.0" +types-requests = "^2.31.0.10" +types-beautifulsoup4 = "^4.12.0.7" +mypy = "^1.8.0" +pandas-stubs = "^2.1.4.231227" +pyarrow-stubs = "^10.0.1.7" [tool.poetry.scripts] marctable = "marctable:main" @@ -34,3 +39,6 @@ build-backend = "poetry.core.masonry.api" [project.urls] Homepage = "https://github.com/edsu/marctable" + +[tool.mypy] +warn_no_return = false diff --git a/test_marctable.py b/test_marctable.py index 3a1e1f6..016222a 100644 --- a/test_marctable.py +++ b/test_marctable.py @@ -1,10 +1,11 @@ +import json import pathlib from io import StringIO import pandas -import json -from marctable.marc import MARC, crawl +from marctable.marc import MARC, SchemaFieldError, SchemaSubfieldError, crawl from marctable.utils import _mapping, dataframe_iter, to_csv, to_dataframe, to_parquet +from pytest import raises marc = MARC.from_avram() @@ -41,12 +42,14 @@ def test_marc() -> None: def test_get_field() -> None: assert marc.get_field("245") - assert not marc.get_field("abc") + with raises(SchemaFieldError, match="abc is not a defined field tag in Avram"): + marc.get_field("abc") def test_get_subfield() -> None: assert marc.get_subfield("245", "a").label == "Title" - assert marc.get_subfield("245", "-") is None + with raises(SchemaSubfieldError, match="- is not a valid subfield in field 245"): + marc.get_subfield("245", "-") def 
test_non_repeatable_field() -> None: @@ -71,7 +74,8 @@ def test_df() -> None: # 245 is not repeatable assert ( df.iloc[0]["F245"] - == "Leak testing CD-ROM [computer file] / technical editors, Charles N. Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." + == "Leak testing CD-ROM [computer file] / technical editors, Charles N. " + "Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." ) # 650 is repeatable assert df.iloc[0]["F650"] == ["Leak detectors.", "Gas leakage."] @@ -86,7 +90,8 @@ def test_custom_fields_df() -> None: assert df.columns[1] == "F650" assert ( df.iloc[0]["F245"] - == "Leak testing CD-ROM [computer file] / technical editors, Charles N. Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." + == "Leak testing CD-ROM [computer file] / technical editors, Charles N. " + "Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." ) assert df.iloc[0]["F650"] == ["Leak detectors.", "Gas leakage."] @@ -132,7 +137,8 @@ def test_to_csv() -> None: assert len(df.columns) == 215 assert ( df.iloc[0]["F245"] - == "Leak testing CD-ROM [computer file] / technical editors, Charles N. Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." + == "Leak testing CD-ROM [computer file] / technical editors, Charles N. " + "Jackson, Jr., Charles N. Sherlock ; editor, Patrick O. Moore." )