Skip to content

Commit

Permalink
Fix types
Browse files (browse the repository at this point in the history)
  • Loading branch information
edsu committed Jan 3, 2024
1 parent 702b2a4 commit 8a6277c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 51 deletions.
9 changes: 5 additions & 4 deletions marctable/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections.abc import Callable
from typing import BinaryIO, TextIO

import click

Expand Down Expand Up @@ -38,7 +39,7 @@ def rule_params(f: Callable) -> Callable:
@cli.command()
@io_params
@rule_params
def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None:
"""
Convert MARC to CSV.
"""
Expand All @@ -48,7 +49,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non
@cli.command()
@io_params
@rule_params
def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def parquet(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
"""
Convert MARC to Parquet.
"""
Expand All @@ -58,7 +59,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) ->
@cli.command()
@io_params
@rule_params
def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
"""
Convert MARC to JSON Lines (JSONL)
"""
Expand All @@ -67,7 +68,7 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N

@cli.command()
@click.argument("outfile", type=click.File("w"), default="-")
def avram(outfile: click.File) -> None:
def avram(outfile: TextIO) -> None:
"""
Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website.
"""
Expand Down
71 changes: 39 additions & 32 deletions marctable/marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
import re
import sys
from functools import cache
from typing import IO, Generator
from typing import IO, Generator, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag


class Subfield:
Expand All @@ -29,15 +29,20 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None:

@classmethod
def from_dict(_, d: dict):
return Subfield(d.get("code"), d.get("label"), d.get("repeatable"))
return Subfield(d["code"], d["label"], d["repeatable"])

def to_dict(self) -> dict:
return {"code": self.code, "label": self.label, "repeatable": self.repeatable}


class Field:
def __init__(
self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None
self,
tag: str,
label: str,
subfields: list[Subfield],
repeatable: bool = False,
url: Optional[str] = None,
) -> None:
self.tag = tag
self.label = label
Expand All @@ -47,7 +52,7 @@ def __init__(

def __str__(self) -> str:
if len(self.subfields) > 0:
subfields = ": " + (",".join(self.subfields.keys()))
subfields = ": " + (",".join([sf.code for sf in self.subfields]))
else:
subfields = ""
return (
Expand All @@ -57,29 +62,27 @@ def __str__(self) -> str:
@classmethod
def from_dict(klass, d: dict):
return Field(
tag=d.get("tag"),
label=d.get("label"),
repeatable=d.get("repeatable"),
tag=d["tag"],
label=d["label"],
repeatable=d["repeatable"],
url=d.get("url"),
subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()],
)

def to_dict(self) -> dict:
return {
d = {
"tag": self.tag,
"label": self.label,
"repeatable": self.repeatable,
"url": self.url,
"subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()},
}

def to_avram(self) -> dict:
d = self.to_dict()
if len(d["subfields"]) == 0:
del d["subfields"]
if self.subfields is not None:
d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields}

return d

def get_subfield(self, code: str) -> Subfield:
def get_subfield(self, code: str) -> Optional[Subfield]:
for sf in self.subfields:
if sf.code == code:
return sf
Expand All @@ -88,17 +91,17 @@ def get_subfield(self, code: str) -> Subfield:

class MARC:
def __init__(self) -> None:
self.fields = []
self.fields: List[Field] = []

@cache
def get_field(self, tag: str) -> Field:
def get_field(self, tag: str) -> Optional[Field]:
for field in self.fields:
if field.tag == tag:
return field
return None

@cache
def get_subfield(self, tag: str, code: str) -> Subfield:
def get_subfield(self, tag: str, code: str) -> Optional[Subfield]:
field = self.get_field(tag)
if field:
return field.get_subfield(code)
Expand All @@ -111,7 +114,7 @@ def avram_file(self):

@classmethod
@cache
def from_avram(cls, avram_file: IO = None) -> dict:
def from_avram(cls, avram_file: Optional[IO] = None):
marc = MARC()

if avram_file is None:
Expand All @@ -122,7 +125,7 @@ def from_avram(cls, avram_file: IO = None) -> dict:

return marc

def write_avram(self, avram_file: IO = None) -> None:
def to_avram(self, avram_file: Optional[IO] = None) -> None:
if avram_file is None:
avram_file = self.avram_file.open("w")

Expand All @@ -131,7 +134,7 @@ def write_avram(self, avram_file: IO = None) -> None:
"url": "https://www.loc.gov/marc/bibliographic/",
"family": "marc",
"language": "en",
"fields": {f.tag: f.to_avram() for f in self.fields},
"fields": {f.tag: f.to_dict() for f in self.fields},
}
json.dump(d, avram_file, indent=2)

Expand All @@ -152,28 +155,32 @@ def fields() -> Generator[Field, None, None]:

def make_field(url: str) -> Field:
soup = _soup(url)
h1 = soup.select_one("h1", first=True).text.strip()
if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1):
h1: Optional[Tag] = soup.select_one("h1")
if h1 is None:
raise Exception("Expecting h1 element in {url}")

h1_text: str = h1.text.strip()
if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text):
tag, label, repeatable = m1.groups()

# most pages put the subfield info in a list
subfields = {}
subfields = []
for el in soup.select("table.subfields li"):
if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text):
subfields[m2.group(1)] = Subfield(
m2.group(1), m2.group(2), m2.group(3) == "R"
)
subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R"))

# some pages use a different layout, of course
if len(subfields) == 0:
for el in soup.select('td[colspan="1"]'):
for text in el.text.split("$"):
text = text.strip()
if m2 := re.match(r"^(.) - (.+) \((.+)\)$", text):
subfields[m2.group(1)] = Subfield(
code=m2.group(1),
label=m2.group(2),
repeatable=m2.group(3) == "R",
subfields.append(
Subfield(
code=m2.group(1),
label=m2.group(2),
repeatable=m2.group(3) == "R",
)
)

return Field(
Expand All @@ -194,7 +201,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
print(f)
if n != 0 and len(marc.fields) >= n:
break
marc.write_avram(outfile)
marc.to_avram(outfile)


def _soup(url: str) -> BeautifulSoup:
Expand Down
30 changes: 15 additions & 15 deletions marctable/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import typing
from typing import Generator
from typing import BinaryIO, Dict, Generator, List, TextIO, Union

import pyarrow
import pymarc
Expand All @@ -10,16 +9,16 @@
from .marc import MARC


def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame:
def to_dataframe(marc_input: BinaryIO, rules: list = []) -> DataFrame:
"""
Return a single DataFrame for the entire dataset.
"""
return next(dataframe_iter(marc_input, rules, batch=0))


def to_csv(
marc_input: typing.BinaryIO,
csv_output: typing.TextIO,
marc_input: BinaryIO,
csv_output: TextIO,
rules: list = [],
batch: int = 1000,
) -> None:
Expand All @@ -32,8 +31,8 @@ def to_csv(


def to_jsonl(
marc_input: typing.BinaryIO,
jsonl_output: typing.BinaryIO,
marc_input: BinaryIO,
jsonl_output: BinaryIO,
rules: list = [],
batch: int = 1000,
) -> None:
Expand All @@ -46,16 +45,16 @@ def to_jsonl(


def to_parquet(
marc_input: typing.BinaryIO,
parquet_output: typing.BinaryIO,
marc_input: BinaryIO,
parquet_output: BinaryIO,
rules: list = [],
batch: int = 1000,
) -> None:
"""
Convert MARC to Parquet.
"""
schema = _make_parquet_schema(rules)
writer = ParquetWriter(parquet_output, schema, compression="gzip")
writer = ParquetWriter(parquet_output, schema, compression="SNAPPY")
for records_batch in records_iter(marc_input, rules=rules, batch=batch):
table = pyarrow.Table.from_pylist(records_batch, schema)
writer.write_table(table)
Expand All @@ -64,15 +63,15 @@ def to_parquet(


def dataframe_iter(
marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000
marc_input: BinaryIO, rules: list = [], batch: int = 1000
) -> Generator[DataFrame, None, None]:
columns = _columns(_mapping(rules))
for records_batch in records_iter(marc_input, rules, batch):
yield DataFrame.from_records(records_batch, columns=columns)


def records_iter(
marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000
marc_input: BinaryIO, rules: list = [], batch: int = 1000
) -> Generator[DataFrame, None, None]:
"""
Read MARC input and generate a list of dictionaries, where each list element
Expand All @@ -87,7 +86,7 @@ def records_iter(
if record is None:
continue

r = {}
r: Dict[str, Union[str, List[str]]] = {}
for field in record.fields:
if field.tag not in mapping:
continue
Expand Down Expand Up @@ -209,8 +208,9 @@ def _make_parquet_schema(rules: list) -> pyarrow.Schema:
typ = pyarrow.string()
cols.append((f"F{field_tag}", typ))
else:
for sf in subfields:
if marc.get_subfield(field_tag, sf).repeatable:
for sf_code in subfields:
sf = marc.get_subfield(field_tag, sf_code)
if sf is not None and sf.repeatable:
typ = pyarrow.list_(pyarrow.string())
else:
typ = pyarrow.string()
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ click = "^8.1.7"
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.3"
black = "^23.12.0"
types-requests = "^2.31.0.10"
types-beautifulsoup4 = "^4.12.0.7"
mypy = "^1.8.0"
pandas-stubs = "^2.1.4.231227"
pyarrow-stubs = "^10.0.1.7"

[tool.poetry.scripts]
marctable = "marctable:main"
Expand Down

0 comments on commit 8a6277c

Please sign in to comment.