Fix types

sul-dlss-labs · Jan 3, 2024 · 8a6277c · 8a6277c
1 parent 702b2a4
commit 8a6277c
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 51 deletions.
diff --git a/marctable/__init__.py b/marctable/__init__.py
@@ -1,4 +1,5 @@
 from collections.abc import Callable
+from typing import BinaryIO, TextIO
 
 import click
 
@@ -38,7 +39,7 @@ def rule_params(f: Callable) -> Callable:
 @cli.command()
 @io_params
 @rule_params
-def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None:
     """
     Convert MARC to CSV.
     """
@@ -48,7 +49,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non
 @cli.command()
 @io_params
 @rule_params
-def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def parquet(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
     """
     Convert MARC to Parquet.
     """
@@ -58,7 +59,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) ->
 @cli.command()
 @io_params
 @rule_params
-def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
     """
     Convert MARC to JSON Lines (JSONL)
     """
@@ -67,7 +68,7 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N
 
 @cli.command()
 @click.argument("outfile", type=click.File("w"), default="-")
-def avram(outfile: click.File) -> None:
+def avram(outfile: TextIO) -> None:
     """
     Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website.
     """

diff --git a/marctable/marc.py b/marctable/marc.py
@@ -14,11 +14,11 @@
 import re
 import sys
 from functools import cache
-from typing import IO, Generator
+from typing import IO, Generator, List, Optional
 from urllib.parse import urljoin
 
 import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 
 class Subfield:
@@ -29,15 +29,20 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None:
 
     @classmethod
     def from_dict(_, d: dict):
-        return Subfield(d.get("code"), d.get("label"), d.get("repeatable"))
+        return Subfield(d["code"], d["label"], d["repeatable"])
 
     def to_dict(self) -> dict:
         return {"code": self.code, "label": self.label, "repeatable": self.repeatable}
 
 
 class Field:
     def __init__(
-        self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None
+        self,
+        tag: str,
+        label: str,
+        subfields: list[Subfield],
+        repeatable: bool = False,
+        url: Optional[str] = None,
     ) -> None:
         self.tag = tag
         self.label = label
@@ -47,7 +52,7 @@ def __init__(
 
     def __str__(self) -> str:
         if len(self.subfields) > 0:
-            subfields = ": " + (",".join(self.subfields.keys()))
+            subfields = ": " + (",".join([sf.code for sf in self.subfields]))
         else:
             subfields = ""
         return (
@@ -57,29 +62,27 @@ def __str__(self) -> str:
     @classmethod
     def from_dict(klass, d: dict):
         return Field(
-            tag=d.get("tag"),
-            label=d.get("label"),
-            repeatable=d.get("repeatable"),
+            tag=d["tag"],
+            label=d["label"],
+            repeatable=d["repeatable"],
             url=d.get("url"),
             subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()],
         )
 
     def to_dict(self) -> dict:
-        return {
+        d = {
             "tag": self.tag,
             "label": self.label,
             "repeatable": self.repeatable,
             "url": self.url,
-            "subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()},
         }
 
-    def to_avram(self) -> dict:
-        d = self.to_dict()
-        if len(d["subfields"]) == 0:
-            del d["subfields"]
+        if self.subfields:
+            d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields}
+
         return d
 
-    def get_subfield(self, code: str) -> Subfield:
+    def get_subfield(self, code: str) -> Optional[Subfield]:
         for sf in self.subfields:
             if sf.code == code:
                 return sf
@@ -88,17 +91,17 @@ def get_subfield(self, code: str) -> Subfield:
 
 class MARC:
     def __init__(self) -> None:
-        self.fields = []
+        self.fields: List[Field] = []
 
     @cache
-    def get_field(self, tag: str) -> Field:
+    def get_field(self, tag: str) -> Optional[Field]:
         for field in self.fields:
             if field.tag == tag:
                 return field
         return None
 
     @cache
-    def get_subfield(self, tag: str, code: str) -> Subfield:
+    def get_subfield(self, tag: str, code: str) -> Optional[Subfield]:
         field = self.get_field(tag)
         if field:
             return field.get_subfield(code)
@@ -111,7 +114,7 @@ def avram_file(self):
 
     @classmethod
     @cache
-    def from_avram(cls, avram_file: IO = None) -> dict:
+    def from_avram(cls, avram_file: Optional[IO] = None):
         marc = MARC()
 
         if avram_file is None:
@@ -122,7 +125,7 @@ def from_avram(cls, avram_file: IO = None) -> dict:
 
         return marc
 
-    def write_avram(self, avram_file: IO = None) -> None:
+    def to_avram(self, avram_file: Optional[IO] = None) -> None:
         if avram_file is None:
             avram_file = self.avram_file.open("w")
 
@@ -131,7 +134,7 @@ def write_avram(self, avram_file: IO = None) -> None:
             "url": "https://www.loc.gov/marc/bibliographic/",
             "family": "marc",
             "language": "en",
-            "fields": {f.tag: f.to_avram() for f in self.fields},
+            "fields": {f.tag: f.to_dict() for f in self.fields},
         }
         json.dump(d, avram_file, indent=2)
 
@@ -152,28 +155,32 @@ def fields() -> Generator[Field, None, None]:
 
 def make_field(url: str) -> Field:
     soup = _soup(url)
-    h1 = soup.select_one("h1", first=True).text.strip()
-    if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1):
+    h1: Optional[Tag] = soup.select_one("h1")
+    if h1 is None:
+        raise Exception(f"Expecting h1 element in {url}")
+
+    h1_text: str = h1.text.strip()
+    if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text):
         tag, label, repeatable = m1.groups()
 
         # most pages put the subfield info in a list
-        subfields = {}
+        subfields = []
         for el in soup.select("table.subfields li"):
             if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text):
-                subfields[m2.group(1)] = Subfield(
-                    m2.group(1), m2.group(2), m2.group(3) == "R"
-                )
+                subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R"))
 
         # some pages use a different layout, of course
         if len(subfields) == 0:
             for el in soup.select('td[colspan="1"]'):
                 for text in el.text.split("$"):
                     text = text.strip()
                     if m2 := re.match(r"^(.) - (.+) \((.+)\)$", text):
-                        subfields[m2.group(1)] = Subfield(
-                            code=m2.group(1),
-                            label=m2.group(2),
-                            repeatable=m2.group(3) == "R",
+                        subfields.append(
+                            Subfield(
+                                code=m2.group(1),
+                                label=m2.group(2),
+                                repeatable=m2.group(3) == "R",
+                            )
                         )
 
         return Field(
@@ -194,7 +201,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
             print(f)
         if n != 0 and len(marc.fields) >= n:
             break
-    marc.write_avram(outfile)
+    marc.to_avram(outfile)
 
 
 def _soup(url: str) -> BeautifulSoup:

diff --git a/marctable/utils.py b/marctable/utils.py
@@ -1,6 +1,5 @@
 import json
-import typing
-from typing import Generator
+from typing import BinaryIO, Dict, Generator, List, TextIO, Union
 
 import pyarrow
 import pymarc
@@ -10,16 +9,16 @@
 from .marc import MARC
 
 
-def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame:
+def to_dataframe(marc_input: BinaryIO, rules: list = []) -> DataFrame:
     """
     Return a single DataFrame for the entire dataset.
     """
     return next(dataframe_iter(marc_input, rules, batch=0))
 
 
 def to_csv(
-    marc_input: typing.BinaryIO,
-    csv_output: typing.TextIO,
+    marc_input: BinaryIO,
+    csv_output: TextIO,
     rules: list = [],
     batch: int = 1000,
 ) -> None:
@@ -32,8 +31,8 @@ def to_csv(
 
 
 def to_jsonl(
-    marc_input: typing.BinaryIO,
-    jsonl_output: typing.BinaryIO,
+    marc_input: BinaryIO,
+    jsonl_output: BinaryIO,
     rules: list = [],
     batch: int = 1000,
 ) -> None:
@@ -46,16 +45,16 @@ def to_jsonl(
 
 
 def to_parquet(
-    marc_input: typing.BinaryIO,
-    parquet_output: typing.BinaryIO,
+    marc_input: BinaryIO,
+    parquet_output: BinaryIO,
     rules: list = [],
     batch: int = 1000,
 ) -> None:
     """
     Convert MARC to Parquet.
     """
     schema = _make_parquet_schema(rules)
-    writer = ParquetWriter(parquet_output, schema, compression="gzip")
+    writer = ParquetWriter(parquet_output, schema, compression="SNAPPY")
     for records_batch in records_iter(marc_input, rules=rules, batch=batch):
         table = pyarrow.Table.from_pylist(records_batch, schema)
         writer.write_table(table)
@@ -64,15 +63,15 @@ def to_parquet(
 
 
 def dataframe_iter(
-    marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000
+    marc_input: BinaryIO, rules: list = [], batch: int = 1000
 ) -> Generator[DataFrame, None, None]:
     columns = _columns(_mapping(rules))
     for records_batch in records_iter(marc_input, rules, batch):
         yield DataFrame.from_records(records_batch, columns=columns)
 
 
 def records_iter(
-    marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000
+    marc_input: BinaryIO, rules: list = [], batch: int = 1000
 ) -> Generator[DataFrame, None, None]:
     """
     Read MARC input and generate a list of dictionaries, where each list element
@@ -87,7 +86,7 @@ def records_iter(
         if record is None:
             continue
 
-        r = {}
+        r: Dict[str, Union[str, List[str]]] = {}
         for field in record.fields:
             if field.tag not in mapping:
                 continue
@@ -209,8 +208,9 @@ def _make_parquet_schema(rules: list) -> pyarrow.Schema:
                 typ = pyarrow.string()
             cols.append((f"F{field_tag}", typ))
         else:
-            for sf in subfields:
-                if marc.get_subfield(field_tag, sf).repeatable:
+            for sf_code in subfields:
+                sf = marc.get_subfield(field_tag, sf_code)
+                if sf is not None and sf.repeatable:
                     typ = pyarrow.list_(pyarrow.string())
                 else:
                     typ = pyarrow.string()

diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,11 @@ click = "^8.1.7"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.3"
 black = "^23.12.0"
+types-requests = "^2.31.0.10"
+types-beautifulsoup4 = "^4.12.0.7"
+mypy = "^1.8.0"
+pandas-stubs = "^2.1.4.231227"
+pyarrow-stubs = "^10.0.1.7"
 
 [tool.poetry.scripts]
 marctable = "marctable:main"