Skip to content
This repository has been archived by the owner on Dec 10, 2024. It is now read-only.

Increase test coverage #4

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/autoparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
def main():
if len(sys.argv) < 2:
print(
"""autoparser: specify subcommand to run
"""
autoparser: specify subcommand to run

Available subcommands:
create-dict - Create a data dictionary from a dataset
add-descriptions - Add descriptions to a data dictionary (LLM key required)
create-mapping - Create initial CSV mapping from data dictionary (LLM key required)
create-parser - Generate TOML parser from CSV mapping file
"""
Available subcommands:
create-dict - Create a data dictionary from a dataset
add-descriptions - Add descriptions to a data dictionary (LLM key required)
create-mapping - Create initial CSV mapping from data dictionary (LLM key required) # noqa
create-parser - Generate TOML parser from CSV mapping file
"""
)
sys.exit(1)
subcommand = sys.argv[1]
Expand Down
10 changes: 6 additions & 4 deletions src/autoparser/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .openai_calls import _map_values as _map_values_openai
from .gemini_calls import _map_fields as _map_fields_gemini
from .gemini_calls import _map_values as _map_values_gemini
from .util import read_json, read_data, load_data_dict
from .util import read_json, read_config_schema, load_data_dict
from .util import DEFAULT_CONFIG

from typing import Literal
Expand Down Expand Up @@ -59,19 +59,21 @@ def __init__(
self.api_key = api_key
if llm is None:
self.client = None
elif llm == "openai":
elif llm == "openai": # pragma: no cover
self.client = OpenAI(api_key=self.api_key)
self.map_fields = _map_fields_openai
self.map_values = _map_values_openai
elif llm == "gemini":
elif llm == "gemini": # pragma: no cover
gemini.configure(api_key=self.api_key)
self.client = gemini.GenerativeModel("gemini-1.5-flash")
self.map_fields = _map_fields_gemini
self.map_values = _map_values_gemini
else:
raise ValueError(f"Unsupported LLM: {llm}")

self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)

self.data_dictionary = load_data_dict(self.config, data_dictionary)

Expand Down
32 changes: 12 additions & 20 deletions src/autoparser/dict_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .openai_calls import _get_definitions as _get_definitions_openai
from .gemini_calls import _get_definitions as _get_definitions_gemini
from .util import read_data, load_data_dict
from .util import read_config_schema, load_data_dict, read_data
from .util import DEFAULT_CONFIG


Expand All @@ -38,7 +38,9 @@ def __init__(
):
if isinstance(config, str):
config = Path(config)
self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)

def _setup_llm(self, key: str, name: str):
"""
Expand All @@ -54,13 +56,16 @@ def _setup_llm(self, key: str, name: str):
name
Name of the LLM to use (currently only OpenAI and Gemini are supported)
"""
self.key = key
if name == "openai":
self.client = OpenAI(api_key=key)
if key is None:
raise ValueError("API key required for generating descriptions")
else:
self.key = key

if name == "openai": # pragma: no cover
self.client = OpenAI(api_key=key)
self._get_descriptions = _get_definitions_openai

elif name == "gemini":
elif name == "gemini": # pragma: no cover
gemini.configure(api_key=key)
self.client = gemini.GenerativeModel("gemini-1.5-flash")
self._get_descriptions = _get_definitions_gemini
Expand Down Expand Up @@ -88,20 +93,7 @@ def create_dict(self, data: pd.DataFrame | str) -> pd.DataFrame:
Data dictionary containing field names, field types, and common values.
"""

if isinstance(data, str):
data = Path(data)
if data.suffix == ".csv":
df = pd.read_csv(data)
elif data.suffix == ".xlsx":
df = pd.read_excel(data)
else:
raise ValueError(f"Unsupported format (not CSV or XLSX): {data}")
elif isinstance(data, pd.DataFrame):
df = data
else:
raise ValueError(
"Data must be a path to a CSV or XLSX file, or a DataFrame"
)
df = read_data(data, "Data")

names = df.columns
types = [str(t) for t in df.dtypes]
Expand Down
18 changes: 7 additions & 11 deletions src/autoparser/make_toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
import pandas as pd

from .toml_writer import dump
from .util import parse_choices
from .util import read_data
from .util import parse_choices, read_config_schema, read_data
from .util import DEFAULT_CONFIG


Expand Down Expand Up @@ -73,24 +72,21 @@ def __init__(
config: Path | None = None,
transformation_tool: str = "ADTL",
):
if isinstance(mappings, str):
mappings = Path(mappings)
if mappings.suffix == ".csv":
self.mappings = pd.read_csv(mappings)
else:
raise ValueError(f"Unsupported format (not CSV): {mappings}")
self.mappings = read_data(mappings, "A mapping file")

self.schema_path = (
schema_path if isinstance(schema_path, Path) else Path(schema_path)
)
self.parser_name = parser_name
self.parser_description = description or parser_name

self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)
self.tables = self.config["schemas"].keys()

self.schemas = {
t: read_data(Path(schema_path, self.config["schemas"][t]))
t: read_config_schema(Path(schema_path, self.config["schemas"][t]))
for t in self.tables
}

Expand Down Expand Up @@ -286,7 +282,7 @@ def create_parser(
parser_name,
description,
Path(config),
).create_parser()
).create_parser(parser_name)


def main():
Expand Down
25 changes: 23 additions & 2 deletions src/autoparser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
DEFAULT_CONFIG = "config/autoparser.toml"


def read_data(path: str | Path) -> Dict:
def read_config_schema(path: str | Path) -> Dict:
if isinstance(path, str):
path = Path(path)

Expand All @@ -24,7 +24,9 @@ def read_data(path: str | Path) -> Dict:
with path.open("rb") as fp:
return tomli.load(fp)
else:
raise ValueError(f"read_data(): Unsupported file format: {path.suffix}")
raise ValueError(
f"read_config_schema(): Unsupported file format: {path.suffix}"
)


def read_json(file: str | Path) -> dict:
Expand All @@ -35,6 +37,25 @@ def read_json(file: str | Path) -> dict:
return json.load(fp)


def read_data(file: str | Path | pd.DataFrame, file_type: str):
"""Reads in data/mapping files, which expect csv, excel or dataframe formats"""

if isinstance(file, str):
file = Path(file)
if file.suffix == ".csv":
return pd.read_csv(file)
elif file.suffix == ".xlsx":
return pd.read_excel(file)
else:
raise ValueError(f"Unsupported format (not CSV or XLSX): {file}")
elif isinstance(file, pd.DataFrame):
return file
else:
raise ValueError(
f"{file_type} must be a path to a CSV or XLSX file, or a DataFrame"
)


def parse_choices(config, s: str) -> Dict[str, Any]:
delimiter = config["choice_delimiter"]
delimiter_map = config["choice_delimiter_map"]
Expand Down
72 changes: 72 additions & 0 deletions tests/__snapshots__/test_parser_generator.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,78 @@
}),
})
# ---
# name: test_create_parser_ap_access
dict({
'age_months': dict({
'description': 'Age in Months',
'field': 'AgeMois ',
}),
'age_years': dict({
'description': 'Age in Years',
'field': 'AgeAns',
}),
'case_status': dict({
'caseInsensitive': True,
'description': 'Case Status',
'field': 'StatusCas',
'values': dict({
'décédé': 'dead',
'vivant': 'alive',
}),
}),
'chipped': dict({
'description': 'Microchipped',
'field': 'Micropucé',
'ref': 'Y/N/NK',
}),
'classification': dict({
'caseInsensitive': True,
'description': 'Classification',
'field': 'Classicfication ',
'values': dict({
'amphibie': 'amphibian',
'autre': '',
'fish': 'fish',
'mammifère': 'mammal',
'oiseau': 'bird',
'poisson': 'fish',
'rept': 'reptile',
}),
}),
'country_iso3': '',
'identity': dict({
'description': 'Identity',
'field': 'Identité',
}),
'loc_admin_1': dict({
'description': 'Province',
'field': 'Province',
}),
'name': dict({
'description': 'Full Name',
'field': 'Nom complet ',
}),
'notification_date': dict({
'description': 'Notification Date',
'field': 'DateNotification',
}),
'pet': dict({
'description': 'Pet Animal',
'field': 'AnimalDeCompagnie',
'ref': 'Y/N/NK',
}),
'sex': dict({
'caseInsensitive': True,
'description': 'Gender',
'field': 'Sexe',
'values': dict({
'f': 'female',
'inconnu': '',
'm': 'male',
}),
}),
})
# ---
# name: test_schema_fields
dict({
'age_months': dict({
Expand Down
Binary file added tests/sources/animal_data.xlsx
Binary file not shown.
51 changes: 49 additions & 2 deletions tests/test_dict_writer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# tests the `DictWriter` class
import autoparser
from autoparser.dict_writer import DictWriter
from pathlib import Path

Expand All @@ -19,8 +20,22 @@ def _setup_llm(self, key, name):
self._get_descriptions = get_definitions


def test_unsupported_data_format_txt():
    # A .txt source is neither CSV nor XLSX, so dictionary creation
    # must be rejected with a ValueError.
    dict_writer = DictWriter(config="tests/test_config.toml")
    with pytest.raises(ValueError, match="Unsupported format"):
        dict_writer.create_dict("tests/sources/animals.txt")


def test_data_not_df_or_path():
    # Anything that is not a path or a DataFrame (here: None) is invalid.
    dict_writer = DictWriter(config="tests/test_config.toml")
    with pytest.raises(ValueError, match="Data must be a path"):
        dict_writer.create_dict(None)


def test_dictionary_creation_no_descrip():
writer = DictWriter(config=Path("tests/test_config.toml"))
writer = DictWriter(config="tests/test_config.toml")

df = writer.create_dict("tests/sources/animal_data.csv")

Expand All @@ -29,18 +44,50 @@ def test_dictionary_creation_no_descrip():
pd.testing.assert_frame_equal(df, df_desired)


def test_create_dict_no_descrip():
    # The top-level convenience wrapper should produce the same dictionary
    # as the pre-generated fixture CSV.
    generated = autoparser.create_dict(
        "tests/sources/animal_data.csv", config="tests/test_config.toml"
    )
    expected = pd.read_csv("tests/sources/animals_dd.csv")
    pd.testing.assert_frame_equal(generated, expected)


def test_dictionary_creation_no_descrip_excel_dataframe():
    """Smoke test: create_dict accepts both XLSX paths and DataFrames."""
    dict_writer = DictWriter(config="tests/test_config.toml")

    # Excel input should be read without raising.
    dict_writer.create_dict("tests/sources/animal_data.xlsx")

    # A pre-loaded DataFrame should be accepted as-is without raising.
    frame = pd.read_csv("tests/sources/animals_dd.csv")
    dict_writer.create_dict(frame)


def test_dictionary_description():
    """Descriptions need a dictionary; with one, output matches the fixture."""
    test_writer = DictWriterTest(config=Path("tests/test_config.toml"))

    # Without a data dictionary the call must fail loudly.
    with pytest.raises(ValueError, match="No data dictionary found"):
        test_writer.generate_descriptions("fr")

    described = test_writer.generate_descriptions(
        "fr", "tests/sources/animals_dd.csv"
    )
    expected = pd.read_csv("tests/sources/animals_dd_described.csv")
    pd.testing.assert_frame_equal(described, expected)


def test_missing_key_error():
    # Requesting LLM-generated descriptions without an API key must raise.
    no_key_writer = DictWriter(config=Path("tests/test_config.toml"))
    with pytest.raises(ValueError, match="API key required"):
        no_key_writer.generate_descriptions("fr", "tests/sources/animals_dd.csv")


def test_wrong_llm_error():
    """An unrecognised LLM name is rejected even when a key is supplied.

    NOTE: the scraped diff left both the old and new argument lines inside
    this call (a SyntaxError); this keeps only the post-change arguments,
    which include the key so the "Unsupported LLM" branch is reached.
    """
    with pytest.raises(ValueError, match="Unsupported LLM: fish"):
        DictWriter(config=Path("tests/test_config.toml")).generate_descriptions(
            "fr", "tests/sources/animals_dd.csv", key="a12b3c", llm="fish"
        )
Loading