Skip to content
This repository has been archived by the owner on Dec 10, 2024. It is now read-only.

Increase test coverage #4

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/autoparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
def main():
if len(sys.argv) < 2:
print(
"""autoparser: specify subcommand to run
"""
autoparser: specify subcommand to run

Available subcommands:
create-dict - Create a data dictionary from a dataset
add-descriptions - Add descriptions to a data dictionary (LLM key required)
create-mapping - Create initial CSV mapping from data dictionary (LLM key required)
create-parser - Generate TOML parser from CSV mapping file
"""
Available subcommands:
create-dict - Create a data dictionary from a dataset
add-descriptions - Add descriptions to a data dictionary (LLM key required)
create-mapping - Create initial CSV mapping from data dictionary (LLM key required) # noqa
create-parser - Generate TOML parser from CSV mapping file
"""
)
sys.exit(1)
subcommand = sys.argv[1]
Expand Down
10 changes: 6 additions & 4 deletions src/autoparser/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .openai_calls import _map_values as _map_values_openai
from .gemini_calls import _map_fields as _map_fields_gemini
from .gemini_calls import _map_values as _map_values_gemini
from .util import read_json, read_data, load_data_dict
from .util import read_json, read_config_schema, load_data_dict
from .util import DEFAULT_CONFIG

from typing import Literal
Expand Down Expand Up @@ -59,19 +59,21 @@ def __init__(
self.api_key = api_key
if llm is None:
self.client = None
elif llm == "openai":
elif llm == "openai": # pragma: no cover
self.client = OpenAI(api_key=self.api_key)
self.map_fields = _map_fields_openai
self.map_values = _map_values_openai
elif llm == "gemini":
elif llm == "gemini": # pragma: no cover
gemini.configure(api_key=self.api_key)
self.client = gemini.GenerativeModel("gemini-1.5-flash")
self.map_fields = _map_fields_gemini
self.map_values = _map_values_gemini
else:
raise ValueError(f"Unsupported LLM: {llm}")

self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)

self.data_dictionary = load_data_dict(self.config, data_dictionary)

Expand Down
32 changes: 12 additions & 20 deletions src/autoparser/dict_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .openai_calls import _get_definitions as _get_definitions_openai
from .gemini_calls import _get_definitions as _get_definitions_gemini
from .util import read_data, load_data_dict
from .util import read_config_schema, load_data_dict, read_data
from .util import DEFAULT_CONFIG


Expand All @@ -38,7 +38,9 @@ def __init__(
):
if isinstance(config, str):
config = Path(config)
self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)

def _setup_llm(self, key: str, name: str):
"""
Expand All @@ -54,13 +56,16 @@ def _setup_llm(self, key: str, name: str):
name
Name of the LLM to use (currently only OpenAI and Gemini are supported)
"""
self.key = key
if name == "openai":
self.client = OpenAI(api_key=key)
if key is None:
raise ValueError("API key required for generating descriptions")
else:
self.key = key

if name == "openai": # pragma: no cover
self.client = OpenAI(api_key=key)
self._get_descriptions = _get_definitions_openai

elif name == "gemini":
elif name == "gemini": # pragma: no cover
gemini.configure(api_key=key)
self.client = gemini.GenerativeModel("gemini-1.5-flash")
self._get_descriptions = _get_definitions_gemini
Expand Down Expand Up @@ -88,20 +93,7 @@ def create_dict(self, data: pd.DataFrame | str) -> pd.DataFrame:
Data dictionary containing field names, field types, and common values.
"""

if isinstance(data, str):
data = Path(data)
if data.suffix == ".csv":
df = pd.read_csv(data)
elif data.suffix == ".xlsx":
df = pd.read_excel(data)
else:
raise ValueError(f"Unsupported format (not CSV or XLSX): {data}")
elif isinstance(data, pd.DataFrame):
df = data
else:
raise ValueError(
"Data must be a path to a CSV or XLSX file, or a DataFrame"
)
df = read_data(data, "Data")

names = df.columns
types = [str(t) for t in df.dtypes]
Expand Down
18 changes: 7 additions & 11 deletions src/autoparser/make_toml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
import pandas as pd

from .toml_writer import dump
from .util import parse_choices
from .util import read_data
from .util import parse_choices, read_config_schema, read_data
from .util import DEFAULT_CONFIG


Expand Down Expand Up @@ -73,24 +72,21 @@ def __init__(
config: Path | None = None,
transformation_tool: str = "ADTL",
):
if isinstance(mappings, str):
mappings = Path(mappings)
if mappings.suffix == ".csv":
self.mappings = pd.read_csv(mappings)
else:
raise ValueError(f"Unsupported format (not CSV): {mappings}")
self.mappings = read_data(mappings, "A mapping file")

self.schema_path = (
schema_path if isinstance(schema_path, Path) else Path(schema_path)
)
self.parser_name = parser_name
self.parser_description = description or parser_name

self.config = read_data(config or Path(Path(__file__).parent, DEFAULT_CONFIG))
self.config = read_config_schema(
config or Path(Path(__file__).parent, DEFAULT_CONFIG)
)
self.tables = self.config["schemas"].keys()

self.schemas = {
t: read_data(Path(schema_path, self.config["schemas"][t]))
t: read_config_schema(Path(schema_path, self.config["schemas"][t]))
for t in self.tables
}

Expand Down Expand Up @@ -286,7 +282,7 @@ def create_parser(
parser_name,
description,
Path(config),
).create_parser()
).create_parser(parser_name)


def main():
Expand Down
25 changes: 23 additions & 2 deletions src/autoparser/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
DEFAULT_CONFIG = "config/autoparser.toml"


def read_data(path: str | Path) -> Dict:
def read_config_schema(path: str | Path) -> Dict:
if isinstance(path, str):
path = Path(path)

Expand All @@ -24,7 +24,9 @@ def read_data(path: str | Path) -> Dict:
with path.open("rb") as fp:
return tomli.load(fp)
else:
raise ValueError(f"read_data(): Unsupported file format: {path.suffix}")
raise ValueError(
f"read_config_schema(): Unsupported file format: {path.suffix}"
)


def read_json(file: str | Path) -> dict:
Expand All @@ -35,6 +37,25 @@ def read_json(file: str | Path) -> dict:
return json.load(fp)


def read_data(file: str | Path | pd.DataFrame, file_type: str):
"""Reads in data/mapping files, which expect csv, excel or dataframe formats"""

if isinstance(file, str):
file = Path(file)
if file.suffix == ".csv":
return pd.read_csv(file)
elif file.suffix == ".xlsx":
return pd.read_excel(file)
else:
raise ValueError(f"Unsupported format (not CSV or XLSX): {file}")
elif isinstance(file, pd.DataFrame):
return file
else:
raise ValueError(
f"{file_type} must be a path to a CSV or XLSX file, or a DataFrame"
)


def parse_choices(config, s: str) -> Dict[str, Any]:
delimiter = config["choice_delimiter"]
delimiter_map = config["choice_delimiter_map"]
Expand Down
72 changes: 72 additions & 0 deletions tests/__snapshots__/test_parser_generator.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,78 @@
}),
})
# ---
# name: test_create_parser_ap_access
dict({
'age_months': dict({
'description': 'Age in Months',
'field': 'AgeMois ',
}),
'age_years': dict({
'description': 'Age in Years',
'field': 'AgeAns',
}),
'case_status': dict({
'caseInsensitive': True,
'description': 'Case Status',
'field': 'StatusCas',
'values': dict({
'décédé': 'dead',
'vivant': 'alive',
}),
}),
'chipped': dict({
'description': 'Microchipped',
'field': 'Micropucé',
'ref': 'Y/N/NK',
}),
'classification': dict({
'caseInsensitive': True,
'description': 'Classification',
'field': 'Classicfication ',
'values': dict({
'amphibie': 'amphibian',
'autre': '',
'fish': 'fish',
'mammifère': 'mammal',
'oiseau': 'bird',
'poisson': 'fish',
'rept': 'reptile',
}),
}),
'country_iso3': '',
'identity': dict({
'description': 'Identity',
'field': 'Identité',
}),
'loc_admin_1': dict({
'description': 'Province',
'field': 'Province',
}),
'name': dict({
'description': 'Full Name',
'field': 'Nom complet ',
}),
'notification_date': dict({
'description': 'Notification Date',
'field': 'DateNotification',
}),
'pet': dict({
'description': 'Pet Animal',
'field': 'AnimalDeCompagnie',
'ref': 'Y/N/NK',
}),
'sex': dict({
'caseInsensitive': True,
'description': 'Gender',
'field': 'Sexe',
'values': dict({
'f': 'female',
'inconnu': '',
'm': 'male',
}),
}),
})
# ---
# name: test_schema_fields
dict({
'age_months': dict({
Expand Down
Binary file added tests/sources/animal_data.xlsx
Binary file not shown.
51 changes: 49 additions & 2 deletions tests/test_dict_writer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# tests the `DictWriter` class
import autoparser
from autoparser.dict_writer import DictWriter
from pathlib import Path

Expand All @@ -19,8 +20,22 @@ def _setup_llm(self, key, name):
self._get_descriptions = get_definitions


def test_unsupported_data_format_txt():
    # A .txt source is neither CSV nor XLSX, so dictionary creation
    # must be rejected with a ValueError.
    dict_writer = DictWriter(config="tests/test_config.toml")
    with pytest.raises(ValueError, match="Unsupported format"):
        dict_writer.create_dict("tests/sources/animals.txt")


def test_data_not_df_or_path():
    # Anything that is not a path or a DataFrame (here: None) is invalid.
    dict_writer = DictWriter(config="tests/test_config.toml")
    with pytest.raises(ValueError, match="Data must be a path"):
        dict_writer.create_dict(None)


def test_dictionary_creation_no_descrip():
writer = DictWriter(config=Path("tests/test_config.toml"))
writer = DictWriter(config="tests/test_config.toml")

df = writer.create_dict("tests/sources/animal_data.csv")

Expand All @@ -29,18 +44,50 @@ def test_dictionary_creation_no_descrip():
pd.testing.assert_frame_equal(df, df_desired)


def test_create_dict_no_descrip():
    # The top-level convenience wrapper should produce the same dictionary
    # as the pre-generated fixture CSV.
    generated = autoparser.create_dict(
        "tests/sources/animal_data.csv", config="tests/test_config.toml"
    )
    expected = pd.read_csv("tests/sources/animals_dd.csv")
    pd.testing.assert_frame_equal(generated, expected)


def test_dictionary_creation_no_descrip_excel_dataframe():
    """Smoke test: create_dict accepts both XLSX paths and DataFrames."""
    dict_writer = DictWriter(config="tests/test_config.toml")

    # Excel input should be read without raising.
    dict_writer.create_dict("tests/sources/animal_data.xlsx")

    # A pre-loaded DataFrame should be accepted as-is without raising.
    frame = pd.read_csv("tests/sources/animals_dd.csv")
    dict_writer.create_dict(frame)


def test_dictionary_description():
    """Descriptions need a dictionary; with one, output matches the fixture."""
    test_writer = DictWriterTest(config=Path("tests/test_config.toml"))

    # Without a data dictionary the call must fail loudly.
    with pytest.raises(ValueError, match="No data dictionary found"):
        test_writer.generate_descriptions("fr")

    described = test_writer.generate_descriptions(
        "fr", "tests/sources/animals_dd.csv"
    )
    expected = pd.read_csv("tests/sources/animals_dd_described.csv")
    pd.testing.assert_frame_equal(described, expected)


def test_missing_key_error():
    # Requesting LLM-generated descriptions without an API key must raise.
    no_key_writer = DictWriter(config=Path("tests/test_config.toml"))
    with pytest.raises(ValueError, match="API key required"):
        no_key_writer.generate_descriptions("fr", "tests/sources/animals_dd.csv")


def test_wrong_llm_error():
    """An unrecognised LLM name is rejected even when a key is supplied.

    NOTE: the scraped diff left both the old and new argument lines inside
    this call (a SyntaxError); this keeps only the post-change arguments,
    which include the key so the "Unsupported LLM" branch is reached.
    """
    with pytest.raises(ValueError, match="Unsupported LLM: fish"):
        DictWriter(config=Path("tests/test_config.toml")).generate_descriptions(
            "fr", "tests/sources/animals_dd.csv", key="a12b3c", llm="fish"
        )
Loading