Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create CSV artifact including json keys as top level fields #88

Merged
merged 9 commits into the base branch
Apr 6, 2022
40 changes: 40 additions & 0 deletions digital_land/api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import OrderedDict
import csv
import itertools
import os
import sys
import json
Expand Down Expand Up @@ -244,6 +247,43 @@ def dataset_dump_cmd(self, input_path, output_path):
logging.info(cmd)
os.system(cmd)

def dataset_dump_hoisted_cmd(self, csv_path, hoisted_csv_path):
    """Copy a dataset CSV, hoisting keys of its `json` column to top-level columns.

    :param csv_path: path of the source CSV; must contain a ``json`` column.
    :param hoisted_csv_path: path to write the hoisted CSV to; when falsy it is
        derived from ``csv_path`` by replacing ``.csv`` with ``-hoisted.csv``.
    """
    if not hoisted_csv_path:
        # str() so Path-like csv_path values work too (Path has no str.replace).
        hoisted_csv_path = str(csv_path).replace(".csv", "-hoisted.csv")

    # newline="" is required by the csv module so quoted embedded newlines
    # round-trip correctly and no blank rows appear on Windows.
    with open(csv_path, "r", newline="") as read_file, open(
        hoisted_csv_path, "w", newline=""
    ) as write_file:
        reader = csv.DictReader(read_file)

        # Snake-cased union of every field declared by this dataset's schemas;
        # these are the candidate columns the json blob may provide.
        spec_field_names = [
            field.replace("-", "_")
            for field in itertools.chain.from_iterable(
                self.specification.current_fieldnames(schema)
                for schema in self.specification.dataset_schema[self.dataset]
            )
        ]
        reader_fieldnames = list(reader.fieldnames)
        reader_fieldnames.remove("json")
        # Only fields that are not already top-level columns need hoisting.
        hoisted_field_names = set(spec_field_names).difference(reader_fieldnames)
        # Make sure we put hoisted fieldnames last, sorted for stable output.
        field_names = reader_fieldnames + sorted(hoisted_field_names)

        writer = csv.DictWriter(write_file, fieldnames=field_names)
        writer.writeheader()
        for row in reader:
            # Treat a missing/empty json cell as an empty object.
            json_string = row.pop("json") or "{}"
            row.update(json.loads(json_string))
            # Column names are snake_case. NOTE(review): a json key outside the
            # spec's fields raises ValueError in writerow — unchanged behaviour.
            writer.writerow(
                {key.replace("-", "_"): val for key, val in row.items()}
            )

#
# configuration commands
#
Expand Down
11 changes: 10 additions & 1 deletion digital_land/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,16 @@ def dataset_create_cmd(input_paths, output_path, organisation_path):
@cli.command("dataset-entries", short_help="dump dataset entries as csv")
@input_output_path
def dataset_dump_cmd(input_path, output_path):
    # Thin CLI wrapper: delegate to the shared API object. Intentionally no
    # `return` — click should not derive anything from the API's return value.
    # (The scraped diff showed both the old `return API...` line and this one;
    # only the merged, return-less version is kept.)
    API.dataset_dump_cmd(input_path, output_path)


@cli.command(
    "dataset-entries-hoisted",
    short_help="dump dataset entries as csv with additional top-level `entity.json` fields",
)
@input_output_path
def dataset_dump_hoisted_cmd(input_path, output_path):
    # Thin CLI wrapper: hoist keys of the dumped CSV's `json` column into
    # top-level columns. The API derives a default output path when
    # output_path is falsy.
    API.dataset_dump_hoisted_cmd(csv_path=input_path, hoisted_csv_path=output_path)


@cli.command("pipeline", short_help="process a resource")
Expand Down
1 change: 0 additions & 1 deletion digital_land/specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

class Specification:
def __init__(self, path="specification"):
self.dataset = {}
self.dataset = {}
self.dataset_names = []
self.schema = {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,listed_building_grade
listed-building-grade,,700000,2021-11-22,,,I,,,listed-building-grade,I,,category,I
listed-building-grade,,700001,2021-11-22,,,II,,,listed-building-grade,II,,category,II
listed-building-grade,,700002,2021-11-22,,,II*,,,listed-building-grade,II*,,category,II*
953 changes: 953 additions & 0 deletions tests/data/listed-building/dataset/listed-building-outline-hoisted.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,documentation_url,listed_building_grade,locally_listed_building,wikidata,wikipedia
locally-listed-building,,41100447,2021-12-23,,"MULTIPOLYGON (((1.025641 51.354931,1.025825 51.355022,1.025780 51.355060,1.025370 51.354861,1.025418 51.354823,1.025641 51.354931)))","The Yard Cottage, The Street,, Barham, CT4 6NY",,POINT(1.025598 51.354941),locally-listed-building,,,geography,,,,,
53 changes: 47 additions & 6 deletions tests/integration/test_package_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,7 @@ def test_package_dataset(
output_dir = tmp_path.joinpath("dataset_output")
output_dir.mkdir()
sqlite_path = output_dir.joinpath(f"{dataset_name}.sqlite3")
sqlite_path.touch()
csv_path = output_dir.joinpath(f"{dataset_name}.csv")
csv_path.touch()

# Call
api = DigitalLandApi(
Expand All @@ -119,11 +117,54 @@ def test_package_dataset(
specification_dir=str(specification_path),
)
api.dataset_create_cmd(input_paths, sqlite_path, organisation_path)

api.dataset_dump_cmd(sqlite_path, csv_path)

# Assert
with csv_path.open() as actual, expected_csv_result.open() as expected:
actual_dict_list = list(DictReader(actual))
expected_dict_list = list(DictReader(expected))
assert actual_dict_list == expected_dict_list
actual_dict_reader = DictReader(actual)
expected_dict_reader = DictReader(expected)
assert actual_dict_reader.fieldnames == expected_dict_reader.fieldnames
assert list(actual_dict_reader) == list(expected_dict_reader)


@pytest.mark.parametrize(
    "dataset_name",
    [
        "listed-building-grade",
        "listed-building-outline",
        "locally-listed-building",
    ],
)
def test_package_dataset_hoisted(
    # Parametrize args
    dataset_name,
    # Runtime filesystem dependencies generated by previous steps
    pipeline_dir,
    # Test assertion directories
    dataset_dir,
    # Pytest fixtures
    tmp_path,
):
    """Hoisting a dataset CSV should reproduce the checked-in expected file."""
    # Setup: the source CSV and the expected hoisted CSV both live in the
    # dataset fixture directory; output goes under the pytest tmp dir.
    source_csv = dataset_dir.joinpath(f"{dataset_name}.csv")
    expected_csv = dataset_dir.joinpath(f"{dataset_name}-hoisted.csv")

    output_dir = tmp_path / "dataset_output"
    output_dir.mkdir()
    produced_csv = output_dir / f"{dataset_name}-hoisted.csv"

    # Call
    api = DigitalLandApi(
        debug=False,
        dataset=dataset_name,
        pipeline_dir=pipeline_dir,
        specification_dir=str(specification_path),
    )
    api.dataset_dump_hoisted_cmd(source_csv, produced_csv)

    # Assert: identical header order, then identical row content.
    with produced_csv.open() as actual_file, expected_csv.open() as expected_file:
        actual = DictReader(actual_file)
        expected = DictReader(expected_file)
        assert actual.fieldnames == expected.fieldnames
        assert list(actual) == list(expected)