Skip to content

Commit

Permalink
Merge pull request #88 from digital-land/2228-flatten-json-fields
Browse files Browse the repository at this point in the history
Create CSV artifact including json keys as top level fields
  • Loading branch information
Staberinde authored Apr 6, 2022
2 parents 9557ef2 + 28c58bd commit 3a47bea
Show file tree
Hide file tree
Showing 7 changed files with 1,056 additions and 8 deletions.
40 changes: 40 additions & 0 deletions digital_land/api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import OrderedDict
import csv
import itertools
import os
import sys
import json
Expand Down Expand Up @@ -244,6 +247,43 @@ def dataset_dump_cmd(self, input_path, output_path):
logging.info(cmd)
os.system(cmd)

def dataset_dump_hoisted_cmd(self, csv_path, hoisted_csv_path):
if not hoisted_csv_path:
hoisted_csv_path = csv_path.replace(".csv", "-hoisted.csv")

with open(csv_path, "r") as read_file, open(
hoisted_csv_path, "w+"
) as write_file:
reader = csv.DictReader(read_file)

spec_field_names = [
field.replace("-", "_")
for field in itertools.chain(
*[
self.specification.current_fieldnames(schema)
for schema in self.specification.dataset_schema[self.dataset]
]
)
]
reader_fieldnames = list(reader.fieldnames)
reader_fieldnames.remove("json")
hoisted_field_names = set(spec_field_names).difference(
set(reader_fieldnames)
)
# Make sure we put hoisted fieldnames last
field_names = reader_fieldnames + sorted(list(hoisted_field_names))

writer = csv.DictWriter(write_file, fieldnames=field_names)
writer.writeheader()
for row in reader:
row = OrderedDict(row)
json_string = row.pop("json") or "{}"
row.update(json.loads(json_string))
snake_case_row = dict(
[(key.replace("-", "_"), val) for key, val in row.items()]
)
writer.writerow(snake_case_row)

#
# configuration commands
#
Expand Down
11 changes: 10 additions & 1 deletion digital_land/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,16 @@ def dataset_create_cmd(input_paths, output_path, organisation_path):
@cli.command("dataset-entries", short_help="dump dataset entries as csv")
@input_output_path
def dataset_dump_cmd(input_path, output_path):
    """CLI wrapper: dump the dataset at *input_path* as CSV to *output_path*."""
    API.dataset_dump_cmd(input_path, output_path)


@cli.command(
    "dataset-entries-hoisted",
    short_help="dump dataset entries as csv with additional top-level `entity.json` fields",
)
@input_output_path
def dataset_dump_hoisted_cmd(input_path, output_path):
    """CLI wrapper: rewrite the CSV at *input_path* with its `json` column
    flattened into top-level fields, writing the result to *output_path*.
    """
    API.dataset_dump_hoisted_cmd(csv_path=input_path, hoisted_csv_path=output_path)


@cli.command("pipeline", short_help="process a resource")
Expand Down
1 change: 0 additions & 1 deletion digital_land/specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

class Specification:
def __init__(self, path="specification"):
self.dataset = {}
self.dataset = {}
self.dataset_names = []
self.schema = {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,listed_building_grade
listed-building-grade,,700000,2021-11-22,,,I,,,listed-building-grade,I,,category,I
listed-building-grade,,700001,2021-11-22,,,II,,,listed-building-grade,II,,category,II
listed-building-grade,,700002,2021-11-22,,,II*,,,listed-building-grade,II*,,category,II*
953 changes: 953 additions & 0 deletions tests/data/listed-building/dataset/listed-building-outline-hoisted.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,documentation_url,listed_building_grade,locally_listed_building,wikidata,wikipedia
locally-listed-building,,41100447,2021-12-23,,"MULTIPOLYGON (((1.025641 51.354931,1.025825 51.355022,1.025780 51.355060,1.025370 51.354861,1.025418 51.354823,1.025641 51.354931)))","The Yard Cottage, The Street,, Barham, CT4 6NY",,POINT(1.025598 51.354941),locally-listed-building,,,geography,,,,,
53 changes: 47 additions & 6 deletions tests/integration/test_package_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,7 @@ def test_package_dataset(
output_dir = tmp_path.joinpath("dataset_output")
output_dir.mkdir()
sqlite_path = output_dir.joinpath(f"{dataset_name}.sqlite3")
sqlite_path.touch()
csv_path = output_dir.joinpath(f"{dataset_name}.csv")
csv_path.touch()

# Call
api = DigitalLandApi(
Expand All @@ -119,11 +117,54 @@ def test_package_dataset(
specification_dir=str(specification_path),
)
api.dataset_create_cmd(input_paths, sqlite_path, organisation_path)

api.dataset_dump_cmd(sqlite_path, csv_path)

# Assert
with csv_path.open() as actual, expected_csv_result.open() as expected:
actual_dict_list = list(DictReader(actual))
expected_dict_list = list(DictReader(expected))
assert actual_dict_list == expected_dict_list
actual_dict_reader = DictReader(actual)
expected_dict_reader = DictReader(expected)
assert actual_dict_reader.fieldnames == expected_dict_reader.fieldnames
assert list(actual_dict_reader) == list(expected_dict_reader)


@pytest.mark.parametrize(
    "dataset_name",
    [
        "listed-building-grade",
        "listed-building-outline",
        "locally-listed-building",
    ],
)
def test_package_dataset_hoisted(
    dataset_name,
    pipeline_dir,
    dataset_dir,
    tmp_path,
):
    """Hoisting a dataset CSV reproduces the expected `-hoisted.csv` fixture."""
    # Setup: the source CSV and the expected hoisted output both live in
    # the dataset fixtures directory.
    source_csv = dataset_dir.joinpath(f"{dataset_name}.csv")
    expected_csv = dataset_dir.joinpath(f"{dataset_name}-hoisted.csv")

    out_dir = tmp_path.joinpath("dataset_output")
    out_dir.mkdir()
    produced_csv = out_dir.joinpath(f"{dataset_name}-hoisted.csv")

    # Call
    api = DigitalLandApi(
        debug=False,
        dataset=dataset_name,
        pipeline_dir=pipeline_dir,
        specification_dir=str(specification_path),
    )
    api.dataset_dump_hoisted_cmd(source_csv, produced_csv)

    # Assert: identical header order and identical row content.
    with produced_csv.open() as got_file, expected_csv.open() as want_file:
        got = DictReader(got_file)
        want = DictReader(want_file)
        assert got.fieldnames == want.fieldnames
        assert list(got) == list(want)

0 comments on commit 3a47bea

Please sign in to comment.