Skip to content

Commit

Permalink
Merge pull request #88 from digital-land/2228-flatten-json-fields
Browse files Browse the repository at this point in the history
Create CSV artifact including json keys as top level fields
  • Loading branch information
Staberinde authored Apr 6, 2022
2 parents 9557ef2 + 28c58bd commit 3a47bea
Show file tree
Hide file tree
Showing 7 changed files with 1,056 additions and 8 deletions.
40 changes: 40 additions & 0 deletions digital_land/api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import OrderedDict
import csv
import itertools
import os
import sys
import json
Expand Down Expand Up @@ -244,6 +247,43 @@ def dataset_dump_cmd(self, input_path, output_path):
logging.info(cmd)
os.system(cmd)

def dataset_dump_hoisted_cmd(self, csv_path, hoisted_csv_path):
if not hoisted_csv_path:
hoisted_csv_path = csv_path.replace(".csv", "-hoisted.csv")

with open(csv_path, "r") as read_file, open(
hoisted_csv_path, "w+"
) as write_file:
reader = csv.DictReader(read_file)

spec_field_names = [
field.replace("-", "_")
for field in itertools.chain(
*[
self.specification.current_fieldnames(schema)
for schema in self.specification.dataset_schema[self.dataset]
]
)
]
reader_fieldnames = list(reader.fieldnames)
reader_fieldnames.remove("json")
hoisted_field_names = set(spec_field_names).difference(
set(reader_fieldnames)
)
# Make sure we put hoisted fieldnames last
field_names = reader_fieldnames + sorted(list(hoisted_field_names))

writer = csv.DictWriter(write_file, fieldnames=field_names)
writer.writeheader()
for row in reader:
row = OrderedDict(row)
json_string = row.pop("json") or "{}"
row.update(json.loads(json_string))
snake_case_row = dict(
[(key.replace("-", "_"), val) for key, val in row.items()]
)
writer.writerow(snake_case_row)

#
# configuration commands
#
Expand Down
11 changes: 10 additions & 1 deletion digital_land/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,16 @@ def dataset_create_cmd(input_paths, output_path, organisation_path):
@cli.command("dataset-entries", short_help="dump dataset entries as csv")
@input_output_path
def dataset_dump_cmd(input_path, output_path):
    """CLI wrapper: dump the dataset at *input_path* as CSV to *output_path*."""
    API.dataset_dump_cmd(input_path, output_path)


@cli.command(
    "dataset-entries-hoisted",
    short_help="dump dataset entries as csv with additional top-level `entity.json` fields",
)
@input_output_path
def dataset_dump_hoisted_cmd(input_path, output_path):
    """CLI wrapper: rewrite the CSV at *input_path* with its `json` column
    flattened into top-level fields, writing the result to *output_path*.
    """
    API.dataset_dump_hoisted_cmd(csv_path=input_path, hoisted_csv_path=output_path)


@cli.command("pipeline", short_help="process a resource")
Expand Down
1 change: 0 additions & 1 deletion digital_land/specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

class Specification:
def __init__(self, path="specification"):
self.dataset = {}
self.dataset = {}
self.dataset_names = []
self.schema = {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,listed_building_grade
listed-building-grade,,700000,2021-11-22,,,I,,,listed-building-grade,I,,category,I
listed-building-grade,,700001,2021-11-22,,,II,,,listed-building-grade,II,,category,II
listed-building-grade,,700002,2021-11-22,,,II*,,,listed-building-grade,II*,,category,II*
953 changes: 953 additions & 0 deletions tests/data/listed-building/dataset/listed-building-outline-hoisted.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset,end_date,entity,entry_date,geojson,geometry,name,organisation_entity,point,prefix,reference,start_date,typology,documentation_url,listed_building_grade,locally_listed_building,wikidata,wikipedia
locally-listed-building,,41100447,2021-12-23,,"MULTIPOLYGON (((1.025641 51.354931,1.025825 51.355022,1.025780 51.355060,1.025370 51.354861,1.025418 51.354823,1.025641 51.354931)))","The Yard Cottage, The Street,, Barham, CT4 6NY",,POINT(1.025598 51.354941),locally-listed-building,,,geography,,,,,
53 changes: 47 additions & 6 deletions tests/integration/test_package_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,7 @@ def test_package_dataset(
output_dir = tmp_path.joinpath("dataset_output")
output_dir.mkdir()
sqlite_path = output_dir.joinpath(f"{dataset_name}.sqlite3")
sqlite_path.touch()
csv_path = output_dir.joinpath(f"{dataset_name}.csv")
csv_path.touch()

# Call
api = DigitalLandApi(
Expand All @@ -119,11 +117,54 @@ def test_package_dataset(
specification_dir=str(specification_path),
)
api.dataset_create_cmd(input_paths, sqlite_path, organisation_path)

api.dataset_dump_cmd(sqlite_path, csv_path)

# Assert
with csv_path.open() as actual, expected_csv_result.open() as expected:
actual_dict_list = list(DictReader(actual))
expected_dict_list = list(DictReader(expected))
assert actual_dict_list == expected_dict_list
actual_dict_reader = DictReader(actual)
expected_dict_reader = DictReader(expected)
assert actual_dict_reader.fieldnames == expected_dict_reader.fieldnames
assert list(actual_dict_reader) == list(expected_dict_reader)


@pytest.mark.parametrize(
    "dataset_name",
    [
        "listed-building-grade",
        "listed-building-outline",
        "locally-listed-building",
    ],
)
def test_package_dataset_hoisted(
    dataset_name,
    pipeline_dir,
    dataset_dir,
    tmp_path,
):
    """Hoisting a dataset CSV reproduces the expected `-hoisted.csv` fixture."""
    # Setup: the source CSV and the expected hoisted output both live in
    # the dataset fixtures directory.
    source_csv = dataset_dir.joinpath(f"{dataset_name}.csv")
    expected_csv = dataset_dir.joinpath(f"{dataset_name}-hoisted.csv")

    out_dir = tmp_path.joinpath("dataset_output")
    out_dir.mkdir()
    produced_csv = out_dir.joinpath(f"{dataset_name}-hoisted.csv")

    # Call
    api = DigitalLandApi(
        debug=False,
        dataset=dataset_name,
        pipeline_dir=pipeline_dir,
        specification_dir=str(specification_path),
    )
    api.dataset_dump_hoisted_cmd(source_csv, produced_csv)

    # Assert: identical header order and identical row content.
    with produced_csv.open() as got_file, expected_csv.open() as want_file:
        got = DictReader(got_file)
        want = DictReader(want_file)
        assert got.fieldnames == want.fieldnames
        assert list(got) == list(want)

0 comments on commit 3a47bea

Please sign in to comment.