From 977a8a8291eb371eab722103b09ac31ff51792ca Mon Sep 17 00:00:00 2001 From: James Bannister Date: Fri, 5 Apr 2024 15:32:02 +0100 Subject: [PATCH 01/58] Updated --- .../checkpoints/converted_resource.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index e735a41d..f2eebfec 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -5,9 +5,75 @@ # a checkpoint represents the moment in the process where we tell it the # type of data it is validating and where the data is # the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) +from pathlib import Path +import csv +import re from .base import BaseCheckpoint -class CovertedResourceCheckpoint(BaseCheckpoint): - def load(): - pass +class ConvertedResourceCheckpoint(BaseCheckpoint): + def __init__(self, data_path): + super().__init__("converted_resource", data_path) + self.csv_path = Path(data_path) + + def load(self): + self.expectations = [ + { + "function": self.check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": self.validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, + ] + + def check_for_duplicate_references(self): + duplicates = {} + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "duplicate_reference", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "rows": rows, + "reference": ref, + } + ) + + return True, "Checked for duplicate references.", issues + + def validate_references(self): + pattern = re.compile(r"^REF-\d+$") + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not pattern.match(ref): + issues.append( + { + "scope": "invalid_reference", + "message": f"Invalid reference '{ref}' on row {row_number}.", + "row": row_number, + "reference": ref, + } + ) + + return len(issues) == 0, "Checked for invalid references.", issues From dec636c5e42ccd0b77f70d8ff786886094f969e7 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 9 Apr 2024 14:41:41 +0100 Subject: [PATCH 02/58] Added unit tests and integrated into convert --- digital_land/phase/convert.py | 25 +++++++++ .../expectations/test_checkpoint.py | 53 ++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 091fa006..8e057e7e 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,6 +11,9 @@ import pandas as pd from .load import Stream from .phase import Phase +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) def detect_file_encoding(path): @@ -187,12 +190,34 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() + 
self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader + def run_checkpoint(self, path): + checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint.load() + checkpoint_result, issues = checkpoint.run() + + if issues: + for issue in issues: + log_message = self.format_issue_message(issue) + + if issue["severity"] == "error": + logging.error(log_message) + elif issue["severity"] == "warning": + logging.warning(log_message) + else: + logging.info(log_message) + else: + logging.info(f"Checkpoint completed with result: {checkpoint_result}") + + def format_issue_message(self, issue): + return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" + def _find_zip_file(self, input_file, suffix=".gml"): zip_ = zipfile.ZipFile(input_file) files = zip_.namelist() diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 6f5e4caa..e0efc11f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -2,8 +2,11 @@ import os import spatialite import pandas as pd -from csv import DictReader +from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) @pytest.fixture @@ -43,6 +46,22 @@ def sqlite3_with_entity_tables_path(tmp_path): return dataset_path +@pytest.fixture +def csv_path(tmp_path): + data = [ + {"reference": "REF-001", "name": "Test 1"}, + {"reference": "REF-002", "name": "Test 2"}, + {"reference": "REF-001", "name": "Test 3"}, # Duplicate + {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + ] + csv_file = tmp_path / "test_data.csv" + with csv_file.open(mode="w", newline="") as f: + writer = DictWriter(f, fieldnames=["reference", "name"]) + writer.writeheader() + writer.writerows(data) + return csv_file + + def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) @@ -126,3 +145,35 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): assert issues[0]["rows"] == "" assert issues[0]["row"] != "" # Just check it's there assert issues[0]["value"] == "" + + +def test_check_for_duplicate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.check_for_duplicate_references() + + assert success is True, "The function should successfully identify issues." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "duplicate_reference" + ), "The issue should be identified as a duplicate reference." + assert ( + "REF-001" in issues[0]["message"] + ), "REF-001 should be identified as a duplicate." + + +def test_validate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.validate_references() + + assert success is False, "The function should fail due to invalid references." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "invalid_reference" + ), "The issue should be identified as an invalid reference." 
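# A minimal standalone sketch (not a line of this patch): at this point in the
# series validate_references checks references against r"^REF-\d+$", which is
# why "INVALID-003" is expected to be reported as invalid below.
import re
assert re.match(r"^REF-\d+$", "REF-002") is not None
assert re.match(r"^REF-\d+$", "INVALID-003") is None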
+ assert ( + "INVALID-003" in issues[0]["message"] + ), "INVALID-003 should be identified as invalid." From 113dbefed0da8f9282b21fb9183352306ccb2d5f Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:19:05 +0100 Subject: [PATCH 03/58] Updated verification --- .../expectations/checkpoints/converted_resource.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f2eebfec..206eecb8 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,7 +7,6 @@ # the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path import csv -import re from .base import BaseCheckpoint @@ -59,21 +58,21 @@ def check_for_duplicate_references(self): return True, "Checked for duplicate references.", issues def validate_references(self): - pattern = re.compile(r"^REF-\d+$") issues = [] with self.csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not pattern.match(ref): + # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", - "message": f"Invalid reference '{ref}' on row {row_number}.", + "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, + "reference": ref, # This will be None or '' } ) - return len(issues) == 0, "Checked for invalid references.", issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 77b6cbc0920f67a02563cef468ceaac9d7a33753 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:29:16 +0100 Subject: [PATCH 04/58] Adjust issue factory --- digital_land/expectations/issue.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 80a750cd..aad74641 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,6 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, + "duplicate_reference": RowIssue, + "invalid_reference": ValueIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] From 0b1f12f55883ad920a772bb7251732a91ac0e63e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:05:42 +0100 Subject: [PATCH 05/58] Issue adjustments --- .../checkpoints/converted_resource.py | 14 ++++++-- digital_land/expectations/issue.py | 32 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 206eecb8..43671a56 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -52,6 +52,11 @@ def check_for_duplicate_references(self): "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", "rows": rows, "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(rows[0]), + "value": ref, + "organisation": "organisation", } ) @@ -64,14 +69,19 @@ def validate_references(self): reader = csv.DictReader(csvfile) for row_number, row in 
enumerate(reader, start=1): ref = row.get("reference") - # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, # This will be None or '' + "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(row_number), + "value": ref, + "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index aad74641..9ccad93b 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": RowIssue, - "invalid_reference": ValueIssue, + "duplicate_reference": DuplicateReferenceIssue, + "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,3 +131,31 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class DuplicateReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + duplicated_value: str = field(metadata=config(field_name="duplicated_value")) + rows: list = field(metadata=config(field_name="rows")) + organisation: str + + def __post_init__(self): + issue_scope = "duplicate_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class InvalidReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + invalid_value: str = field(metadata=config(field_name="invalid_value")) + row_id: str = field(metadata=config(field_name="row_id")) + organisation: str + + def __post_init__(self): + issue_scope = "invalid_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") From 9f683107cf6af628b252f135a876b8a4befe1927 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:09:51 +0100 Subject: [PATCH 06/58] Changed value --- digital_land/expectations/checkpoints/converted_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 43671a56..8a952d5e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,7 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +80,7 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) From b71b479d0ae921db123dbb2f67f976f31cda5704 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:15:26 +0100 Subject: [PATCH 07/58] Value changes --- digital_land/expectations/checkpoints/converted_resource.py | 2 -- digital_land/expectations/issue.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8a952d5e..8e7f1727 100644 --- 
a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,6 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +79,6 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "invalid_value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 9ccad93b..b93ca030 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -137,7 +137,6 @@ def __post_init__(self): class DuplicateReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - duplicated_value: str = field(metadata=config(field_name="duplicated_value")) rows: list = field(metadata=config(field_name="rows")) organisation: str @@ -151,7 +150,6 @@ def __post_init__(self): class InvalidReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - invalid_value: str = field(metadata=config(field_name="invalid_value")) row_id: str = field(metadata=config(field_name="row_id")) organisation: str From d8ef949369a06f6c694df3686750d9a403bcbe44 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:22:06 +0100 Subject: [PATCH 08/58] Adjust convert.py --- digital_land/phase/convert.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 8e057e7e..74da23eb 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -200,7 +200,14 @@ def _read_text_file(self, input_path, encoding): def run_checkpoint(self, path): checkpoint = ConvertedResourceCheckpoint(data_path=path) checkpoint.load() - checkpoint_result, issues = checkpoint.run() + result = checkpoint.run() + + # Check if the result is not None and is iterable (unpackable) + if result is not None and isinstance(result, tuple) and len(result) == 2: + checkpoint_result, issues = result + else: + logging.error("Checkpoint did not return the expected result format.") + return if issues: for issue in issues: From 9fde2aeb13b7683735e3310081b4b531f3713670 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:28:53 +0100 Subject: [PATCH 09/58] Test fixes --- tests/integration/expectations/test_checkpoint.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index e0efc11f..3ab31bdc 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -52,7 +52,7 @@ def csv_path(tmp_path): {"reference": "REF-001", "name": "Test 1"}, {"reference": "REF-002", "name": "Test 2"}, {"reference": "REF-001", "name": "Test 3"}, # Duplicate - {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + {"reference": "", "name": "Test 4"}, # Invalid format ] csv_file = tmp_path / "test_data.csv" with csv_file.open(mode="w", newline="") as f: @@ -174,6 +174,4 @@ def test_validate_references(csv_path): assert ( issues[0]["scope"] == "invalid_reference" ), "The issue should be identified as an invalid reference." - assert ( - "INVALID-003" in issues[0]["message"] - ), "INVALID-003 should be identified as invalid." 
+ assert "" in issues[0]["message"], " 4th value should be identified as invalid." From 45b0e11a0d14f5e14c88746298867e8705179f8c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:21:33 +0100 Subject: [PATCH 10/58] Chanegs to issues --- .../checkpoints/converted_resource.py | 13 ++++----- digital_land/expectations/issue.py | 28 ------------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8e7f1727..73b666cf 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -48,12 +48,11 @@ def check_for_duplicate_references(self): if len(rows) > 1: issues.append( { - "scope": "duplicate_reference", + "scope": "row-group", "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "rows": rows, - "reference": ref, "dataset": "dataset", - "field_name": "reference", + "table_name": "resource", + "rows": rows, "row_id": str(rows[0]), "organisation": "organisation", } @@ -72,13 +71,13 @@ def validate_references(self): if not ref: # This will be True for both None and empty strings issues.append( { - "scope": "invalid_reference", + "scope": "value", "message": f"Reference is missing on row {row_number}.", - "row": row_number, - "reference": ref, "dataset": "dataset", + "table_name": "resource", "field_name": "reference", "row_id": str(row_number), + "value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index b93ca030..80a750cd 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,6 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": DuplicateReferenceIssue, - "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,29 +129,3 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class DuplicateReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - rows: list = field(metadata=config(field_name="rows")) - organisation: str - - def __post_init__(self): - issue_scope = "duplicate_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class InvalidReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - row_id: str = field(metadata=config(field_name="row_id")) - organisation: str - - def __post_init__(self): - issue_scope = "invalid_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") From 24594c2b6cc01ce8ed762f837fa31ec61d129c99 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:35:25 +0100 Subject: [PATCH 11/58] Change to reference --- digital_land/expectations/checkpoints/converted_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 73b666cf..512a8dce 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -77,7 +77,7 @@ def 
validate_references(self): "table_name": "resource", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "value": "reference", "organisation": "organisation", } ) From adddaa4fc21a71afa2ac62bb0bae7b2a4cbba697 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 14:39:35 +0100 Subject: [PATCH 12/58] Separate functions and correct tests --- .../checkpoints/converted_resource.py | 69 ++----------------- .../resource_validations.py | 56 +++++++++++++++ .../expectations/test_checkpoint.py | 23 +++---- 3 files changed, 71 insertions(+), 77 deletions(-) create mode 100644 digital_land/expectations/expectation_functions/resource_validations.py diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 512a8dce..f00f24fc 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,13 +1,9 @@ -# checkpoint needs to assemble class state -# it needs to validate inputs specific for that checkpoint -# it then needs to run expectations -# then it needs to be able to save those expectation resultts -# a checkpoint represents the moment in the process where we tell it the -# type of data it is validating and where the data is -# the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path -import csv from .base import BaseCheckpoint +from ..expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, +) class ConvertedResourceCheckpoint(BaseCheckpoint): @@ -18,68 +14,15 @@ def __init__(self, data_path): def load(self): self.expectations = [ { - "function": self.check_for_duplicate_references, + "function": check_for_duplicate_references(self.csv_path), "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", }, { - "function": self.validate_references, + "function": validate_references(self.csv_path), "name": "Validate References", "severity": "error", "responsibility": "system", }, ] - - def check_for_duplicate_references(self): - duplicates = {} - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - def validate_references(self): - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "reference", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues diff --git 
a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py new file mode 100644 index 00000000..23150be1 --- /dev/null +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -0,0 +1,56 @@ +import csv + + +def check_for_duplicate_references(csv_path): + duplicates = {} + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return issues + + +def validate_references(csv_path): + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return issues diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 3ab31bdc..f7d8d7c8 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -4,8 +4,9 @@ import pandas as pd from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, +from digital_land.expectations.expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, ) @@ -148,15 +149,12 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.check_for_duplicate_references() + issues = check_for_duplicate_references(csv_path) - assert success is True, "The function should successfully identify issues." + assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." assert ( - issues[0]["scope"] == "duplicate_reference" + issues[0]["scope"] == "row-group" ), "The issue should be identified as a duplicate reference." assert ( "REF-001" in issues[0]["message"] @@ -164,14 +162,11 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.validate_references() + issues = validate_references(csv_path) - assert success is False, "The function should fail due to invalid references." + assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." 
assert ( - issues[0]["scope"] == "invalid_reference" + issues[0]["scope"] == "value" ), "The issue should be identified as an invalid reference." assert "" in issues[0]["message"], " 4th value should be identified as invalid." From dab0d77a346236438c03fefcc8eef76184ea8398 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:02:58 +0100 Subject: [PATCH 13/58] Changes back to helpers --- .../expectations/checkpoints/converted_resource.py | 11 ++++++++--- .../expectation_functions/resource_validations.py | 7 ++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f00f24fc..d82726c7 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,9 +7,14 @@ class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, data_path): - super().__init__("converted_resource", data_path) - self.csv_path = Path(data_path) + def __init__(self, dataset_path, typology, dataset=None): + super().__init__("converted_resource", dataset_path) + self.csv_path = Path(dataset_path) + if dataset: + self.dataset = dataset + else: + self.dataset = self.csv_path.stem + self.typology = typology def load(self): self.expectations = [ diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 23150be1..2acbe669 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -4,7 +4,6 @@ def check_for_duplicate_references(csv_path): duplicates = {} issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): @@ -28,17 +27,15 @@ def check_for_duplicate_references(csv_path): } ) - return issues + return True, "Checked for duplicate references.", issues def validate_references(csv_path): issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not ref: # This will be True for both None and empty strings issues.append( { @@ -53,4 +50,4 @@ def validate_references(csv_path): } ) - return issues + return len(issues) == 0, "Checked for unpopulated references.", issues From f162dcf87ace42eb8ccb3a6d4d62e0cedb138ec1 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:07:19 +0100 Subject: [PATCH 14/58] Fix --- digital_land/phase/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 74da23eb..303609f9 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -198,7 +198,7 @@ def _read_text_file(self, input_path, encoding): return reader def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint = ConvertedResourceCheckpoint(dataset_path=path) checkpoint.load() result = checkpoint.run() From c1f9081434daa457fcf9732c7dbbf4b123bc228e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:41:53 +0100 Subject: [PATCH 15/58] Core changes --- digital_land/commands.py | 11 +++ .../checkpoints/converted_resource.py | 71 ++++++++++++++----- digital_land/expectations/commands.py | 4 +- 
digital_land/phase/convert.py | 29 -------- digital_land/phase/post_conversion.py | 38 ++++++++++ .../expectations/test_checkpoint.py | 4 +- 6 files changed, 106 insertions(+), 51 deletions(-) create mode 100644 digital_land/phase/post_conversion.py diff --git a/digital_land/commands.py b/digital_land/commands.py index ad9d05b1..07d7c488 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -19,6 +19,7 @@ from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase from digital_land.phase.default import DefaultPhase from digital_land.phase.dump import DumpPhase from digital_land.phase.factor import FactorPhase @@ -162,6 +163,16 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), + PostConversionPhase( + converted_resource_path=os.path.join( + custom_temp_dir, f"{resource}_converted.csv" + ), + output_dir=os.path.join( + os.path.dirname(output_path), "post_conversion_outputs" + ), + dataset=dataset, + typology=specification.get_typology_for_dataset(dataset), + ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index d82726c7..59c1c307 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,33 +1,68 @@ from pathlib import Path from .base import BaseCheckpoint +from ..utils import QueryRunner +import os from ..expectation_functions.resource_validations import ( check_for_duplicate_references, validate_references, ) +# Define BASE expectations which should always run +BASE = [ + { + "function": check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, +] + +# Empty TYPOLOGY and DATASET for now as per advice +TYPOLOGY = {} +DATASET = {} + class ConvertedResourceCheckpoint(BaseCheckpoint): def __init__(self, dataset_path, typology, dataset=None): super().__init__("converted_resource", dataset_path) self.csv_path = Path(dataset_path) - if dataset: - self.dataset = dataset - else: - self.dataset = self.csv_path.stem + self.dataset = dataset if dataset else self.csv_path.stem self.typology = typology def load(self): - self.expectations = [ - { - "function": check_for_duplicate_references(self.csv_path), - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - }, - { - "function": validate_references(self.csv_path), - "name": "Validate References", - "severity": "error", - "responsibility": "system", - }, - ] + self.expectations = [] + self.expectations.extend(BASE) + typology_expectations = TYPOLOGY.get(self.typology, []) + dataset_expectations = DATASET.get(self.dataset, []) + + # Extend the expectations list with relevant typology and dataset-specific expectations + if typology_expectations: + self.expectations.extend(typology_expectations) + if dataset_expectations: + self.expectations.extend(dataset_expectations) + + # Assign a QueryRunner instance to each expectation + for expectation in 
self.expectations: + expectation["query_runner"] = QueryRunner(self.csv_path) + + def save(self, output_dir, format="csv"): + responses_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + ) + issues_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-issues.csv" + ) + + self.save_responses( + self.responses, + responses_file_path, + format=format, + ) + + self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index d16c6533..7b7f7922 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,5 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import CovertedResourceCheckpoint +from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -30,7 +30,7 @@ def run_converted_resource_checkpoint( """ Function to run the expectation checkpoint for a converted resource """ - checkpoint = CovertedResourceCheckpoint(converted_resource_path, dataset, typology) + checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) checkpoint.load() checkpoint.run() checkpoint.save(output_dir, format="csv") diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 303609f9..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,9 +11,6 @@ import pandas as pd from .load import Stream from .phase import Phase -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, -) def detect_file_encoding(path): @@ -190,38 +187,12 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() - self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader - def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(dataset_path=path) - checkpoint.load() - result = checkpoint.run() - - # Check if the result is not None and is iterable (unpackable) - if result is not None and isinstance(result, tuple) and len(result) == 2: - checkpoint_result, issues = result - else: - logging.error("Checkpoint did not return the expected result format.") - return - - if issues: - for issue in issues: - log_message = self.format_issue_message(issue) - - if issue["severity"] == "error": - logging.error(log_message) - elif issue["severity"] == "warning": - logging.warning(log_message) - else: - logging.info(log_message) - else: - logging.info(f"Checkpoint completed with result: {checkpoint_result}") - def format_issue_message(self, issue): return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py new file mode 100644 index 00000000..801aaed8 --- /dev/null +++ b/digital_land/phase/post_conversion.py @@ -0,0 +1,38 @@ +from expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + """ + Initializes the PostConversionPhase with necessary parameters. + :param converted_resource_path: Path to the converted CSV file. + :param output_dir: Directory to store output files. + :param dataset: Dataset related information for the checkpoint. 
+ :param typology: Typology information for the checkpoint. + :param act_on_critical_error: Whether to act on critical errors during the checkpoint. + """ + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index f7d8d7c8..9276d20f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -149,7 +149,7 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - issues = check_for_duplicate_references(csv_path) + _, _, issues = check_for_duplicate_references(csv_path) assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." @@ -162,7 +162,7 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - issues = validate_references(csv_path) + _, _, issues = validate_references(csv_path) assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." From c1c218c20049dd89244cc3694eb6262a6b4429c1 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:49:24 +0100 Subject: [PATCH 16/58] Import change --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 801aaed8..e312644d 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,4 +1,4 @@ -from expectations.commands import run_converted_resource_checkpoint +from ..expectations.commands import run_converted_resource_checkpoint class PostConversionPhase: From a046e7d592ec662230c761516369d26604a3caf9 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:03:10 +0100 Subject: [PATCH 17/58] Parameter changes --- digital_land/commands.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07d7c488..ca9224cf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,6 +128,7 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) @@ -164,12 +165,8 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - converted_resource_path=os.path.join( - custom_temp_dir, f"{resource}_converted.csv" - ), - output_dir=os.path.join( - os.path.dirname(output_path), "post_conversion_outputs" - ), + converted_resource_path=converted_resource_path, + output_dir=output_path, dataset=dataset, typology=specification.get_typology_for_dataset(dataset), ), From 
7769264b967cd000923c52c0d62eb9c3e096fcdf Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:39:52 +0100 Subject: [PATCH 18/58] Changes to convert --- digital_land/commands.py | 11 +++++++++++ digital_land/phase/convert.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index ca9224cf..1c719b63 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -158,6 +158,17 @@ def pipeline_run( if entry_date: default_values["entry-date"] = entry_date + convert_phase = ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ) + + # Execute the ConvertPhase to set the converted_resource_path + convert_phase.process() + converted_resource_path = convert_phase.converted_resource_path + run_pipeline( ConvertPhase( path=input_path, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index b57c22c1..9cd99f45 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,6 +125,9 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" + self.converted_resource_path = ( + None # This will hold the path to the converted file + ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: @@ -155,6 +158,8 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) + if self.output_path: + self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From cc64e30aaa929fbab3619f72316d852308eab61c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:43:39 +0100 Subject: [PATCH 19/58] Fix --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1c719b63..8c4767dc 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,7 +128,6 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) From e1311643be429533c639eee5ed7bc14babdd492c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:49:43 +0100 Subject: [PATCH 20/58] Typology change --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8c4767dc..9062c36c 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -178,7 +178,7 @@ def pipeline_run( converted_resource_path=converted_resource_path, output_dir=output_path, dataset=dataset, - typology=specification.get_typology_for_dataset(dataset), + typology=specification.get_dataset_typology(dataset), ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From cc983ce72603357b812370aa4d8cef90f9f21dc7 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:55:08 +0100 Subject: [PATCH 21/58] Add Process --- digital_land/commands.py | 5 ----- digital_land/phase/post_conversion.py | 11 +++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py 
index 9062c36c..a0f3fc26 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -169,11 +169,6 @@ def pipeline_run( converted_resource_path = convert_phase.converted_resource_path run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), PostConversionPhase( converted_resource_path=converted_resource_path, output_dir=output_path, diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index e312644d..2216f8dd 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -10,20 +10,15 @@ def __init__( typology, act_on_critical_error=False, ): - """ - Initializes the PostConversionPhase with necessary parameters. - :param converted_resource_path: Path to the converted CSV file. - :param output_dir: Directory to store output files. - :param dataset: Dataset related information for the checkpoint. - :param typology: Typology information for the checkpoint. - :param act_on_critical_error: Whether to act on critical errors during the checkpoint. - """ self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error + def process(self): + return self.run() + def run(self): """ Executes the converted resource checkpoint using the provided parameters. From d26369f8ec93f141df6ebb0febc04552b74f9eed Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:00:24 +0100 Subject: [PATCH 22/58] Add process parameter --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 2216f8dd..00dcdd77 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -16,7 +16,7 @@ def __init__( self.typology = typology self.act_on_critical_error = act_on_critical_error - def process(self): + def process(self, stream=None): return self.run() def run(self): From fceb81ac1ebbba744ff341bd9ece901b748c41b6 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:09:35 +0100 Subject: [PATCH 23/58] Query runner adjustments --- .../expectation_functions/resource_validations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 2acbe669..c6acae74 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,7 +1,7 @@ import csv -def check_for_duplicate_references(csv_path): +def check_for_duplicate_references(csv_path, **kwargs): duplicates = {} issues = [] with csv_path.open(newline="") as csvfile: @@ -30,7 +30,7 @@ def check_for_duplicate_references(csv_path): return True, "Checked for duplicate references.", issues -def validate_references(csv_path): +def validate_references(csv_path, **kwargs): issues = [] with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) From 324b2c132aeb50b50f5a218f4ae077c5624bd81e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:15:47 +0100 Subject: [PATCH 24/58] Fix converted resource --- digital_land/expectations/checkpoints/converted_resource.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 59c1c307..14be3c21 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -14,12 +14,14 @@ "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, { "function": validate_references, "name": "Validate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, ] @@ -49,6 +51,7 @@ def load(self): # Assign a QueryRunner instance to each expectation for expectation in self.expectations: + expectation["csv_path"] = self.csv_path expectation["query_runner"] = QueryRunner(self.csv_path) def save(self, output_dir, format="csv"): From 1c5d64071e1c7c9f2d586be6c6a5b67763722bbe Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 17 Apr 2024 16:33:12 +0100 Subject: [PATCH 25/58] Change pathing --- digital_land/commands.py | 1647 +++++++++++++++++++------------------- 1 file changed, 821 insertions(+), 826 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index a0f3fc26..07befebf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,826 +1,821 @@ -from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the 
sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - convert_phase = ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ) - - # Execute the ConvertPhase to set the converted_resource_path - convert_phase.process() - converted_resource_path = convert_phase.converted_resource_path - - 
run_pipeline( - PostConversionPhase( - converted_resource_path=converted_resource_path, - output_dir=output_path, - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if 
isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." 
- ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." 
- ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=output_path, + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + NormalisePhase(skip_patterns=skip_patterns, 
null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from {csv_path}") + sys.exit(-1) + + 
flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") From 92a4ae4ed9250c2c1df9c524eaf54ebf7f171016 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Fri, 5 Apr 2024 15:32:02 +0100 Subject: [PATCH 26/58] Updated --- .../checkpoints/converted_resource.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index e735a41d..f2eebfec 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -5,9 +5,75 @@ # a checkpoint represents the moment in the process where we tell it the # type of data it is validating and where the data is # the primary different between checkpoints is how it loads expectations (i.e. 
where that are loaded from) +from pathlib import Path +import csv +import re from .base import BaseCheckpoint -class CovertedResourceCheckpoint(BaseCheckpoint): - def load(): - pass +class ConvertedResourceCheckpoint(BaseCheckpoint): + def __init__(self, data_path): + super().__init__("converted_resource", data_path) + self.csv_path = Path(data_path) + + def load(self): + self.expectations = [ + { + "function": self.check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": self.validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, + ] + + def check_for_duplicate_references(self): + duplicates = {} + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "duplicate_reference", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "rows": rows, + "reference": ref, + } + ) + + return True, "Checked for duplicate references.", issues + + def validate_references(self): + pattern = re.compile(r"^REF-\d+$") + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not pattern.match(ref): + issues.append( + { + "scope": "invalid_reference", + "message": f"Invalid reference '{ref}' on row {row_number}.", + "row": row_number, + "reference": ref, + } + ) + + return len(issues) == 0, "Checked for invalid references.", issues From f1e0d7aa55f963092e71b9dbcf9e4970651bf184 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 9 Apr 2024 14:41:41 +0100 Subject: [PATCH 27/58] Added unit tests and integrated into convert --- digital_land/phase/convert.py | 25 +++++++++ .../expectations/test_checkpoint.py | 53 ++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 091fa006..8e057e7e 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,6 +11,9 @@ import pandas as pd from .load import Stream from .phase import Phase +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) def detect_file_encoding(path): @@ -187,12 +190,34 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() + self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader + def run_checkpoint(self, path): + checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint.load() + checkpoint_result, issues = checkpoint.run() + + if issues: + for issue in issues: + log_message = self.format_issue_message(issue) + + if issue["severity"] == "error": + logging.error(log_message) + elif issue["severity"] == "warning": + logging.warning(log_message) + else: + logging.info(log_message) + else: + logging.info(f"Checkpoint completed with result: {checkpoint_result}") + + def format_issue_message(self, issue): + return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" + def _find_zip_file(self, input_file, 
suffix=".gml"): zip_ = zipfile.ZipFile(input_file) files = zip_.namelist() diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 35984f6d..c5c2443f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -2,8 +2,11 @@ import os import spatialite import pandas as pd -from csv import DictReader +from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) @pytest.fixture @@ -43,6 +46,22 @@ def sqlite3_with_entity_tables_path(tmp_path): return dataset_path +@pytest.fixture +def csv_path(tmp_path): + data = [ + {"reference": "REF-001", "name": "Test 1"}, + {"reference": "REF-002", "name": "Test 2"}, + {"reference": "REF-001", "name": "Test 3"}, # Duplicate + {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + ] + csv_file = tmp_path / "test_data.csv" + with csv_file.open(mode="w", newline="") as f: + writer = DictWriter(f, fieldnames=["reference", "name"]) + writer.writeheader() + writer.writerows(data) + return csv_file + + def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) @@ -132,3 +151,35 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): assert issues[0]["rows"] == "" assert issues[0]["row"] != "" # Just check it's there assert issues[0]["value"] == "" + + +def test_check_for_duplicate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.check_for_duplicate_references() + + assert success is True, "The function should successfully identify issues." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "duplicate_reference" + ), "The issue should be identified as a duplicate reference." + assert ( + "REF-001" in issues[0]["message"] + ), "REF-001 should be identified as a duplicate." + + +def test_validate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.validate_references() + + assert success is False, "The function should fail due to invalid references." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "invalid_reference" + ), "The issue should be identified as an invalid reference." + assert ( + "INVALID-003" in issues[0]["message"] + ), "INVALID-003 should be identified as invalid." From d4c98c06534a6ec5c2783bf4e23f1527bf2ec6fb Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:19:05 +0100 Subject: [PATCH 28/58] Updated verification --- .../expectations/checkpoints/converted_resource.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f2eebfec..206eecb8 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,7 +7,6 @@ # the primary different between checkpoints is how it loads expectations (i.e. 
where that are loaded from) from pathlib import Path import csv -import re from .base import BaseCheckpoint @@ -59,21 +58,21 @@ def check_for_duplicate_references(self): return True, "Checked for duplicate references.", issues def validate_references(self): - pattern = re.compile(r"^REF-\d+$") issues = [] with self.csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not pattern.match(ref): + # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", - "message": f"Invalid reference '{ref}' on row {row_number}.", + "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, + "reference": ref, # This will be None or '' } ) - return len(issues) == 0, "Checked for invalid references.", issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 2666a59fe21743a7151761c280f1afc04e67cc34 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:29:16 +0100 Subject: [PATCH 29/58] Adjust issue factory --- digital_land/expectations/issue.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 68cd0ae8..1d6d5a1b 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,6 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, + "duplicate_reference": RowIssue, + "invalid_reference": ValueIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] From 518196adb41389371ad700a8367c74b39238b955 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:05:42 +0100 Subject: [PATCH 30/58] Issue adjustments --- .../checkpoints/converted_resource.py | 14 ++++++-- digital_land/expectations/issue.py | 32 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 206eecb8..43671a56 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -52,6 +52,11 @@ def check_for_duplicate_references(self): "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", "rows": rows, "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(rows[0]), + "value": ref, + "organisation": "organisation", } ) @@ -64,14 +69,19 @@ def validate_references(self): reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, # This will be None or '' + "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(row_number), + "value": ref, + "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 1d6d5a1b..75718d32 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": 
RowIssue, "value": ValueIssue, - "duplicate_reference": RowIssue, - "invalid_reference": ValueIssue, + "duplicate_reference": DuplicateReferenceIssue, + "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,3 +131,31 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class DuplicateReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + duplicated_value: str = field(metadata=config(field_name="duplicated_value")) + rows: list = field(metadata=config(field_name="rows")) + organisation: str + + def __post_init__(self): + issue_scope = "duplicate_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class InvalidReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + invalid_value: str = field(metadata=config(field_name="invalid_value")) + row_id: str = field(metadata=config(field_name="row_id")) + organisation: str + + def __post_init__(self): + issue_scope = "invalid_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") From febdbace578f93372f3b78c82386dbacd2af504e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:09:51 +0100 Subject: [PATCH 31/58] Changed value --- digital_land/expectations/checkpoints/converted_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 43671a56..8a952d5e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,7 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +80,7 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) From 4efc16dd2732ed141e4fff272f26e866bc1d605b Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:15:26 +0100 Subject: [PATCH 32/58] Value changes --- digital_land/expectations/checkpoints/converted_resource.py | 2 -- digital_land/expectations/issue.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8a952d5e..8e7f1727 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,6 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +79,6 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "invalid_value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 75718d32..dc45a1c3 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -137,7 +137,6 @@ def __post_init__(self): class DuplicateReferenceIssue(Issue): 
dataset: str field_name: str = field(metadata=config(field_name="field_name")) - duplicated_value: str = field(metadata=config(field_name="duplicated_value")) rows: list = field(metadata=config(field_name="rows")) organisation: str @@ -151,7 +150,6 @@ def __post_init__(self): class InvalidReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - invalid_value: str = field(metadata=config(field_name="invalid_value")) row_id: str = field(metadata=config(field_name="row_id")) organisation: str From 91e5c189a72a37aa2e8fce9f60e9b6319d38a820 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:22:06 +0100 Subject: [PATCH 33/58] Adjust convert.py --- digital_land/phase/convert.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 8e057e7e..74da23eb 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -200,7 +200,14 @@ def _read_text_file(self, input_path, encoding): def run_checkpoint(self, path): checkpoint = ConvertedResourceCheckpoint(data_path=path) checkpoint.load() - checkpoint_result, issues = checkpoint.run() + result = checkpoint.run() + + # Check if the result is not None and is iterable (unpackable) + if result is not None and isinstance(result, tuple) and len(result) == 2: + checkpoint_result, issues = result + else: + logging.error("Checkpoint did not return the expected result format.") + return if issues: for issue in issues: From 238607f148b03a31a62e9d4c526480ef26022100 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:28:53 +0100 Subject: [PATCH 34/58] Test fixes --- tests/integration/expectations/test_checkpoint.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index c5c2443f..62a78568 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -52,7 +52,7 @@ def csv_path(tmp_path): {"reference": "REF-001", "name": "Test 1"}, {"reference": "REF-002", "name": "Test 2"}, {"reference": "REF-001", "name": "Test 3"}, # Duplicate - {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + {"reference": "", "name": "Test 4"}, # Invalid format ] csv_file = tmp_path / "test_data.csv" with csv_file.open(mode="w", newline="") as f: @@ -180,6 +180,4 @@ def test_validate_references(csv_path): assert ( issues[0]["scope"] == "invalid_reference" ), "The issue should be identified as an invalid reference." - assert ( - "INVALID-003" in issues[0]["message"] - ), "INVALID-003 should be identified as invalid." + assert "" in issues[0]["message"], " 4th value should be identified as invalid." 
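A minimal usage sketch of the converted-resource checkpoint wired up in the patches above, assuming BaseCheckpoint.run() returns a (result, issues) pair as ConvertPhase.run_checkpoint expects (that base class is not shown in this series); the CSV path is illustrative:

import logging

from digital_land.expectations.checkpoints.converted_resource import (
    ConvertedResourceCheckpoint,
)

# Point the checkpoint at a converted CSV (illustrative path) and register the
# duplicate-reference and reference-validation expectations.
checkpoint = ConvertedResourceCheckpoint(data_path="var/converted/example.csv")
checkpoint.load()

# run() is assumed to return (checkpoint_result, issues); the guard mirrors the
# defensive unpacking added to ConvertPhase.run_checkpoint in convert.py above.
result = checkpoint.run()
if result is not None and isinstance(result, tuple) and len(result) == 2:
    checkpoint_result, issues = result
    for issue in issues:
        # each issue dict produced by the expectation functions carries at
        # least "scope" and "message"
        logging.warning("Checkpoint issue: %s", issue.get("message"))
else:
    logging.error("Checkpoint did not return the expected (result, issues) pair.")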
From d3ecda39cc59d7318c5cf227c2dac91215e98f80 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:21:33 +0100 Subject: [PATCH 35/58] Chanegs to issues --- .../checkpoints/converted_resource.py | 13 ++++----- digital_land/expectations/issue.py | 28 ------------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8e7f1727..73b666cf 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -48,12 +48,11 @@ def check_for_duplicate_references(self): if len(rows) > 1: issues.append( { - "scope": "duplicate_reference", + "scope": "row-group", "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "rows": rows, - "reference": ref, "dataset": "dataset", - "field_name": "reference", + "table_name": "resource", + "rows": rows, "row_id": str(rows[0]), "organisation": "organisation", } @@ -72,13 +71,13 @@ def validate_references(self): if not ref: # This will be True for both None and empty strings issues.append( { - "scope": "invalid_reference", + "scope": "value", "message": f"Reference is missing on row {row_number}.", - "row": row_number, - "reference": ref, "dataset": "dataset", + "table_name": "resource", "field_name": "reference", "row_id": str(row_number), + "value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index dc45a1c3..68cd0ae8 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,6 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": DuplicateReferenceIssue, - "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,29 +129,3 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class DuplicateReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - rows: list = field(metadata=config(field_name="rows")) - organisation: str - - def __post_init__(self): - issue_scope = "duplicate_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class InvalidReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - row_id: str = field(metadata=config(field_name="row_id")) - organisation: str - - def __post_init__(self): - issue_scope = "invalid_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") From e22412faff0fd5c89c700515497a848ed6652a60 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:35:25 +0100 Subject: [PATCH 36/58] Change to reference --- digital_land/expectations/checkpoints/converted_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 73b666cf..512a8dce 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -77,7 +77,7 @@ def validate_references(self): "table_name": "resource", "field_name": "reference", "row_id": 
str(row_number), - "value": ref, + "value": "reference", "organisation": "organisation", } ) From 4bc8119f45840dcbd4a357a0fe2d5f9708826017 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 14:39:35 +0100 Subject: [PATCH 37/58] Separate functions and correct tests --- .../checkpoints/converted_resource.py | 69 ++----------------- .../resource_validations.py | 56 +++++++++++++++ .../expectations/test_checkpoint.py | 23 +++---- 3 files changed, 71 insertions(+), 77 deletions(-) create mode 100644 digital_land/expectations/expectation_functions/resource_validations.py diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 512a8dce..f00f24fc 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,13 +1,9 @@ -# checkpoint needs to assemble class state -# it needs to validate inputs specific for that checkpoint -# it then needs to run expectations -# then it needs to be able to save those expectation resultts -# a checkpoint represents the moment in the process where we tell it the -# type of data it is validating and where the data is -# the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path -import csv from .base import BaseCheckpoint +from ..expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, +) class ConvertedResourceCheckpoint(BaseCheckpoint): @@ -18,68 +14,15 @@ def __init__(self, data_path): def load(self): self.expectations = [ { - "function": self.check_for_duplicate_references, + "function": check_for_duplicate_references(self.csv_path), "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", }, { - "function": self.validate_references, + "function": validate_references(self.csv_path), "name": "Validate References", "severity": "error", "responsibility": "system", }, ] - - def check_for_duplicate_references(self): - duplicates = {} - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - def validate_references(self): - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "reference", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues diff --git a/digital_land/expectations/expectation_functions/resource_validations.py 
b/digital_land/expectations/expectation_functions/resource_validations.py new file mode 100644 index 00000000..23150be1 --- /dev/null +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -0,0 +1,56 @@ +import csv + + +def check_for_duplicate_references(csv_path): + duplicates = {} + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return issues + + +def validate_references(csv_path): + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return issues diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 62a78568..37c2ac04 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -4,8 +4,9 @@ import pandas as pd from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, +from digital_land.expectations.expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, ) @@ -154,15 +155,12 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.check_for_duplicate_references() + issues = check_for_duplicate_references(csv_path) - assert success is True, "The function should successfully identify issues." + assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." assert ( - issues[0]["scope"] == "duplicate_reference" + issues[0]["scope"] == "row-group" ), "The issue should be identified as a duplicate reference." assert ( "REF-001" in issues[0]["message"] @@ -170,14 +168,11 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.validate_references() + issues = validate_references(csv_path) - assert success is False, "The function should fail due to invalid references." + assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." 
assert ( - issues[0]["scope"] == "invalid_reference" + issues[0]["scope"] == "value" ), "The issue should be identified as an invalid reference." assert "" in issues[0]["message"], " 4th value should be identified as invalid." From 4b0a43712f2551f1e5bb0bfe75914bc7c3f5e55f Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:02:58 +0100 Subject: [PATCH 38/58] Changes back to helpers --- .../expectations/checkpoints/converted_resource.py | 11 ++++++++--- .../expectation_functions/resource_validations.py | 7 ++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f00f24fc..d82726c7 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,9 +7,14 @@ class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, data_path): - super().__init__("converted_resource", data_path) - self.csv_path = Path(data_path) + def __init__(self, dataset_path, typology, dataset=None): + super().__init__("converted_resource", dataset_path) + self.csv_path = Path(dataset_path) + if dataset: + self.dataset = dataset + else: + self.dataset = self.csv_path.stem + self.typology = typology def load(self): self.expectations = [ diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 23150be1..2acbe669 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -4,7 +4,6 @@ def check_for_duplicate_references(csv_path): duplicates = {} issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): @@ -28,17 +27,15 @@ def check_for_duplicate_references(csv_path): } ) - return issues + return True, "Checked for duplicate references.", issues def validate_references(csv_path): issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not ref: # This will be True for both None and empty strings issues.append( { @@ -53,4 +50,4 @@ def validate_references(csv_path): } ) - return issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 568f456fd1f9a752a695a7a2164a9bad647d5396 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:07:19 +0100 Subject: [PATCH 39/58] Fix --- digital_land/phase/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 74da23eb..303609f9 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -198,7 +198,7 @@ def _read_text_file(self, input_path, encoding): return reader def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint = ConvertedResourceCheckpoint(dataset_path=path) checkpoint.load() result = checkpoint.run() From 2eb2134954cdcf48de796d98ad3d16d6194b5233 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:41:53 +0100 Subject: [PATCH 40/58] Core changes --- digital_land/commands.py | 11 +++ .../checkpoints/converted_resource.py | 71 ++++++++++++++----- digital_land/expectations/commands.py | 4 +- 
digital_land/phase/convert.py | 29 -------- digital_land/phase/post_conversion.py | 38 ++++++++++ .../expectations/test_checkpoint.py | 4 +- 6 files changed, 106 insertions(+), 51 deletions(-) create mode 100644 digital_land/phase/post_conversion.py diff --git a/digital_land/commands.py b/digital_land/commands.py index ad9d05b1..07d7c488 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -19,6 +19,7 @@ from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase from digital_land.phase.default import DefaultPhase from digital_land.phase.dump import DumpPhase from digital_land.phase.factor import FactorPhase @@ -162,6 +163,16 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), + PostConversionPhase( + converted_resource_path=os.path.join( + custom_temp_dir, f"{resource}_converted.csv" + ), + output_dir=os.path.join( + os.path.dirname(output_path), "post_conversion_outputs" + ), + dataset=dataset, + typology=specification.get_typology_for_dataset(dataset), + ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index d82726c7..59c1c307 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,33 +1,68 @@ from pathlib import Path from .base import BaseCheckpoint +from ..utils import QueryRunner +import os from ..expectation_functions.resource_validations import ( check_for_duplicate_references, validate_references, ) +# Define BASE expectations which should always run +BASE = [ + { + "function": check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, +] + +# Empty TYPOLOGY and DATASET for now as per advice +TYPOLOGY = {} +DATASET = {} + class ConvertedResourceCheckpoint(BaseCheckpoint): def __init__(self, dataset_path, typology, dataset=None): super().__init__("converted_resource", dataset_path) self.csv_path = Path(dataset_path) - if dataset: - self.dataset = dataset - else: - self.dataset = self.csv_path.stem + self.dataset = dataset if dataset else self.csv_path.stem self.typology = typology def load(self): - self.expectations = [ - { - "function": check_for_duplicate_references(self.csv_path), - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - }, - { - "function": validate_references(self.csv_path), - "name": "Validate References", - "severity": "error", - "responsibility": "system", - }, - ] + self.expectations = [] + self.expectations.extend(BASE) + typology_expectations = TYPOLOGY.get(self.typology, []) + dataset_expectations = DATASET.get(self.dataset, []) + + # Extend the expectations list with relevant typology and dataset-specific expectations + if typology_expectations: + self.expectations.extend(typology_expectations) + if dataset_expectations: + self.expectations.extend(dataset_expectations) + + # Assign a QueryRunner instance to each expectation + for expectation in 
self.expectations: + expectation["query_runner"] = QueryRunner(self.csv_path) + + def save(self, output_dir, format="csv"): + responses_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + ) + issues_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-issues.csv" + ) + + self.save_responses( + self.responses, + responses_file_path, + format=format, + ) + + self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index d16c6533..7b7f7922 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,5 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import CovertedResourceCheckpoint +from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -30,7 +30,7 @@ def run_converted_resource_checkpoint( """ Function to run the expectation checkpoint for a converted resource """ - checkpoint = CovertedResourceCheckpoint(converted_resource_path, dataset, typology) + checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) checkpoint.load() checkpoint.run() checkpoint.save(output_dir, format="csv") diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 303609f9..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,9 +11,6 @@ import pandas as pd from .load import Stream from .phase import Phase -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, -) def detect_file_encoding(path): @@ -190,38 +187,12 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() - self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader - def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(dataset_path=path) - checkpoint.load() - result = checkpoint.run() - - # Check if the result is not None and is iterable (unpackable) - if result is not None and isinstance(result, tuple) and len(result) == 2: - checkpoint_result, issues = result - else: - logging.error("Checkpoint did not return the expected result format.") - return - - if issues: - for issue in issues: - log_message = self.format_issue_message(issue) - - if issue["severity"] == "error": - logging.error(log_message) - elif issue["severity"] == "warning": - logging.warning(log_message) - else: - logging.info(log_message) - else: - logging.info(f"Checkpoint completed with result: {checkpoint_result}") - def format_issue_message(self, issue): return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py new file mode 100644 index 00000000..801aaed8 --- /dev/null +++ b/digital_land/phase/post_conversion.py @@ -0,0 +1,38 @@ +from expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + """ + Initializes the PostConversionPhase with necessary parameters. + :param converted_resource_path: Path to the converted CSV file. + :param output_dir: Directory to store output files. + :param dataset: Dataset related information for the checkpoint. 
+ :param typology: Typology information for the checkpoint. + :param act_on_critical_error: Whether to act on critical errors during the checkpoint. + """ + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 37c2ac04..13ab54c0 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -155,7 +155,7 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - issues = check_for_duplicate_references(csv_path) + _, _, issues = check_for_duplicate_references(csv_path) assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." @@ -168,7 +168,7 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - issues = validate_references(csv_path) + _, _, issues = validate_references(csv_path) assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." From 4338c8babe37cea958645376e525bdc5b76a2d44 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:49:24 +0100 Subject: [PATCH 41/58] Import change --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 801aaed8..e312644d 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,4 +1,4 @@ -from expectations.commands import run_converted_resource_checkpoint +from ..expectations.commands import run_converted_resource_checkpoint class PostConversionPhase: From 0851420efaa94106fc8212bba132869f3ead74b8 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:03:10 +0100 Subject: [PATCH 42/58] Parameter changes --- digital_land/commands.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07d7c488..ca9224cf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,6 +128,7 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) @@ -164,12 +165,8 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - converted_resource_path=os.path.join( - custom_temp_dir, f"{resource}_converted.csv" - ), - output_dir=os.path.join( - os.path.dirname(output_path), "post_conversion_outputs" - ), + converted_resource_path=converted_resource_path, + output_dir=output_path, dataset=dataset, typology=specification.get_typology_for_dataset(dataset), ), From 
0db28da558a817ba4feae69cb6d1b3b8a5f8f09d Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:39:52 +0100 Subject: [PATCH 43/58] Changes to convert --- digital_land/commands.py | 11 +++++++++++ digital_land/phase/convert.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index ca9224cf..1c719b63 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -158,6 +158,17 @@ def pipeline_run( if entry_date: default_values["entry-date"] = entry_date + convert_phase = ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ) + + # Execute the ConvertPhase to set the converted_resource_path + convert_phase.process() + converted_resource_path = convert_phase.converted_resource_path + run_pipeline( ConvertPhase( path=input_path, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index b57c22c1..9cd99f45 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,6 +125,9 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" + self.converted_resource_path = ( + None # This will hold the path to the converted file + ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: @@ -155,6 +158,8 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) + if self.output_path: + self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From e552ff51a9739ce767c716aa1df09e16f1bc0802 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:43:39 +0100 Subject: [PATCH 44/58] Fix --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1c719b63..8c4767dc 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,7 +128,6 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) From 965d1bce843e233825717d37d28a36287b75cd7a Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:49:43 +0100 Subject: [PATCH 45/58] Typology change --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8c4767dc..9062c36c 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -178,7 +178,7 @@ def pipeline_run( converted_resource_path=converted_resource_path, output_dir=output_path, dataset=dataset, - typology=specification.get_typology_for_dataset(dataset), + typology=specification.get_dataset_typology(dataset), ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From 13df751bfe3f90711035462f9861e72ae99cb6c3 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:55:08 +0100 Subject: [PATCH 46/58] Add Process --- digital_land/commands.py | 5 ----- digital_land/phase/post_conversion.py | 11 +++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py 
index 9062c36c..a0f3fc26 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -169,11 +169,6 @@ def pipeline_run( converted_resource_path = convert_phase.converted_resource_path run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), PostConversionPhase( converted_resource_path=converted_resource_path, output_dir=output_path, diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index e312644d..2216f8dd 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -10,20 +10,15 @@ def __init__( typology, act_on_critical_error=False, ): - """ - Initializes the PostConversionPhase with necessary parameters. - :param converted_resource_path: Path to the converted CSV file. - :param output_dir: Directory to store output files. - :param dataset: Dataset related information for the checkpoint. - :param typology: Typology information for the checkpoint. - :param act_on_critical_error: Whether to act on critical errors during the checkpoint. - """ self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error + def process(self): + return self.run() + def run(self): """ Executes the converted resource checkpoint using the provided parameters. From eb3b67ed7bce0b8eabc1b351dbe58a0015057397 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:00:24 +0100 Subject: [PATCH 47/58] Add process parameter --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 2216f8dd..00dcdd77 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -16,7 +16,7 @@ def __init__( self.typology = typology self.act_on_critical_error = act_on_critical_error - def process(self): + def process(self, stream=None): return self.run() def run(self): From d7fc4f7ef8740660f938423658b1aa0bd42d237e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:09:35 +0100 Subject: [PATCH 48/58] Query runner adjustments --- .../expectation_functions/resource_validations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 2acbe669..c6acae74 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,7 +1,7 @@ import csv -def check_for_duplicate_references(csv_path): +def check_for_duplicate_references(csv_path, **kwargs): duplicates = {} issues = [] with csv_path.open(newline="") as csvfile: @@ -30,7 +30,7 @@ def check_for_duplicate_references(csv_path): return True, "Checked for duplicate references.", issues -def validate_references(csv_path): +def validate_references(csv_path, **kwargs): issues = [] with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) From b5ebc71a2d6862c6007b86e1731e6a02b602efa2 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:15:47 +0100 Subject: [PATCH 49/58] Fix converted resource --- digital_land/expectations/checkpoints/converted_resource.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 59c1c307..14be3c21 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -14,12 +14,14 @@ "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, { "function": validate_references, "name": "Validate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, ] @@ -49,6 +51,7 @@ def load(self): # Assign a QueryRunner instance to each expectation for expectation in self.expectations: + expectation["csv_path"] = self.csv_path expectation["query_runner"] = QueryRunner(self.csv_path) def save(self, output_dir, format="csv"): From 7b60741395410f21956ce804562eecb6a8b09239 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 17 Apr 2024 16:33:12 +0100 Subject: [PATCH 50/58] Change pathing --- digital_land/commands.py | 1647 +++++++++++++++++++------------------- 1 file changed, 821 insertions(+), 826 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index a0f3fc26..07befebf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,826 +1,821 @@ -from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the 
sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - convert_phase = ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ) - - # Execute the ConvertPhase to set the converted_resource_path - convert_phase.process() - converted_resource_path = convert_phase.converted_resource_path - - 
run_pipeline( - PostConversionPhase( - converted_resource_path=converted_resource_path, - output_dir=output_path, - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if 
isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." 
- ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." 
- ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=output_path, + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + NormalisePhase(skip_patterns=skip_patterns, 
null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from {csv_path}") + sys.exit(-1) + + 
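For context, the run_pipeline call in pipeline_run above relies on a chaining contract that is not shown in this patch: run_pipeline is imported from digital_land.pipeline, and each phase exposes process(stream) and hands a stream on to the next phase (the PostConversionPhase fix in PATCH 51 later in this series makes that explicit). A minimal sketch of that contract, with illustrative names only and no claim to match the real implementation, is:

    # Sketch only: the real run_pipeline lives in digital_land.pipeline and is not
    # part of this patch. It assumes each phase yields rows for the next phase.
    class PassThroughPhase:
        def process(self, stream):
            for row in stream:
                yield row

    def run_pipeline_sketch(*phases, stream=()):
        # fold the stream through each phase in order
        for phase in phases:
            stream = phase.process(stream)
        # drain the final stream so side-effecting phases (e.g. SavePhase) run
        for _ in stream:
            pass

    run_pipeline_sketch(PassThroughPhase(), PassThroughPhase(), stream=[{"reference": "1"}])
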
flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
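As an aside on the entity_num_gen state seeded in assign_entities above ("current", "range_min" and "range_max" taken from the specification): the generator itself belongs to digital_land.pipeline.Lookups and is not shown in this patch. A hedged sketch of the allocation behaviour that state implies, using made-up numbers and an illustrative class name, is:

    # Assumption: new entity numbers are allocated sequentially within the
    # dataset's [range_min, range_max] window, starting after "current".
    # The real generator in digital_land.pipeline may differ.
    class EntityNumGenSketch:
        def __init__(self, range_min, range_max, current):
            self.state = {"range_min": range_min, "range_max": range_max, "current": current}

        def next(self):
            candidate = max(self.state["current"] + 1, self.state["range_min"])
            if candidate > self.state["range_max"]:
                raise ValueError("entity number range exhausted for this dataset")
            self.state["current"] = candidate
            return candidate

    gen = EntityNumGenSketch(range_min=1000, range_max=1999, current=1003)
    assert gen.next() == 1004
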
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") From 954735a364dda5260c23faaec664559a2b3750a3 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:35:40 +0100 Subject: [PATCH 51/58] Set field name of items in ValueIssue. Small fixes to PostConversionPhase. --- digital_land/commands.py | 2 +- digital_land/expectations/issue.py | 4 ++-- digital_land/phase/post_conversion.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07befebf..f9aa4fcf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -166,7 +166,7 @@ def pipeline_run( ), PostConversionPhase( converted_resource_path=input_path, - output_dir=output_path, + output_dir=os.path.dirname(output_path), dataset=dataset, typology=specification.get_dataset_typology(dataset), ), diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 68cd0ae8..f68b3e47 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -120,8 +120,8 @@ class ValueIssue(Issue): scope: str dataset: str table_name: str = field(metadata=config(field_name="table-name")) - field_name: str - row_id: str + field_name: str = field(metadata=config(field_name="field-name")) + row_id: str = field(metadata=config(field_name="row-id")) value: str organisation: str diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 00dcdd77..9fc1eec3 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -17,7 +17,8 @@ def __init__( self.act_on_critical_error = act_on_critical_error def process(self, stream=None): - return self.run() + self.run() + return stream def run(self): """ From 18b9b34768855caa2b8d0f310170288b6a3f2442 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:48:33 +0100 Subject: [PATCH 52/58] Converted file to unix format (so they diff easier with main) --- digital_land/commands.py | 1642 ++++++++--------- .../resource_validations.py | 106 +- digital_land/phase/post_conversion.py | 68 +- 3 files changed, 908 insertions(+), 908 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index f9aa4fcf..d7730d15 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,821 +1,821 @@ 
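The PostConversionPhase change in PATCH 51 above is worth restating before the long whole-file hunk that follows: process() now runs the post-conversion checks for their side effects and returns the incoming stream unchanged, so the later phases (NormalisePhase onwards) still receive rows. A simplified sketch of that pattern (the real class, as called in pipeline_run, takes converted_resource_path, output_dir, dataset and typology):

    class PostConversionPhaseSketch:
        """Simplified: run checks for their side effects, pass the stream through."""

        def __init__(self, checks=()):
            self.checks = checks

        def run(self):
            # run validation expectations against the converted resource
            for check in self.checks:
                check()

        def process(self, stream=None):
            self.run()        # side effects only (issue logging etc.)
            return stream     # unchanged, so the next phase keeps iterating rows

    rows = PostConversionPhaseSketch().process(iter([{"reference": "1"}]))
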
-from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - 
specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ), - PostConversionPhase( - converted_resource_path=input_path, - output_dir=os.path.dirname(output_path), - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - 
fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - 
with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." - ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if 
row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." - ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=os.path.dirname(output_path), + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + 
NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from 
{csv_path}") + sys.exit(-1) + + flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index c6acae74..6f6d0520 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,53 +1,53 @@ -import csv - - -def check_for_duplicate_references(csv_path, **kwargs): - duplicates = {} - issues = [] - with csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - -def validate_references(csv_path, **kwargs): - issues = [] - with csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "Missing", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues +import csv + + +def check_for_duplicate_references(csv_path, **kwargs): + duplicates = {} + issues = [] + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', 
'.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return True, "Checked for duplicate references.", issues + + +def validate_references(csv_path, **kwargs): + issues = [] + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return len(issues) == 0, "Checked for unpopulated references.", issues diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 9fc1eec3..f0a072e1 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,34 +1,34 @@ -from ..expectations.commands import run_converted_resource_checkpoint - - -class PostConversionPhase: - def __init__( - self, - converted_resource_path, - output_dir, - dataset, - typology, - act_on_critical_error=False, - ): - self.converted_resource_path = converted_resource_path - self.output_dir = output_dir - self.dataset = dataset - self.typology = typology - self.act_on_critical_error = act_on_critical_error - - def process(self, stream=None): - self.run() - return stream - - def run(self): - """ - Executes the converted resource checkpoint using the provided parameters. - """ - # Run the checkpoint on the converted resource - run_converted_resource_checkpoint( - self.converted_resource_path, - self.output_dir, - self.dataset, - self.typology, - self.act_on_critical_error, - ) +from ..expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def process(self, stream=None): + self.run() + return stream + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) From 2cfb7506e95f88e71850b77ecf092a4f09aa5e11 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:51:23 +0100 Subject: [PATCH 53/58] Renamed dataset checkpoint test names to make them a bit clearer. 
--- tests/integration/expectations/test_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 13ab54c0..3927c896 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -63,7 +63,7 @@ def csv_path(tmp_path): return csv_file -def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): +def test_dataset_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) test_old_entity_data = pd.DataFrame.from_dict({"old_entity": [100], "entity": [10]}) @@ -94,7 +94,7 @@ def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): assert len(issues) == 0 -def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): +def test_dataset_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict( { From f6c2ce09abd94ac08d573e4baeb7edd774c9dc0e Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Mon, 22 Apr 2024 13:56:24 +0100 Subject: [PATCH 54/58] WIP --- digital_land/commands.py | 6 ++---- digital_land/expectations/checkpoints/base.py | 18 ++++++++---------- .../checkpoints/converted_resource.py | 10 +++++++--- .../expectations/checkpoints/dataset.py | 4 ++-- digital_land/phase/convert.py | 3 --- digital_land/phase/post_conversion.py | 11 +++++------ 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index d7730d15..2c2a1059 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -160,13 +160,11 @@ def pipeline_run( run_pipeline( ConvertPhase( path=input_path, - dataset_resource_log=DatasetResourceLog(), + dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, - output_path=output_path, ), PostConversionPhase( - converted_resource_path=input_path, - output_dir=os.path.dirname(output_path), + output_dir=os.path.join("exxpectations", "post-conversion"), dataset=dataset, typology=specification.get_dataset_typology(dataset), ), diff --git a/digital_land/expectations/checkpoints/base.py b/digital_land/expectations/checkpoints/base.py index e0bbaabd..5553ffd9 100644 --- a/digital_land/expectations/checkpoints/base.py +++ b/digital_land/expectations/checkpoints/base.py @@ -16,7 +16,7 @@ def __init__(self, checkpoint, data_path): self.checkpoint = checkpoint self.data_path = data_path self.data_name = Path(data_path).stem - self.responses = [] + self.results = [] self.issues = [] # each issue is going to have different fields, so define here what all of them are # this will take some iterations to get right @@ -112,28 +112,26 @@ def run(self): self.failed_expectation_with_error_severity = 0 for expectation in self.expectations: - response = self.run_expectation(expectation) - self.responses.append(response) - self.issues.extend(response.issues) - self.failed_expectation_with_error_severity += response.act_on_failure() + result = self.run_expectation(expectation) + self.results.append(result) + self.issues.extend(result.issues) + self.failed_expectation_with_error_severity += result.act_on_failure() if self.failed_expectation_with_error_severity > 0: raise DataQualityException( "One or more expectations with severity RaiseError failed, see results for more details" ) - def save_responses(self, responses, file_path, 
format="csv"): + def save_results(self, results, file_path, format="csv"): os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "w") as f: if format == "csv": dictwriter = DictWriter(f, fieldnames=self.result_fieldnames) dictwriter.writeheader() - dictwriter.writerows( - [response.dict_for_export() for response in responses] - ) + dictwriter.writerows([result.dict_for_export() for result in results]) elif format == "json": - json.dump([response.to_dict() for response in responses], f) + json.dump([result.to_dict() for result in results], f) else: raise ValueError(f"format must be csv or json and cannot be {format}") diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 14be3c21..b214912e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -56,14 +56,18 @@ def load(self): def save(self, output_dir, format="csv"): responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + output_dir, self.checkpoint, f"{self.dataset}-results.csv" ) issues_file_path = os.path.join( output_dir, self.checkpoint, f"{self.dataset}-issues.csv" ) - self.save_responses( - self.responses, + import pdb + + pdb.set_trace() + + self.save_results( + self.results, responses_file_path, format=format, ) diff --git a/digital_land/expectations/checkpoints/dataset.py b/digital_land/expectations/checkpoints/dataset.py index 2ff0d9c3..e7504cc3 100644 --- a/digital_land/expectations/checkpoints/dataset.py +++ b/digital_land/expectations/checkpoints/dataset.py @@ -71,13 +71,13 @@ def load(self): def save(self, output_dir, format="csv"): responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.data_name}-responses.csv" + output_dir, self.checkpoint, f"{self.data_name}-results.csv" ) issues_file_path = os.path.join( output_dir, self.checkpoint, f"{self.data_name}-issues.csv" ) - self.save_responses( + self.save_results( self.responses, responses_file_path, format=format, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 9cd99f45..7a372125 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,9 +125,6 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" - self.converted_resource_path = ( - None # This will hold the path to the converted file - ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index f0a072e1..41dee0d1 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,32 +1,31 @@ +from digital_land.phase.phase import Phase from ..expectations.commands import run_converted_resource_checkpoint -class PostConversionPhase: +class PostConversionPhase(Phase): def __init__( self, - converted_resource_path, output_dir, dataset, typology, act_on_critical_error=False, ): - self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error def process(self, stream=None): - self.run() + self.run(stream.f.name) return stream - def run(self): + def run(self, converted_resource_path): """ Executes the converted resource checkpoint using 
the provided parameters. """ # Run the checkpoint on the converted resource run_converted_resource_checkpoint( - self.converted_resource_path, + converted_resource_path, self.output_dir, self.dataset, self.typology, From 979e6e4a225a5072747890084935c70d0f2ce671 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Mon, 22 Apr 2024 14:46:35 +0100 Subject: [PATCH 55/58] Post-merge fixes. --- digital_land/expectations/checkpoints/converted_resource.py | 4 ---- digital_land/expectations/checkpoints/dataset.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index b214912e..4ba6bcc9 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -62,10 +62,6 @@ def save(self, output_dir, format="csv"): output_dir, self.checkpoint, f"{self.dataset}-issues.csv" ) - import pdb - - pdb.set_trace() - self.save_results( self.results, responses_file_path, diff --git a/digital_land/expectations/checkpoints/dataset.py b/digital_land/expectations/checkpoints/dataset.py index e7504cc3..6c42a3cf 100644 --- a/digital_land/expectations/checkpoints/dataset.py +++ b/digital_land/expectations/checkpoints/dataset.py @@ -78,7 +78,7 @@ def save(self, output_dir, format="csv"): ) self.save_results( - self.responses, + self.results, responses_file_path, format=format, ) From 8284c8ad3940bc8cbfa7c2de82f1297d3c1d57e7 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 13:51:26 +0100 Subject: [PATCH 56/58] Updated PostConversionPhase to output to issues instead. --- digital_land/commands.py | 4 +- digital_land/phase/post_conversion.py | 65 +++++++++++++++++---------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 2c2a1059..5477a085 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -164,9 +164,7 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - output_dir=os.path.join("exxpectations", "post-conversion"), - dataset=dataset, - typology=specification.get_dataset_typology(dataset), + issues=issue_log, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 41dee0d1..aa85707b 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,33 +1,52 @@ from digital_land.phase.phase import Phase -from ..expectations.commands import run_converted_resource_checkpoint +import csv class PostConversionPhase(Phase): def __init__( self, - output_dir, - dataset, - typology, - act_on_critical_error=False, + issues, ): - self.output_dir = output_dir - self.dataset = dataset - self.typology = typology - self.act_on_critical_error = act_on_critical_error + self.issues = issues - def process(self, stream=None): - self.run(stream.f.name) + def process(self, stream): + self.validate_references(stream.f.name) + self.check_for_duplicate_references(stream.f.name) return stream - def run(self, converted_resource_path): - """ - Executes the converted resource checkpoint using the provided parameters. 
- """ - # Run the checkpoint on the converted resource - run_converted_resource_checkpoint( - converted_resource_path, - self.output_dir, - self.dataset, - self.typology, - self.act_on_critical_error, - ) + def check_for_duplicate_references(self, csv_path): + duplicates = {} + with open(csv_path, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ( + ref + ): # Don't check None or empty references, as these will be picked up by validate_references + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + self.issues.log_issue( + "reference", + "duplicate-reference", + ref, + f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + ) + + def validate_references(self, csv_path): + with open(csv_path, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not ref: # This will be True for both None and empty strings + self.issues.log_issue( + "reference", + "missing-reference", + ref, + f"Reference missing on row {row_number}", + row_number + 1, + ) From 2a19aaea55ae156f114abc32c868f0c08f4f1ac4 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 13:59:36 +0100 Subject: [PATCH 57/58] Removed converted resource expectation. --- digital_land/cli.py | 20 ------ .../checkpoints/converted_resource.py | 71 ------------------- digital_land/expectations/commands.py | 19 ----- digital_land/phase/convert.py | 2 - 4 files changed, 112 deletions(-) delete mode 100644 digital_land/expectations/checkpoints/converted_resource.py diff --git a/digital_land/cli.py b/digital_land/cli.py index 9a30f2df..9e47e9e5 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -238,26 +238,6 @@ def expectations_run_dataset_checkpoint(data_path, output_dir, specification_dir run_dataset_checkpoint(data_path, output_dir, dataset, typology) -@cli.command( - "expectations-converted-resource-checkpoint", - short_help="runs data quality expectations against a converted resource", -) -@click.option( - "--data-path", help="path to the converted resource to use", required=True -) -@click.option("--output-dir", help="path/name to sqlite3 dataset", required=True) -@click.option("--specification-dir", help="checkpoint to run", required=True) -@click.option("--dataset", help="checkpoint to run", required=True) -def expectations_run_converted_resource_checkpoint( - data_path, output_dir, specification_dir, dataset -): - from digital_land.expectations.commands import run_converted_resource_checkpoint - - spec = Specification(specification_dir) - typology = spec.get_dataset_typology(dataset) - run_converted_resource_checkpoint(data_path, output_dir, dataset, typology) - - # edit to add collection_name in @cli.command("add-endpoints-and-lookups") @click.argument("csv-path", nargs=1, type=click.Path()) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py deleted file mode 100644 index 4ba6bcc9..00000000 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path -from .base import BaseCheckpoint -from ..utils import QueryRunner -import os -from ..expectation_functions.resource_validations import ( - check_for_duplicate_references, - validate_references, -) - -# 
Define BASE expectations which should always run -BASE = [ - { - "function": check_for_duplicate_references, - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - "csv_path": None, - }, - { - "function": validate_references, - "name": "Validate References", - "severity": "error", - "responsibility": "system", - "csv_path": None, - }, -] - -# Empty TYPOLOGY and DATASET for now as per advice -TYPOLOGY = {} -DATASET = {} - - -class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, dataset_path, typology, dataset=None): - super().__init__("converted_resource", dataset_path) - self.csv_path = Path(dataset_path) - self.dataset = dataset if dataset else self.csv_path.stem - self.typology = typology - - def load(self): - self.expectations = [] - self.expectations.extend(BASE) - typology_expectations = TYPOLOGY.get(self.typology, []) - dataset_expectations = DATASET.get(self.dataset, []) - - # Extend the expectations list with relevant typology and dataset-specific expectations - if typology_expectations: - self.expectations.extend(typology_expectations) - if dataset_expectations: - self.expectations.extend(dataset_expectations) - - # Assign a QueryRunner instance to each expectation - for expectation in self.expectations: - expectation["csv_path"] = self.csv_path - expectation["query_runner"] = QueryRunner(self.csv_path) - - def save(self, output_dir, format="csv"): - responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-results.csv" - ) - issues_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-issues.csv" - ) - - self.save_results( - self.results, - responses_file_path, - format=format, - ) - - self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index 7b7f7922..d75cf729 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,4 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -18,21 +17,3 @@ def run_dataset_checkpoint( checkpoint.save(output_dir, format="csv") if act_on_critical_error: checkpoint.act_on_critical_error() - - -def run_converted_resource_checkpoint( - converted_resource_path, - output_dir, - dataset, - typology, - act_on_critical_error=False, -): - """ - Function to run the expectation checkpoint for a converted resource - """ - checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) - checkpoint.load() - checkpoint.run() - checkpoint.save(output_dir, format="csv") - if act_on_critical_error: - checkpoint.act_on_critical_error() diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 7a372125..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -155,8 +155,6 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) - if self.output_path: - self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From 77bbff5abb397330579140df2e2e853f4068f901 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 16:43:02 +0100 Subject: [PATCH 58/58] WIP: Run the ckecks on the pipeline data. 
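
The duplicate-reference and missing-reference checks now run against the blocks
flowing through the pipeline rather than re-reading the converted CSV, so the
phase only needs an issue log. A rough usage sketch follows (illustrative only,
not part of this change, and it assumes this patch is applied): StubIssues is a
hypothetical stand-in for the real IssueLog, whose interface is wider than the
two call shapes used by this phase.

    from digital_land.phase.post_conversion import PostConversionPhase

    class StubIssues:
        # Minimal stand-in for the pipeline's IssueLog: just records calls.
        def __init__(self):
            self.logged = []

        def log_issue(self, *args):
            self.logged.append(args)

    blocks = [
        {"row": {"reference": "REF-001"}, "line-number": 2},
        {"row": {"reference": "REF-002"}, "line-number": 3},
        {"row": {"reference": "REF-001"}, "line-number": 4},  # duplicate of line 2
    ]

    issues = StubIssues()
    phase = PostConversionPhase(issues=issues)
    out = list(phase.process(iter(blocks)))  # drain the generator so the duplicate summary runs

    assert len(out) == 3                      # blocks pass through unchanged
    assert issues.logged[0][1] == "duplicate-reference"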
--- digital_land/commands.py | 6 +-- digital_land/phase/post_conversion.py | 67 +++++++++++++-------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 5477a085..e0aaee14 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -163,9 +163,6 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), - PostConversionPhase( - issues=issue_log, - ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), @@ -198,6 +195,9 @@ def pipeline_run( ), OrganisationPhase(organisation=organisation, issues=issue_log), FieldPrunePhase(fields=specification.current_fieldnames(schema)), + PostConversionPhase( # Now badly named... + issues=issue_log, + ), EntityReferencePhase( dataset=dataset, prefix=specification.dataset_prefix(dataset), diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index aa85707b..5198d112 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,5 +1,4 @@ from digital_land.phase.phase import Phase -import csv class PostConversionPhase(Phase): @@ -8,45 +7,43 @@ def __init__( issues, ): self.issues = issues + self.duplicates = {} def process(self, stream): - self.validate_references(stream.f.name) - self.check_for_duplicate_references(stream.f.name) - return stream - - def check_for_duplicate_references(self, csv_path): - duplicates = {} - with open(csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ( - ref - ): # Don't check None or empty references, as these will be picked up by validate_references - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: + for block in stream: + row = block.get("row", None) + if not row: + return + + reference = row.get("reference", None) + line_number = block.get("line-number", None) + + if reference and line_number: + self.validate_references(reference, line_number) + self.check_for_duplicate_references(reference, line_number) + yield block + + for ref, lines in self.duplicates.items(): + if len(lines) > 1: self.issues.log_issue( "reference", "duplicate-reference", ref, - f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + f"Duplicate reference '{ref}' found on lines: {', '.join(map(str, lines))}", ) - def validate_references(self, csv_path): - with open(csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if not ref: # This will be True for both None and empty strings - self.issues.log_issue( - "reference", - "missing-reference", - ref, - f"Reference missing on row {row_number}", - row_number + 1, - ) + def validate_references(self, reference, line_number): + if not reference: # This will be True for both None and empty strings + self.issues.log_issue( + "reference", + "missing-reference", + "", + "", + line_number, + ) + + def check_for_duplicate_references(self, reference, line_number): + if reference in self.duplicates: + self.duplicates[reference].append(line_number) + else: + self.duplicates[reference] = [line_number]
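
The per-reference bookkeeping in check_for_duplicate_references above is the
same accumulate-then-filter pattern that a defaultdict gives for free; a small
self-contained sketch of the idea (the function name and sample data here are
illustrative only, not part of the patch):

    from collections import defaultdict

    def find_duplicate_lines(rows):
        # Collect the line numbers seen for each non-blank reference,
        # then keep only references that appear on more than one line.
        seen = defaultdict(list)
        for line_number, reference in rows:
            if reference:
                seen[reference].append(line_number)
        return {ref: lines for ref, lines in seen.items() if len(lines) > 1}

    print(find_duplicate_lines([(2, "REF-001"), (3, "REF-002"), (4, "REF-001")]))
    # {'REF-001': [2, 4]}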