From 977a8a8291eb371eab722103b09ac31ff51792ca Mon Sep 17 00:00:00 2001 From: James Bannister Date: Fri, 5 Apr 2024 15:32:02 +0100 Subject: [PATCH 01/58] Updated --- .../checkpoints/converted_resource.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index e735a41d..f2eebfec 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -5,9 +5,75 @@ # a checkpoint represents the moment in the process where we tell it the # type of data it is validating and where the data is # the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) +from pathlib import Path +import csv +import re from .base import BaseCheckpoint -class CovertedResourceCheckpoint(BaseCheckpoint): - def load(): - pass +class ConvertedResourceCheckpoint(BaseCheckpoint): + def __init__(self, data_path): + super().__init__("converted_resource", data_path) + self.csv_path = Path(data_path) + + def load(self): + self.expectations = [ + { + "function": self.check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": self.validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, + ] + + def check_for_duplicate_references(self): + duplicates = {} + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "duplicate_reference", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "rows": rows, + "reference": ref, + } + ) + + return True, "Checked for duplicate references.", issues + + def validate_references(self): + pattern = re.compile(r"^REF-\d+$") + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not pattern.match(ref): + issues.append( + { + "scope": "invalid_reference", + "message": f"Invalid reference '{ref}' on row {row_number}.", + "row": row_number, + "reference": ref, + } + ) + + return len(issues) == 0, "Checked for invalid references.", issues From dec636c5e42ccd0b77f70d8ff786886094f969e7 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 9 Apr 2024 14:41:41 +0100 Subject: [PATCH 02/58] Added unit tests and integrated into convert --- digital_land/phase/convert.py | 25 +++++++++ .../expectations/test_checkpoint.py | 53 ++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 091fa006..8e057e7e 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,6 +11,9 @@ import pandas as pd from .load import Stream from .phase import Phase +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) def detect_file_encoding(path): @@ -187,12 +190,34 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() + 
self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader + def run_checkpoint(self, path): + checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint.load() + checkpoint_result, issues = checkpoint.run() + + if issues: + for issue in issues: + log_message = self.format_issue_message(issue) + + if issue["severity"] == "error": + logging.error(log_message) + elif issue["severity"] == "warning": + logging.warning(log_message) + else: + logging.info(log_message) + else: + logging.info(f"Checkpoint completed with result: {checkpoint_result}") + + def format_issue_message(self, issue): + return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" + def _find_zip_file(self, input_file, suffix=".gml"): zip_ = zipfile.ZipFile(input_file) files = zip_.namelist() diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 6f5e4caa..e0efc11f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -2,8 +2,11 @@ import os import spatialite import pandas as pd -from csv import DictReader +from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) @pytest.fixture @@ -43,6 +46,22 @@ def sqlite3_with_entity_tables_path(tmp_path): return dataset_path +@pytest.fixture +def csv_path(tmp_path): + data = [ + {"reference": "REF-001", "name": "Test 1"}, + {"reference": "REF-002", "name": "Test 2"}, + {"reference": "REF-001", "name": "Test 3"}, # Duplicate + {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + ] + csv_file = tmp_path / "test_data.csv" + with csv_file.open(mode="w", newline="") as f: + writer = DictWriter(f, fieldnames=["reference", "name"]) + writer.writeheader() + writer.writerows(data) + return csv_file + + def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) @@ -126,3 +145,35 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): assert issues[0]["rows"] == "" assert issues[0]["row"] != "" # Just check it's there assert issues[0]["value"] == "" + + +def test_check_for_duplicate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.check_for_duplicate_references() + + assert success is True, "The function should successfully identify issues." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "duplicate_reference" + ), "The issue should be identified as a duplicate reference." + assert ( + "REF-001" in issues[0]["message"] + ), "REF-001 should be identified as a duplicate." + + +def test_validate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.validate_references() + + assert success is False, "The function should fail due to invalid references." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "invalid_reference" + ), "The issue should be identified as an invalid reference." 
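# A minimal standalone sketch (not a line of this patch): at this point in the
# series validate_references checks references against r"^REF-\d+$", which is
# why "INVALID-003" is expected to be reported as invalid below.
import re
assert re.match(r"^REF-\d+$", "REF-002") is not None
assert re.match(r"^REF-\d+$", "INVALID-003") is None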
+ assert ( + "INVALID-003" in issues[0]["message"] + ), "INVALID-003 should be identified as invalid." From 113dbefed0da8f9282b21fb9183352306ccb2d5f Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:19:05 +0100 Subject: [PATCH 03/58] Updated verification --- .../expectations/checkpoints/converted_resource.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f2eebfec..206eecb8 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,7 +7,6 @@ # the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path import csv -import re from .base import BaseCheckpoint @@ -59,21 +58,21 @@ def check_for_duplicate_references(self): return True, "Checked for duplicate references.", issues def validate_references(self): - pattern = re.compile(r"^REF-\d+$") issues = [] with self.csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not pattern.match(ref): + # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", - "message": f"Invalid reference '{ref}' on row {row_number}.", + "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, + "reference": ref, # This will be None or '' } ) - return len(issues) == 0, "Checked for invalid references.", issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 77b6cbc0920f67a02563cef468ceaac9d7a33753 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:29:16 +0100 Subject: [PATCH 04/58] Adjust issue factory --- digital_land/expectations/issue.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 80a750cd..aad74641 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,6 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, + "duplicate_reference": RowIssue, + "invalid_reference": ValueIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] From 0b1f12f55883ad920a772bb7251732a91ac0e63e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:05:42 +0100 Subject: [PATCH 05/58] Issue adjustments --- .../checkpoints/converted_resource.py | 14 ++++++-- digital_land/expectations/issue.py | 32 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 206eecb8..43671a56 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -52,6 +52,11 @@ def check_for_duplicate_references(self): "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", "rows": rows, "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(rows[0]), + "value": ref, + "organisation": "organisation", } ) @@ -64,14 +69,19 @@ def validate_references(self): reader = csv.DictReader(csvfile) for row_number, row in 
enumerate(reader, start=1): ref = row.get("reference") - # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, # This will be None or '' + "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(row_number), + "value": ref, + "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index aad74641..9ccad93b 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": RowIssue, - "invalid_reference": ValueIssue, + "duplicate_reference": DuplicateReferenceIssue, + "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,3 +131,31 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class DuplicateReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + duplicated_value: str = field(metadata=config(field_name="duplicated_value")) + rows: list = field(metadata=config(field_name="rows")) + organisation: str + + def __post_init__(self): + issue_scope = "duplicate_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class InvalidReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + invalid_value: str = field(metadata=config(field_name="invalid_value")) + row_id: str = field(metadata=config(field_name="row_id")) + organisation: str + + def __post_init__(self): + issue_scope = "invalid_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") From 9f683107cf6af628b252f135a876b8a4befe1927 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:09:51 +0100 Subject: [PATCH 06/58] Changed value --- digital_land/expectations/checkpoints/converted_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 43671a56..8a952d5e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,7 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +80,7 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) From b71b479d0ae921db123dbb2f67f976f31cda5704 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:15:26 +0100 Subject: [PATCH 07/58] Value changes --- digital_land/expectations/checkpoints/converted_resource.py | 2 -- digital_land/expectations/issue.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8a952d5e..8e7f1727 100644 --- 
a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,6 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +79,6 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "invalid_value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 9ccad93b..b93ca030 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -137,7 +137,6 @@ def __post_init__(self): class DuplicateReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - duplicated_value: str = field(metadata=config(field_name="duplicated_value")) rows: list = field(metadata=config(field_name="rows")) organisation: str @@ -151,7 +150,6 @@ def __post_init__(self): class InvalidReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - invalid_value: str = field(metadata=config(field_name="invalid_value")) row_id: str = field(metadata=config(field_name="row_id")) organisation: str From d8ef949369a06f6c694df3686750d9a403bcbe44 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:22:06 +0100 Subject: [PATCH 08/58] Adjust convert.py --- digital_land/phase/convert.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 8e057e7e..74da23eb 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -200,7 +200,14 @@ def _read_text_file(self, input_path, encoding): def run_checkpoint(self, path): checkpoint = ConvertedResourceCheckpoint(data_path=path) checkpoint.load() - checkpoint_result, issues = checkpoint.run() + result = checkpoint.run() + + # Check if the result is not None and is iterable (unpackable) + if result is not None and isinstance(result, tuple) and len(result) == 2: + checkpoint_result, issues = result + else: + logging.error("Checkpoint did not return the expected result format.") + return if issues: for issue in issues: From 9fde2aeb13b7683735e3310081b4b531f3713670 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:28:53 +0100 Subject: [PATCH 09/58] Test fixes --- tests/integration/expectations/test_checkpoint.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index e0efc11f..3ab31bdc 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -52,7 +52,7 @@ def csv_path(tmp_path): {"reference": "REF-001", "name": "Test 1"}, {"reference": "REF-002", "name": "Test 2"}, {"reference": "REF-001", "name": "Test 3"}, # Duplicate - {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + {"reference": "", "name": "Test 4"}, # Invalid format ] csv_file = tmp_path / "test_data.csv" with csv_file.open(mode="w", newline="") as f: @@ -174,6 +174,4 @@ def test_validate_references(csv_path): assert ( issues[0]["scope"] == "invalid_reference" ), "The issue should be identified as an invalid reference." - assert ( - "INVALID-003" in issues[0]["message"] - ), "INVALID-003 should be identified as invalid." 
+ assert "" in issues[0]["message"], " 4th value should be identified as invalid." From 45b0e11a0d14f5e14c88746298867e8705179f8c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:21:33 +0100 Subject: [PATCH 10/58] Chanegs to issues --- .../checkpoints/converted_resource.py | 13 ++++----- digital_land/expectations/issue.py | 28 ------------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8e7f1727..73b666cf 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -48,12 +48,11 @@ def check_for_duplicate_references(self): if len(rows) > 1: issues.append( { - "scope": "duplicate_reference", + "scope": "row-group", "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "rows": rows, - "reference": ref, "dataset": "dataset", - "field_name": "reference", + "table_name": "resource", + "rows": rows, "row_id": str(rows[0]), "organisation": "organisation", } @@ -72,13 +71,13 @@ def validate_references(self): if not ref: # This will be True for both None and empty strings issues.append( { - "scope": "invalid_reference", + "scope": "value", "message": f"Reference is missing on row {row_number}.", - "row": row_number, - "reference": ref, "dataset": "dataset", + "table_name": "resource", "field_name": "reference", "row_id": str(row_number), + "value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index b93ca030..80a750cd 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,6 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": DuplicateReferenceIssue, - "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,29 +129,3 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class DuplicateReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - rows: list = field(metadata=config(field_name="rows")) - organisation: str - - def __post_init__(self): - issue_scope = "duplicate_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class InvalidReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - row_id: str = field(metadata=config(field_name="row_id")) - organisation: str - - def __post_init__(self): - issue_scope = "invalid_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") From 24594c2b6cc01ce8ed762f837fa31ec61d129c99 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:35:25 +0100 Subject: [PATCH 11/58] Change to reference --- digital_land/expectations/checkpoints/converted_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 73b666cf..512a8dce 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -77,7 +77,7 @@ def 
validate_references(self): "table_name": "resource", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "value": "reference", "organisation": "organisation", } ) From adddaa4fc21a71afa2ac62bb0bae7b2a4cbba697 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 14:39:35 +0100 Subject: [PATCH 12/58] Separate functions and correct tests --- .../checkpoints/converted_resource.py | 69 ++----------------- .../resource_validations.py | 56 +++++++++++++++ .../expectations/test_checkpoint.py | 23 +++---- 3 files changed, 71 insertions(+), 77 deletions(-) create mode 100644 digital_land/expectations/expectation_functions/resource_validations.py diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 512a8dce..f00f24fc 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,13 +1,9 @@ -# checkpoint needs to assemble class state -# it needs to validate inputs specific for that checkpoint -# it then needs to run expectations -# then it needs to be able to save those expectation resultts -# a checkpoint represents the moment in the process where we tell it the -# type of data it is validating and where the data is -# the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path -import csv from .base import BaseCheckpoint +from ..expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, +) class ConvertedResourceCheckpoint(BaseCheckpoint): @@ -18,68 +14,15 @@ def __init__(self, data_path): def load(self): self.expectations = [ { - "function": self.check_for_duplicate_references, + "function": check_for_duplicate_references(self.csv_path), "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", }, { - "function": self.validate_references, + "function": validate_references(self.csv_path), "name": "Validate References", "severity": "error", "responsibility": "system", }, ] - - def check_for_duplicate_references(self): - duplicates = {} - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - def validate_references(self): - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "reference", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues diff --git 
a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py new file mode 100644 index 00000000..23150be1 --- /dev/null +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -0,0 +1,56 @@ +import csv + + +def check_for_duplicate_references(csv_path): + duplicates = {} + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return issues + + +def validate_references(csv_path): + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return issues diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 3ab31bdc..f7d8d7c8 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -4,8 +4,9 @@ import pandas as pd from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, +from digital_land.expectations.expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, ) @@ -148,15 +149,12 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.check_for_duplicate_references() + issues = check_for_duplicate_references(csv_path) - assert success is True, "The function should successfully identify issues." + assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." assert ( - issues[0]["scope"] == "duplicate_reference" + issues[0]["scope"] == "row-group" ), "The issue should be identified as a duplicate reference." assert ( "REF-001" in issues[0]["message"] @@ -164,14 +162,11 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.validate_references() + issues = validate_references(csv_path) - assert success is False, "The function should fail due to invalid references." + assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." 
assert ( - issues[0]["scope"] == "invalid_reference" + issues[0]["scope"] == "value" ), "The issue should be identified as an invalid reference." assert "" in issues[0]["message"], " 4th value should be identified as invalid." From dab0d77a346236438c03fefcc8eef76184ea8398 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:02:58 +0100 Subject: [PATCH 13/58] Changes back to helpers --- .../expectations/checkpoints/converted_resource.py | 11 ++++++++--- .../expectation_functions/resource_validations.py | 7 ++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f00f24fc..d82726c7 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,9 +7,14 @@ class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, data_path): - super().__init__("converted_resource", data_path) - self.csv_path = Path(data_path) + def __init__(self, dataset_path, typology, dataset=None): + super().__init__("converted_resource", dataset_path) + self.csv_path = Path(dataset_path) + if dataset: + self.dataset = dataset + else: + self.dataset = self.csv_path.stem + self.typology = typology def load(self): self.expectations = [ diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 23150be1..2acbe669 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -4,7 +4,6 @@ def check_for_duplicate_references(csv_path): duplicates = {} issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): @@ -28,17 +27,15 @@ def check_for_duplicate_references(csv_path): } ) - return issues + return True, "Checked for duplicate references.", issues def validate_references(csv_path): issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not ref: # This will be True for both None and empty strings issues.append( { @@ -53,4 +50,4 @@ def validate_references(csv_path): } ) - return issues + return len(issues) == 0, "Checked for unpopulated references.", issues From f162dcf87ace42eb8ccb3a6d4d62e0cedb138ec1 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:07:19 +0100 Subject: [PATCH 14/58] Fix --- digital_land/phase/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 74da23eb..303609f9 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -198,7 +198,7 @@ def _read_text_file(self, input_path, encoding): return reader def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint = ConvertedResourceCheckpoint(dataset_path=path) checkpoint.load() result = checkpoint.run() From c1f9081434daa457fcf9732c7dbbf4b123bc228e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:41:53 +0100 Subject: [PATCH 15/58] Core changes --- digital_land/commands.py | 11 +++ .../checkpoints/converted_resource.py | 71 ++++++++++++++----- digital_land/expectations/commands.py | 4 +- 
digital_land/phase/convert.py | 29 -------- digital_land/phase/post_conversion.py | 38 ++++++++++ .../expectations/test_checkpoint.py | 4 +- 6 files changed, 106 insertions(+), 51 deletions(-) create mode 100644 digital_land/phase/post_conversion.py diff --git a/digital_land/commands.py b/digital_land/commands.py index ad9d05b1..07d7c488 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -19,6 +19,7 @@ from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase from digital_land.phase.default import DefaultPhase from digital_land.phase.dump import DumpPhase from digital_land.phase.factor import FactorPhase @@ -162,6 +163,16 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), + PostConversionPhase( + converted_resource_path=os.path.join( + custom_temp_dir, f"{resource}_converted.csv" + ), + output_dir=os.path.join( + os.path.dirname(output_path), "post_conversion_outputs" + ), + dataset=dataset, + typology=specification.get_typology_for_dataset(dataset), + ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index d82726c7..59c1c307 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,33 +1,68 @@ from pathlib import Path from .base import BaseCheckpoint +from ..utils import QueryRunner +import os from ..expectation_functions.resource_validations import ( check_for_duplicate_references, validate_references, ) +# Define BASE expectations which should always run +BASE = [ + { + "function": check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, +] + +# Empty TYPOLOGY and DATASET for now as per advice +TYPOLOGY = {} +DATASET = {} + class ConvertedResourceCheckpoint(BaseCheckpoint): def __init__(self, dataset_path, typology, dataset=None): super().__init__("converted_resource", dataset_path) self.csv_path = Path(dataset_path) - if dataset: - self.dataset = dataset - else: - self.dataset = self.csv_path.stem + self.dataset = dataset if dataset else self.csv_path.stem self.typology = typology def load(self): - self.expectations = [ - { - "function": check_for_duplicate_references(self.csv_path), - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - }, - { - "function": validate_references(self.csv_path), - "name": "Validate References", - "severity": "error", - "responsibility": "system", - }, - ] + self.expectations = [] + self.expectations.extend(BASE) + typology_expectations = TYPOLOGY.get(self.typology, []) + dataset_expectations = DATASET.get(self.dataset, []) + + # Extend the expectations list with relevant typology and dataset-specific expectations + if typology_expectations: + self.expectations.extend(typology_expectations) + if dataset_expectations: + self.expectations.extend(dataset_expectations) + + # Assign a QueryRunner instance to each expectation + for expectation in 
self.expectations: + expectation["query_runner"] = QueryRunner(self.csv_path) + + def save(self, output_dir, format="csv"): + responses_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + ) + issues_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-issues.csv" + ) + + self.save_responses( + self.responses, + responses_file_path, + format=format, + ) + + self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index d16c6533..7b7f7922 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,5 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import CovertedResourceCheckpoint +from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -30,7 +30,7 @@ def run_converted_resource_checkpoint( """ Function to run the expectation checkpoint for a converted resource """ - checkpoint = CovertedResourceCheckpoint(converted_resource_path, dataset, typology) + checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) checkpoint.load() checkpoint.run() checkpoint.save(output_dir, format="csv") diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 303609f9..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,9 +11,6 @@ import pandas as pd from .load import Stream from .phase import Phase -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, -) def detect_file_encoding(path): @@ -190,38 +187,12 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() - self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader - def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(dataset_path=path) - checkpoint.load() - result = checkpoint.run() - - # Check if the result is not None and is iterable (unpackable) - if result is not None and isinstance(result, tuple) and len(result) == 2: - checkpoint_result, issues = result - else: - logging.error("Checkpoint did not return the expected result format.") - return - - if issues: - for issue in issues: - log_message = self.format_issue_message(issue) - - if issue["severity"] == "error": - logging.error(log_message) - elif issue["severity"] == "warning": - logging.warning(log_message) - else: - logging.info(log_message) - else: - logging.info(f"Checkpoint completed with result: {checkpoint_result}") - def format_issue_message(self, issue): return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py new file mode 100644 index 00000000..801aaed8 --- /dev/null +++ b/digital_land/phase/post_conversion.py @@ -0,0 +1,38 @@ +from expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + """ + Initializes the PostConversionPhase with necessary parameters. + :param converted_resource_path: Path to the converted CSV file. + :param output_dir: Directory to store output files. + :param dataset: Dataset related information for the checkpoint. 
+ :param typology: Typology information for the checkpoint. + :param act_on_critical_error: Whether to act on critical errors during the checkpoint. + """ + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index f7d8d7c8..9276d20f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -149,7 +149,7 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - issues = check_for_duplicate_references(csv_path) + _, _, issues = check_for_duplicate_references(csv_path) assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." @@ -162,7 +162,7 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - issues = validate_references(csv_path) + _, _, issues = validate_references(csv_path) assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." From c1c218c20049dd89244cc3694eb6262a6b4429c1 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:49:24 +0100 Subject: [PATCH 16/58] Import change --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 801aaed8..e312644d 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,4 +1,4 @@ -from expectations.commands import run_converted_resource_checkpoint +from ..expectations.commands import run_converted_resource_checkpoint class PostConversionPhase: From a046e7d592ec662230c761516369d26604a3caf9 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:03:10 +0100 Subject: [PATCH 17/58] Parameter changes --- digital_land/commands.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07d7c488..ca9224cf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,6 +128,7 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) @@ -164,12 +165,8 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - converted_resource_path=os.path.join( - custom_temp_dir, f"{resource}_converted.csv" - ), - output_dir=os.path.join( - os.path.dirname(output_path), "post_conversion_outputs" - ), + converted_resource_path=converted_resource_path, + output_dir=output_path, dataset=dataset, typology=specification.get_typology_for_dataset(dataset), ), From 
7769264b967cd000923c52c0d62eb9c3e096fcdf Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:39:52 +0100 Subject: [PATCH 18/58] Changes to convert --- digital_land/commands.py | 11 +++++++++++ digital_land/phase/convert.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index ca9224cf..1c719b63 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -158,6 +158,17 @@ def pipeline_run( if entry_date: default_values["entry-date"] = entry_date + convert_phase = ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ) + + # Execute the ConvertPhase to set the converted_resource_path + convert_phase.process() + converted_resource_path = convert_phase.converted_resource_path + run_pipeline( ConvertPhase( path=input_path, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index b57c22c1..9cd99f45 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,6 +125,9 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" + self.converted_resource_path = ( + None # This will hold the path to the converted file + ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: @@ -155,6 +158,8 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) + if self.output_path: + self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From cc64e30aaa929fbab3619f72316d852308eab61c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:43:39 +0100 Subject: [PATCH 19/58] Fix --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1c719b63..8c4767dc 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,7 +128,6 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) From e1311643be429533c639eee5ed7bc14babdd492c Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:49:43 +0100 Subject: [PATCH 20/58] Typology change --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8c4767dc..9062c36c 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -178,7 +178,7 @@ def pipeline_run( converted_resource_path=converted_resource_path, output_dir=output_path, dataset=dataset, - typology=specification.get_typology_for_dataset(dataset), + typology=specification.get_dataset_typology(dataset), ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From cc983ce72603357b812370aa4d8cef90f9f21dc7 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:55:08 +0100 Subject: [PATCH 21/58] Add Process --- digital_land/commands.py | 5 ----- digital_land/phase/post_conversion.py | 11 +++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py 
index 9062c36c..a0f3fc26 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -169,11 +169,6 @@ def pipeline_run( converted_resource_path = convert_phase.converted_resource_path run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), PostConversionPhase( converted_resource_path=converted_resource_path, output_dir=output_path, diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index e312644d..2216f8dd 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -10,20 +10,15 @@ def __init__( typology, act_on_critical_error=False, ): - """ - Initializes the PostConversionPhase with necessary parameters. - :param converted_resource_path: Path to the converted CSV file. - :param output_dir: Directory to store output files. - :param dataset: Dataset related information for the checkpoint. - :param typology: Typology information for the checkpoint. - :param act_on_critical_error: Whether to act on critical errors during the checkpoint. - """ self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error + def process(self): + return self.run() + def run(self): """ Executes the converted resource checkpoint using the provided parameters. From d26369f8ec93f141df6ebb0febc04552b74f9eed Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:00:24 +0100 Subject: [PATCH 22/58] Add process parameter --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 2216f8dd..00dcdd77 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -16,7 +16,7 @@ def __init__( self.typology = typology self.act_on_critical_error = act_on_critical_error - def process(self): + def process(self, stream=None): return self.run() def run(self): From fceb81ac1ebbba744ff341bd9ece901b748c41b6 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:09:35 +0100 Subject: [PATCH 23/58] Query runner adjustments --- .../expectation_functions/resource_validations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 2acbe669..c6acae74 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,7 +1,7 @@ import csv -def check_for_duplicate_references(csv_path): +def check_for_duplicate_references(csv_path, **kwargs): duplicates = {} issues = [] with csv_path.open(newline="") as csvfile: @@ -30,7 +30,7 @@ def check_for_duplicate_references(csv_path): return True, "Checked for duplicate references.", issues -def validate_references(csv_path): +def validate_references(csv_path, **kwargs): issues = [] with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) From 324b2c132aeb50b50f5a218f4ae077c5624bd81e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:15:47 +0100 Subject: [PATCH 24/58] Fix converted resource --- digital_land/expectations/checkpoints/converted_resource.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 59c1c307..14be3c21 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -14,12 +14,14 @@ "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, { "function": validate_references, "name": "Validate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, ] @@ -49,6 +51,7 @@ def load(self): # Assign a QueryRunner instance to each expectation for expectation in self.expectations: + expectation["csv_path"] = self.csv_path expectation["query_runner"] = QueryRunner(self.csv_path) def save(self, output_dir, format="csv"): From 1c5d64071e1c7c9f2d586be6c6a5b67763722bbe Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 17 Apr 2024 16:33:12 +0100 Subject: [PATCH 25/58] Change pathing --- digital_land/commands.py | 1647 +++++++++++++++++++------------------- 1 file changed, 821 insertions(+), 826 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index a0f3fc26..07befebf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,826 +1,821 @@ -from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the 
sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - convert_phase = ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ) - - # Execute the ConvertPhase to set the converted_resource_path - convert_phase.process() - converted_resource_path = convert_phase.converted_resource_path - - 
run_pipeline( - PostConversionPhase( - converted_resource_path=converted_resource_path, - output_dir=output_path, - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if 
isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." 
- ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." 
- ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=output_path, + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + NormalisePhase(skip_patterns=skip_patterns, 
null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from {csv_path}") + sys.exit(-1) + + 
flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") From 92a4ae4ed9250c2c1df9c524eaf54ebf7f171016 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Fri, 5 Apr 2024 15:32:02 +0100 Subject: [PATCH 26/58] Updated --- .../checkpoints/converted_resource.py | 72 ++++++++++++++++++- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index e735a41d..f2eebfec 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -5,9 +5,75 @@ # a checkpoint represents the moment in the process where we tell it the # type of data it is validating and where the data is # the primary different between checkpoints is how it loads expectations (i.e. 
where that are loaded from) +from pathlib import Path +import csv +import re from .base import BaseCheckpoint -class CovertedResourceCheckpoint(BaseCheckpoint): - def load(): - pass +class ConvertedResourceCheckpoint(BaseCheckpoint): + def __init__(self, data_path): + super().__init__("converted_resource", data_path) + self.csv_path = Path(data_path) + + def load(self): + self.expectations = [ + { + "function": self.check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": self.validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, + ] + + def check_for_duplicate_references(self): + duplicates = {} + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "duplicate_reference", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "rows": rows, + "reference": ref, + } + ) + + return True, "Checked for duplicate references.", issues + + def validate_references(self): + pattern = re.compile(r"^REF-\d+$") + issues = [] + + with self.csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not pattern.match(ref): + issues.append( + { + "scope": "invalid_reference", + "message": f"Invalid reference '{ref}' on row {row_number}.", + "row": row_number, + "reference": ref, + } + ) + + return len(issues) == 0, "Checked for invalid references.", issues From f1e0d7aa55f963092e71b9dbcf9e4970651bf184 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 9 Apr 2024 14:41:41 +0100 Subject: [PATCH 27/58] Added unit tests and integrated into convert --- digital_land/phase/convert.py | 25 +++++++++ .../expectations/test_checkpoint.py | 53 ++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 091fa006..8e057e7e 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,6 +11,9 @@ import pandas as pd from .load import Stream from .phase import Phase +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) def detect_file_encoding(path): @@ -187,12 +190,34 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() + self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader + def run_checkpoint(self, path): + checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint.load() + checkpoint_result, issues = checkpoint.run() + + if issues: + for issue in issues: + log_message = self.format_issue_message(issue) + + if issue["severity"] == "error": + logging.error(log_message) + elif issue["severity"] == "warning": + logging.warning(log_message) + else: + logging.info(log_message) + else: + logging.info(f"Checkpoint completed with result: {checkpoint_result}") + + def format_issue_message(self, issue): + return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" + def _find_zip_file(self, input_file, 
suffix=".gml"): zip_ = zipfile.ZipFile(input_file) files = zip_.namelist() diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 35984f6d..c5c2443f 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -2,8 +2,11 @@ import os import spatialite import pandas as pd -from csv import DictReader +from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint +from digital_land.expectations.checkpoints.converted_resource import ( + ConvertedResourceCheckpoint, +) @pytest.fixture @@ -43,6 +46,22 @@ def sqlite3_with_entity_tables_path(tmp_path): return dataset_path +@pytest.fixture +def csv_path(tmp_path): + data = [ + {"reference": "REF-001", "name": "Test 1"}, + {"reference": "REF-002", "name": "Test 2"}, + {"reference": "REF-001", "name": "Test 3"}, # Duplicate + {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + ] + csv_file = tmp_path / "test_data.csv" + with csv_file.open(mode="w", newline="") as f: + writer = DictWriter(f, fieldnames=["reference", "name"]) + writer.writeheader() + writer.writerows(data) + return csv_file + + def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) @@ -132,3 +151,35 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): assert issues[0]["rows"] == "" assert issues[0]["row"] != "" # Just check it's there assert issues[0]["value"] == "" + + +def test_check_for_duplicate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.check_for_duplicate_references() + + assert success is True, "The function should successfully identify issues." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "duplicate_reference" + ), "The issue should be identified as a duplicate reference." + assert ( + "REF-001" in issues[0]["message"] + ), "REF-001 should be identified as a duplicate." + + +def test_validate_references(csv_path): + checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) + checkpoint.load() + + success, message, issues = checkpoint.validate_references() + + assert success is False, "The function should fail due to invalid references." + assert len(issues) == 1, "There should be one issue identified." + assert ( + issues[0]["scope"] == "invalid_reference" + ), "The issue should be identified as an invalid reference." + assert ( + "INVALID-003" in issues[0]["message"] + ), "INVALID-003 should be identified as invalid." From d4c98c06534a6ec5c2783bf4e23f1527bf2ec6fb Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:19:05 +0100 Subject: [PATCH 28/58] Updated verification --- .../expectations/checkpoints/converted_resource.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f2eebfec..206eecb8 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,7 +7,6 @@ # the primary different between checkpoints is how it loads expectations (i.e. 
where that are loaded from) from pathlib import Path import csv -import re from .base import BaseCheckpoint @@ -59,21 +58,21 @@ def check_for_duplicate_references(self): return True, "Checked for duplicate references.", issues def validate_references(self): - pattern = re.compile(r"^REF-\d+$") issues = [] with self.csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not pattern.match(ref): + # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", - "message": f"Invalid reference '{ref}' on row {row_number}.", + "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, + "reference": ref, # This will be None or '' } ) - return len(issues) == 0, "Checked for invalid references.", issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 2666a59fe21743a7151761c280f1afc04e67cc34 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 14:29:16 +0100 Subject: [PATCH 29/58] Adjust issue factory --- digital_land/expectations/issue.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 68cd0ae8..1d6d5a1b 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,6 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, + "duplicate_reference": RowIssue, + "invalid_reference": ValueIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] From 518196adb41389371ad700a8367c74b39238b955 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:05:42 +0100 Subject: [PATCH 30/58] Issue adjustments --- .../checkpoints/converted_resource.py | 14 ++++++-- digital_land/expectations/issue.py | 32 +++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 206eecb8..43671a56 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -52,6 +52,11 @@ def check_for_duplicate_references(self): "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", "rows": rows, "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(rows[0]), + "value": ref, + "organisation": "organisation", } ) @@ -64,14 +69,19 @@ def validate_references(self): reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - # Check if reference is not populated (None or empty string) + if not ref: # This will be True for both None and empty strings issues.append( { "scope": "invalid_reference", "message": f"Reference is missing on row {row_number}.", "row": row_number, - "reference": ref, # This will be None or '' + "reference": ref, + "dataset": "dataset", + "field_name": "reference", + "row_id": str(row_number), + "value": ref, + "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 1d6d5a1b..75718d32 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,8 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": 
RowIssue, "value": ValueIssue, - "duplicate_reference": RowIssue, - "invalid_reference": ValueIssue, + "duplicate_reference": DuplicateReferenceIssue, + "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,3 +131,31 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class DuplicateReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + duplicated_value: str = field(metadata=config(field_name="duplicated_value")) + rows: list = field(metadata=config(field_name="rows")) + organisation: str + + def __post_init__(self): + issue_scope = "duplicate_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") + + +@dataclass +class InvalidReferenceIssue(Issue): + dataset: str + field_name: str = field(metadata=config(field_name="field_name")) + invalid_value: str = field(metadata=config(field_name="invalid_value")) + row_id: str = field(metadata=config(field_name="row_id")) + organisation: str + + def __post_init__(self): + issue_scope = "invalid_reference" + if self.scope != issue_scope: + raise ValueError(f"scope must be '{issue_scope}'.") From febdbace578f93372f3b78c82386dbacd2af504e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:09:51 +0100 Subject: [PATCH 31/58] Changed value --- digital_land/expectations/checkpoints/converted_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 43671a56..8a952d5e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,7 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +80,7 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "value": ref, + "invalid_value": ref, "organisation": "organisation", } ) From 4efc16dd2732ed141e4fff272f26e866bc1d605b Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:15:26 +0100 Subject: [PATCH 32/58] Value changes --- digital_land/expectations/checkpoints/converted_resource.py | 2 -- digital_land/expectations/issue.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8a952d5e..8e7f1727 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -55,7 +55,6 @@ def check_for_duplicate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(rows[0]), - "invalid_value": ref, "organisation": "organisation", } ) @@ -80,7 +79,6 @@ def validate_references(self): "dataset": "dataset", "field_name": "reference", "row_id": str(row_number), - "invalid_value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 75718d32..dc45a1c3 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -137,7 +137,6 @@ def __post_init__(self): class DuplicateReferenceIssue(Issue): 
dataset: str field_name: str = field(metadata=config(field_name="field_name")) - duplicated_value: str = field(metadata=config(field_name="duplicated_value")) rows: list = field(metadata=config(field_name="rows")) organisation: str @@ -151,7 +150,6 @@ def __post_init__(self): class InvalidReferenceIssue(Issue): dataset: str field_name: str = field(metadata=config(field_name="field_name")) - invalid_value: str = field(metadata=config(field_name="invalid_value")) row_id: str = field(metadata=config(field_name="row_id")) organisation: str From 91e5c189a72a37aa2e8fce9f60e9b6319d38a820 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:22:06 +0100 Subject: [PATCH 33/58] Adjust convert.py --- digital_land/phase/convert.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 8e057e7e..74da23eb 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -200,7 +200,14 @@ def _read_text_file(self, input_path, encoding): def run_checkpoint(self, path): checkpoint = ConvertedResourceCheckpoint(data_path=path) checkpoint.load() - checkpoint_result, issues = checkpoint.run() + result = checkpoint.run() + + # Check if the result is not None and is iterable (unpackable) + if result is not None and isinstance(result, tuple) and len(result) == 2: + checkpoint_result, issues = result + else: + logging.error("Checkpoint did not return the expected result format.") + return if issues: for issue in issues: From 238607f148b03a31a62e9d4c526480ef26022100 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 10 Apr 2024 15:28:53 +0100 Subject: [PATCH 34/58] Test fixes --- tests/integration/expectations/test_checkpoint.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index c5c2443f..62a78568 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -52,7 +52,7 @@ def csv_path(tmp_path): {"reference": "REF-001", "name": "Test 1"}, {"reference": "REF-002", "name": "Test 2"}, {"reference": "REF-001", "name": "Test 3"}, # Duplicate - {"reference": "INVALID-003", "name": "Test 4"}, # Invalid format + {"reference": "", "name": "Test 4"}, # Invalid format ] csv_file = tmp_path / "test_data.csv" with csv_file.open(mode="w", newline="") as f: @@ -180,6 +180,4 @@ def test_validate_references(csv_path): assert ( issues[0]["scope"] == "invalid_reference" ), "The issue should be identified as an invalid reference." - assert ( - "INVALID-003" in issues[0]["message"] - ), "INVALID-003 should be identified as invalid." + assert "" in issues[0]["message"], " 4th value should be identified as invalid." 
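A minimal usage sketch of the converted-resource checkpoint wired up in the patches above, assuming BaseCheckpoint.run() returns a (result, issues) pair as ConvertPhase.run_checkpoint expects (that base class is not shown in this series); the CSV path is illustrative:

import logging

from digital_land.expectations.checkpoints.converted_resource import (
    ConvertedResourceCheckpoint,
)

# Point the checkpoint at a converted CSV (illustrative path) and register the
# duplicate-reference and reference-validation expectations.
checkpoint = ConvertedResourceCheckpoint(data_path="var/converted/example.csv")
checkpoint.load()

# run() is assumed to return (checkpoint_result, issues); the guard mirrors the
# defensive unpacking added to ConvertPhase.run_checkpoint in convert.py above.
result = checkpoint.run()
if result is not None and isinstance(result, tuple) and len(result) == 2:
    checkpoint_result, issues = result
    for issue in issues:
        # each issue dict produced by the expectation functions carries at
        # least "scope" and "message"
        logging.warning("Checkpoint issue: %s", issue.get("message"))
else:
    logging.error("Checkpoint did not return the expected (result, issues) pair.")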
From d3ecda39cc59d7318c5cf227c2dac91215e98f80 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:21:33 +0100 Subject: [PATCH 35/58] Chanegs to issues --- .../checkpoints/converted_resource.py | 13 ++++----- digital_land/expectations/issue.py | 28 ------------------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 8e7f1727..73b666cf 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -48,12 +48,11 @@ def check_for_duplicate_references(self): if len(rows) > 1: issues.append( { - "scope": "duplicate_reference", + "scope": "row-group", "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "rows": rows, - "reference": ref, "dataset": "dataset", - "field_name": "reference", + "table_name": "resource", + "rows": rows, "row_id": str(rows[0]), "organisation": "organisation", } @@ -72,13 +71,13 @@ def validate_references(self): if not ref: # This will be True for both None and empty strings issues.append( { - "scope": "invalid_reference", + "scope": "value", "message": f"Reference is missing on row {row_number}.", - "row": row_number, - "reference": ref, "dataset": "dataset", + "table_name": "resource", "field_name": "reference", "row_id": str(row_number), + "value": ref, "organisation": "organisation", } ) diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index dc45a1c3..68cd0ae8 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -19,8 +19,6 @@ def issue_factory(scope): "row-group": RowGroupIssue, "row": RowIssue, "value": ValueIssue, - "duplicate_reference": DuplicateReferenceIssue, - "invalid_reference": InvalidReferenceIssue, } if scope in SCOPE_MAP: return SCOPE_MAP[scope] @@ -131,29 +129,3 @@ def __post_init__(self): issue_scope = "value" if self.scope != issue_scope: raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class DuplicateReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - rows: list = field(metadata=config(field_name="rows")) - organisation: str - - def __post_init__(self): - issue_scope = "duplicate_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") - - -@dataclass -class InvalidReferenceIssue(Issue): - dataset: str - field_name: str = field(metadata=config(field_name="field_name")) - row_id: str = field(metadata=config(field_name="row_id")) - organisation: str - - def __post_init__(self): - issue_scope = "invalid_reference" - if self.scope != issue_scope: - raise ValueError(f"scope must be '{issue_scope}'.") From e22412faff0fd5c89c700515497a848ed6652a60 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 10:35:25 +0100 Subject: [PATCH 36/58] Change to reference --- digital_land/expectations/checkpoints/converted_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 73b666cf..512a8dce 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -77,7 +77,7 @@ def validate_references(self): "table_name": "resource", "field_name": "reference", "row_id": 
str(row_number), - "value": ref, + "value": "reference", "organisation": "organisation", } ) From 4bc8119f45840dcbd4a357a0fe2d5f9708826017 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 14:39:35 +0100 Subject: [PATCH 37/58] Separate functions and correct tests --- .../checkpoints/converted_resource.py | 69 ++----------------- .../resource_validations.py | 56 +++++++++++++++ .../expectations/test_checkpoint.py | 23 +++---- 3 files changed, 71 insertions(+), 77 deletions(-) create mode 100644 digital_land/expectations/expectation_functions/resource_validations.py diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 512a8dce..f00f24fc 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,13 +1,9 @@ -# checkpoint needs to assemble class state -# it needs to validate inputs specific for that checkpoint -# it then needs to run expectations -# then it needs to be able to save those expectation resultts -# a checkpoint represents the moment in the process where we tell it the -# type of data it is validating and where the data is -# the primary different between checkpoints is how it loads expectations (i.e. where that are loaded from) from pathlib import Path -import csv from .base import BaseCheckpoint +from ..expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, +) class ConvertedResourceCheckpoint(BaseCheckpoint): @@ -18,68 +14,15 @@ def __init__(self, data_path): def load(self): self.expectations = [ { - "function": self.check_for_duplicate_references, + "function": check_for_duplicate_references(self.csv_path), "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", }, { - "function": self.validate_references, + "function": validate_references(self.csv_path), "name": "Validate References", "severity": "error", "responsibility": "system", }, ] - - def check_for_duplicate_references(self): - duplicates = {} - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - def validate_references(self): - issues = [] - - with self.csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "reference", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues diff --git a/digital_land/expectations/expectation_functions/resource_validations.py 
b/digital_land/expectations/expectation_functions/resource_validations.py new file mode 100644 index 00000000..23150be1 --- /dev/null +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -0,0 +1,56 @@ +import csv + + +def check_for_duplicate_references(csv_path): + duplicates = {} + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return issues + + +def validate_references(csv_path): + issues = [] + + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return issues diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 62a78568..37c2ac04 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -4,8 +4,9 @@ import pandas as pd from csv import DictReader, DictWriter from digital_land.expectations.checkpoints.dataset import DatasetCheckpoint -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, +from digital_land.expectations.expectation_functions.resource_validations import ( + check_for_duplicate_references, + validate_references, ) @@ -154,15 +155,12 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.check_for_duplicate_references() + issues = check_for_duplicate_references(csv_path) - assert success is True, "The function should successfully identify issues." + assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." assert ( - issues[0]["scope"] == "duplicate_reference" + issues[0]["scope"] == "row-group" ), "The issue should be identified as a duplicate reference." assert ( "REF-001" in issues[0]["message"] @@ -170,14 +168,11 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - checkpoint = ConvertedResourceCheckpoint(data_path=csv_path) - checkpoint.load() - - success, message, issues = checkpoint.validate_references() + issues = validate_references(csv_path) - assert success is False, "The function should fail due to invalid references." + assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." 
assert ( - issues[0]["scope"] == "invalid_reference" + issues[0]["scope"] == "value" ), "The issue should be identified as an invalid reference." assert "" in issues[0]["message"], " 4th value should be identified as invalid." From 4b0a43712f2551f1e5bb0bfe75914bc7c3f5e55f Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:02:58 +0100 Subject: [PATCH 38/58] Changes back to helpers --- .../expectations/checkpoints/converted_resource.py | 11 ++++++++--- .../expectation_functions/resource_validations.py | 7 ++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index f00f24fc..d82726c7 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -7,9 +7,14 @@ class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, data_path): - super().__init__("converted_resource", data_path) - self.csv_path = Path(data_path) + def __init__(self, dataset_path, typology, dataset=None): + super().__init__("converted_resource", dataset_path) + self.csv_path = Path(dataset_path) + if dataset: + self.dataset = dataset + else: + self.dataset = self.csv_path.stem + self.typology = typology def load(self): self.expectations = [ diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 23150be1..2acbe669 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -4,7 +4,6 @@ def check_for_duplicate_references(csv_path): duplicates = {} issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): @@ -28,17 +27,15 @@ def check_for_duplicate_references(csv_path): } ) - return issues + return True, "Checked for duplicate references.", issues def validate_references(csv_path): issues = [] - with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) for row_number, row in enumerate(reader, start=1): ref = row.get("reference") - if not ref: # This will be True for both None and empty strings issues.append( { @@ -53,4 +50,4 @@ def validate_references(csv_path): } ) - return issues + return len(issues) == 0, "Checked for unpopulated references.", issues From 568f456fd1f9a752a695a7a2164a9bad647d5396 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Mon, 15 Apr 2024 15:07:19 +0100 Subject: [PATCH 39/58] Fix --- digital_land/phase/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 74da23eb..303609f9 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -198,7 +198,7 @@ def _read_text_file(self, input_path, encoding): return reader def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(data_path=path) + checkpoint = ConvertedResourceCheckpoint(dataset_path=path) checkpoint.load() result = checkpoint.run() From 2eb2134954cdcf48de796d98ad3d16d6194b5233 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:41:53 +0100 Subject: [PATCH 40/58] Core changes --- digital_land/commands.py | 11 +++ .../checkpoints/converted_resource.py | 71 ++++++++++++++----- digital_land/expectations/commands.py | 4 +- 
digital_land/phase/convert.py | 29 -------- digital_land/phase/post_conversion.py | 38 ++++++++++ .../expectations/test_checkpoint.py | 4 +- 6 files changed, 106 insertions(+), 51 deletions(-) create mode 100644 digital_land/phase/post_conversion.py diff --git a/digital_land/commands.py b/digital_land/commands.py index ad9d05b1..07d7c488 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -19,6 +19,7 @@ from digital_land.phase.combine import FactCombinePhase from digital_land.phase.concat import ConcatFieldPhase from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase from digital_land.phase.default import DefaultPhase from digital_land.phase.dump import DumpPhase from digital_land.phase.factor import FactorPhase @@ -162,6 +163,16 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), + PostConversionPhase( + converted_resource_path=os.path.join( + custom_temp_dir, f"{resource}_converted.csv" + ), + output_dir=os.path.join( + os.path.dirname(output_path), "post_conversion_outputs" + ), + dataset=dataset, + typology=specification.get_typology_for_dataset(dataset), + ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index d82726c7..59c1c307 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -1,33 +1,68 @@ from pathlib import Path from .base import BaseCheckpoint +from ..utils import QueryRunner +import os from ..expectation_functions.resource_validations import ( check_for_duplicate_references, validate_references, ) +# Define BASE expectations which should always run +BASE = [ + { + "function": check_for_duplicate_references, + "name": "Check for Duplicate References", + "severity": "error", + "responsibility": "system", + }, + { + "function": validate_references, + "name": "Validate References", + "severity": "error", + "responsibility": "system", + }, +] + +# Empty TYPOLOGY and DATASET for now as per advice +TYPOLOGY = {} +DATASET = {} + class ConvertedResourceCheckpoint(BaseCheckpoint): def __init__(self, dataset_path, typology, dataset=None): super().__init__("converted_resource", dataset_path) self.csv_path = Path(dataset_path) - if dataset: - self.dataset = dataset - else: - self.dataset = self.csv_path.stem + self.dataset = dataset if dataset else self.csv_path.stem self.typology = typology def load(self): - self.expectations = [ - { - "function": check_for_duplicate_references(self.csv_path), - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - }, - { - "function": validate_references(self.csv_path), - "name": "Validate References", - "severity": "error", - "responsibility": "system", - }, - ] + self.expectations = [] + self.expectations.extend(BASE) + typology_expectations = TYPOLOGY.get(self.typology, []) + dataset_expectations = DATASET.get(self.dataset, []) + + # Extend the expectations list with relevant typology and dataset-specific expectations + if typology_expectations: + self.expectations.extend(typology_expectations) + if dataset_expectations: + self.expectations.extend(dataset_expectations) + + # Assign a QueryRunner instance to each expectation + for expectation in 
self.expectations: + expectation["query_runner"] = QueryRunner(self.csv_path) + + def save(self, output_dir, format="csv"): + responses_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + ) + issues_file_path = os.path.join( + output_dir, self.checkpoint, f"{self.dataset}-issues.csv" + ) + + self.save_responses( + self.responses, + responses_file_path, + format=format, + ) + + self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index d16c6533..7b7f7922 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,5 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import CovertedResourceCheckpoint +from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -30,7 +30,7 @@ def run_converted_resource_checkpoint( """ Function to run the expectation checkpoint for a converted resource """ - checkpoint = CovertedResourceCheckpoint(converted_resource_path, dataset, typology) + checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) checkpoint.load() checkpoint.run() checkpoint.save(output_dir, format="csv") diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 303609f9..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -11,9 +11,6 @@ import pandas as pd from .load import Stream from .phase import Phase -from digital_land.expectations.checkpoints.converted_resource import ( - ConvertedResourceCheckpoint, -) def detect_file_encoding(path): @@ -190,38 +187,12 @@ def _read_text_file(self, input_path, encoding): if converted_csv_file: f.close() - self.run_checkpoint(converted_csv_file) reader = read_csv(converted_csv_file) else: reader = f return reader - def run_checkpoint(self, path): - checkpoint = ConvertedResourceCheckpoint(dataset_path=path) - checkpoint.load() - result = checkpoint.run() - - # Check if the result is not None and is iterable (unpackable) - if result is not None and isinstance(result, tuple) and len(result) == 2: - checkpoint_result, issues = result - else: - logging.error("Checkpoint did not return the expected result format.") - return - - if issues: - for issue in issues: - log_message = self.format_issue_message(issue) - - if issue["severity"] == "error": - logging.error(log_message) - elif issue["severity"] == "warning": - logging.warning(log_message) - else: - logging.info(log_message) - else: - logging.info(f"Checkpoint completed with result: {checkpoint_result}") - def format_issue_message(self, issue): return f"Checkpoint Issue: {issue['message']} at line {issue.get('line_number', 'N/A')} (Severity: {issue['severity']})" diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py new file mode 100644 index 00000000..801aaed8 --- /dev/null +++ b/digital_land/phase/post_conversion.py @@ -0,0 +1,38 @@ +from expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + """ + Initializes the PostConversionPhase with necessary parameters. + :param converted_resource_path: Path to the converted CSV file. + :param output_dir: Directory to store output files. + :param dataset: Dataset related information for the checkpoint. 
+ :param typology: Typology information for the checkpoint. + :param act_on_critical_error: Whether to act on critical errors during the checkpoint. + """ + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 37c2ac04..13ab54c0 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -155,7 +155,7 @@ def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): def test_check_for_duplicate_references(csv_path): - issues = check_for_duplicate_references(csv_path) + _, _, issues = check_for_duplicate_references(csv_path) assert issues, "The function should successfully identify issues." assert len(issues) == 1, "There should be one issue identified." @@ -168,7 +168,7 @@ def test_check_for_duplicate_references(csv_path): def test_validate_references(csv_path): - issues = validate_references(csv_path) + _, _, issues = validate_references(csv_path) assert issues, "The function should fail due to invalid references." assert len(issues) == 1, "There should be one issue identified." From 4338c8babe37cea958645376e525bdc5b76a2d44 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 12:49:24 +0100 Subject: [PATCH 41/58] Import change --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 801aaed8..e312644d 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,4 +1,4 @@ -from expectations.commands import run_converted_resource_checkpoint +from ..expectations.commands import run_converted_resource_checkpoint class PostConversionPhase: From 0851420efaa94106fc8212bba132869f3ead74b8 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:03:10 +0100 Subject: [PATCH 42/58] Parameter changes --- digital_land/commands.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07d7c488..ca9224cf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,6 +128,7 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) @@ -164,12 +165,8 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - converted_resource_path=os.path.join( - custom_temp_dir, f"{resource}_converted.csv" - ), - output_dir=os.path.join( - os.path.dirname(output_path), "post_conversion_outputs" - ), + converted_resource_path=converted_resource_path, + output_dir=output_path, dataset=dataset, typology=specification.get_typology_for_dataset(dataset), ), From 
0db28da558a817ba4feae69cb6d1b3b8a5f8f09d Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:39:52 +0100 Subject: [PATCH 43/58] Changes to convert --- digital_land/commands.py | 11 +++++++++++ digital_land/phase/convert.py | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/digital_land/commands.py b/digital_land/commands.py index ca9224cf..1c719b63 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -158,6 +158,17 @@ def pipeline_run( if entry_date: default_values["entry-date"] = entry_date + convert_phase = ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ) + + # Execute the ConvertPhase to set the converted_resource_path + convert_phase.process() + converted_resource_path = convert_phase.converted_resource_path + run_pipeline( ConvertPhase( path=input_path, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index b57c22c1..9cd99f45 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,6 +125,9 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" + self.converted_resource_path = ( + None # This will hold the path to the converted file + ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: @@ -155,6 +158,8 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) + if self.output_path: + self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From e552ff51a9739ce767c716aa1df09e16f1bc0802 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:43:39 +0100 Subject: [PATCH 44/58] Fix --- digital_land/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 1c719b63..8c4767dc 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -128,7 +128,6 @@ def pipeline_run( issue_log = IssueLog(dataset=dataset, resource=resource) column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - converted_resource_path = custom_temp_dir / f"{resource}_converted.csv" # load pipeline configuration skip_patterns = pipeline.skip_patterns(resource) From 965d1bce843e233825717d37d28a36287b75cd7a Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:49:43 +0100 Subject: [PATCH 45/58] Typology change --- digital_land/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 8c4767dc..9062c36c 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -178,7 +178,7 @@ def pipeline_run( converted_resource_path=converted_resource_path, output_dir=output_path, dataset=dataset, - typology=specification.get_typology_for_dataset(dataset), + typology=specification.get_dataset_typology(dataset), ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), From 13df751bfe3f90711035462f9861e72ae99cb6c3 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 13:55:08 +0100 Subject: [PATCH 46/58] Add Process --- digital_land/commands.py | 5 ----- digital_land/phase/post_conversion.py | 11 +++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py 
index 9062c36c..a0f3fc26 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -169,11 +169,6 @@ def pipeline_run( converted_resource_path = convert_phase.converted_resource_path run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), PostConversionPhase( converted_resource_path=converted_resource_path, output_dir=output_path, diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index e312644d..2216f8dd 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -10,20 +10,15 @@ def __init__( typology, act_on_critical_error=False, ): - """ - Initializes the PostConversionPhase with necessary parameters. - :param converted_resource_path: Path to the converted CSV file. - :param output_dir: Directory to store output files. - :param dataset: Dataset related information for the checkpoint. - :param typology: Typology information for the checkpoint. - :param act_on_critical_error: Whether to act on critical errors during the checkpoint. - """ self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error + def process(self): + return self.run() + def run(self): """ Executes the converted resource checkpoint using the provided parameters. From eb3b67ed7bce0b8eabc1b351dbe58a0015057397 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:00:24 +0100 Subject: [PATCH 47/58] Add process parameter --- digital_land/phase/post_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 2216f8dd..00dcdd77 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -16,7 +16,7 @@ def __init__( self.typology = typology self.act_on_critical_error = act_on_critical_error - def process(self): + def process(self, stream=None): return self.run() def run(self): From d7fc4f7ef8740660f938423658b1aa0bd42d237e Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:09:35 +0100 Subject: [PATCH 48/58] Query runner adjustments --- .../expectation_functions/resource_validations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index 2acbe669..c6acae74 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,7 +1,7 @@ import csv -def check_for_duplicate_references(csv_path): +def check_for_duplicate_references(csv_path, **kwargs): duplicates = {} issues = [] with csv_path.open(newline="") as csvfile: @@ -30,7 +30,7 @@ def check_for_duplicate_references(csv_path): return True, "Checked for duplicate references.", issues -def validate_references(csv_path): +def validate_references(csv_path, **kwargs): issues = [] with csv_path.open(newline="") as csvfile: reader = csv.DictReader(csvfile) From b5ebc71a2d6862c6007b86e1731e6a02b602efa2 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Tue, 16 Apr 2024 14:15:47 +0100 Subject: [PATCH 49/58] Fix converted resource --- digital_land/expectations/checkpoints/converted_resource.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 59c1c307..14be3c21 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -14,12 +14,14 @@ "name": "Check for Duplicate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, { "function": validate_references, "name": "Validate References", "severity": "error", "responsibility": "system", + "csv_path": None, }, ] @@ -49,6 +51,7 @@ def load(self): # Assign a QueryRunner instance to each expectation for expectation in self.expectations: + expectation["csv_path"] = self.csv_path expectation["query_runner"] = QueryRunner(self.csv_path) def save(self, output_dir, format="csv"): From 7b60741395410f21956ce804562eecb6a8b09239 Mon Sep 17 00:00:00 2001 From: James Bannister Date: Wed, 17 Apr 2024 16:33:12 +0100 Subject: [PATCH 50/58] Change pathing --- digital_land/commands.py | 1647 +++++++++++++++++++------------------- 1 file changed, 821 insertions(+), 826 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index a0f3fc26..07befebf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,826 +1,821 @@ -from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the 
sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - convert_phase = ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ) - - # Execute the ConvertPhase to set the converted_resource_path - convert_phase.process() - converted_resource_path = convert_phase.converted_resource_path - - 
run_pipeline( - PostConversionPhase( - converted_resource_path=converted_resource_path, - output_dir=output_path, - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if 
isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." 
- ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." 
- ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=output_path, + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + NormalisePhase(skip_patterns=skip_patterns, 
null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from {csv_path}") + sys.exit(-1) + + 
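For context, the run_pipeline call in pipeline_run above relies on a chaining contract that is not shown in this patch: run_pipeline is imported from digital_land.pipeline, and each phase exposes process(stream) and hands a stream on to the next phase (the PostConversionPhase fix in PATCH 51 later in this series makes that explicit). A minimal sketch of that contract, with illustrative names only and no claim to match the real implementation, is:

    # Sketch only: the real run_pipeline lives in digital_land.pipeline and is not
    # part of this patch. It assumes each phase yields rows for the next phase.
    class PassThroughPhase:
        def process(self, stream):
            for row in stream:
                yield row

    def run_pipeline_sketch(*phases, stream=()):
        # fold the stream through each phase in order
        for phase in phases:
            stream = phase.process(stream)
        # drain the final stream so side-effecting phases (e.g. SavePhase) run
        for _ in stream:
            pass

    run_pipeline_sketch(PassThroughPhase(), PassThroughPhase(), stream=[{"reference": "1"}])
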
flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
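As an aside on the entity_num_gen state seeded in assign_entities above ("current", "range_min" and "range_max" taken from the specification): the generator itself belongs to digital_land.pipeline.Lookups and is not shown in this patch. A hedged sketch of the allocation behaviour that state implies, using made-up numbers and an illustrative class name, is:

    # Assumption: new entity numbers are allocated sequentially within the
    # dataset's [range_min, range_max] window, starting after "current".
    # The real generator in digital_land.pipeline may differ.
    class EntityNumGenSketch:
        def __init__(self, range_min, range_max, current):
            self.state = {"range_min": range_min, "range_max": range_max, "current": current}

        def next(self):
            candidate = max(self.state["current"] + 1, self.state["range_min"])
            if candidate > self.state["range_max"]:
                raise ValueError("entity number range exhausted for this dataset")
            self.state["current"] = candidate
            return candidate

    gen = EntityNumGenSketch(range_min=1000, range_max=1999, current=1003)
    assert gen.next() == 1004
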
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") From 954735a364dda5260c23faaec664559a2b3750a3 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:35:40 +0100 Subject: [PATCH 51/58] Set field name of items in ValueIssue. Small fixes to PostConversionPhase. --- digital_land/commands.py | 2 +- digital_land/expectations/issue.py | 4 ++-- digital_land/phase/post_conversion.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 07befebf..f9aa4fcf 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -166,7 +166,7 @@ def pipeline_run( ), PostConversionPhase( converted_resource_path=input_path, - output_dir=output_path, + output_dir=os.path.dirname(output_path), dataset=dataset, typology=specification.get_dataset_typology(dataset), ), diff --git a/digital_land/expectations/issue.py b/digital_land/expectations/issue.py index 68cd0ae8..f68b3e47 100644 --- a/digital_land/expectations/issue.py +++ b/digital_land/expectations/issue.py @@ -120,8 +120,8 @@ class ValueIssue(Issue): scope: str dataset: str table_name: str = field(metadata=config(field_name="table-name")) - field_name: str - row_id: str + field_name: str = field(metadata=config(field_name="field-name")) + row_id: str = field(metadata=config(field_name="row-id")) value: str organisation: str diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 00dcdd77..9fc1eec3 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -17,7 +17,8 @@ def __init__( self.act_on_critical_error = act_on_critical_error def process(self, stream=None): - return self.run() + self.run() + return stream def run(self): """ From 18b9b34768855caa2b8d0f310170288b6a3f2442 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:48:33 +0100 Subject: [PATCH 52/58] Converted file to unix format (so they diff easier with main) --- digital_land/commands.py | 1642 ++++++++--------- .../resource_validations.py | 106 +- digital_land/phase/post_conversion.py | 68 +- 3 files changed, 908 insertions(+), 908 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index f9aa4fcf..d7730d15 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1,821 +1,821 @@ 
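The PostConversionPhase change in PATCH 51 above is worth restating before the long whole-file hunk that follows: process() now runs the post-conversion checks for their side effects and returns the incoming stream unchanged, so the later phases (NormalisePhase onwards) still receive rows. A simplified sketch of that pattern (the real class, as called in pipeline_run, takes converted_resource_path, output_dir, dataset and typology):

    class PostConversionPhaseSketch:
        """Simplified: run checks for their side effects, pass the stream through."""

        def __init__(self, checks=()):
            self.checks = checks

        def run(self):
            # run validation expectations against the converted resource
            for check in self.checks:
                check()

        def process(self, stream=None):
            self.run()        # side effects only (issue logging etc.)
            return stream     # unchanged, so the next phase keeps iterating rows

    rows = PostConversionPhaseSketch().process(iter([{"reference": "1"}]))
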
-from collections import OrderedDict -import csv -import itertools -import os -import sys -import json -import logging -from pathlib import Path - -import geojson -import shapely - -from digital_land.specification import Specification -from digital_land.collect import Collector -from digital_land.collection import Collection, resource_path -from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog -from digital_land.organisation import Organisation -from digital_land.package.dataset import DatasetPackage -from digital_land.phase.combine import FactCombinePhase -from digital_land.phase.concat import ConcatFieldPhase -from digital_land.phase.convert import ConvertPhase, execute -from digital_land.phase.post_conversion import PostConversionPhase -from digital_land.phase.default import DefaultPhase -from digital_land.phase.dump import DumpPhase -from digital_land.phase.factor import FactorPhase -from digital_land.phase.filter import FilterPhase -from digital_land.phase.harmonise import HarmonisePhase -from digital_land.phase.lookup import ( - EntityLookupPhase, - FactLookupPhase, - PrintLookupPhase, -) -from digital_land.phase.map import MapPhase -from digital_land.phase.migrate import MigratePhase -from digital_land.phase.normalise import NormalisePhase -from digital_land.phase.organisation import OrganisationPhase -from digital_land.phase.parse import ParsePhase -from digital_land.phase.patch import PatchPhase -from digital_land.phase.pivot import PivotPhase -from digital_land.phase.prefix import EntityPrefixPhase -from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase -from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase -from digital_land.phase.save import SavePhase -from digital_land.pipeline import run_pipeline, Lookups, Pipeline -from digital_land.schema import Schema -from digital_land.update import add_source_endpoint -from .register import hash_value - -logger = logging.getLogger(__name__) - - -def fetch(url, pipeline): - """fetch a single source endpoint URL, and add it to the collection""" - collector = Collector(pipeline.name) - collector.fetch(url) - - -def collect(endpoint_path, collection_dir, pipeline): - """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file""" - collector = Collector(pipeline.name, Path(collection_dir)) - collector.collect(endpoint_path) - - -# -# collection commands -# TBD: make sub commands -# -def collection_list_resources(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - for resource in sorted(collection.resource.records): - print(resource_path(resource, directory=collection_dir)) - - -def collection_pipeline_makerules(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.pipeline_makerules() - - -def collection_save_csv(collection_dir): - collection = Collection(name=None, directory=collection_dir) - collection.load() - collection.update() - collection.save_csv() - - -# -# pipeline commands -# -def convert(input_path, output_path, custom_temp_dir=None): - if not output_path: - output_path = default_output_path("converted", input_path) - dataset_resource_log = DatasetResourceLog() - run_pipeline( - ConvertPhase( - input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - DumpPhase(output_path), - ) - dataset_resource_log.save(f=sys.stdout) - - -def pipeline_run( - dataset, - pipeline, - 
specification, - input_path, - output_path, - collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date - null_path=None, # TBD: remove this - issue_dir=None, - organisation_path=None, - save_harmonised=False, - column_field_dir=None, - dataset_resource_dir=None, - custom_temp_dir=None, # TBD: rename to "tmpdir" - endpoints=[], - organisations=[], - entry_date="", -): - resource = resource_from_path(input_path) - dataset = dataset - schema = specification.pipeline[pipeline.name]["schema"] - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - issue_log = IssueLog(dataset=dataset, resource=resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - - # load pipeline configuration - skip_patterns = pipeline.skip_patterns(resource) - columns = pipeline.columns(resource, endpoints=endpoints) - concats = pipeline.concatenations(resource, endpoints=endpoints) - patches = pipeline.patches(resource=resource) - lookups = pipeline.lookups(resource=resource) - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=endpoints) - combine_fields = pipeline.combine_fields(endpoints=endpoints) - - # load organisations - organisation = Organisation(organisation_path, Path(pipeline.path)) - - # load the resource default values from the collection - if not endpoints: - collection = Collection(name=None, directory=collection_dir) - collection.load() - endpoints = collection.resource_endpoints(resource) - organisations = collection.resource_organisations(resource) - entry_date = collection.resource_start_date(resource) - - # resource specific default values - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - if entry_date: - default_values["entry-date"] = entry_date - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=DatasetResourceLog(), - custom_temp_dir=custom_temp_dir, - output_path=output_path, - ), - PostConversionPhase( - converted_resource_path=input_path, - output_dir=os.path.dirname(output_path), - dataset=dataset, - typology=specification.get_dataset_typology(dataset), - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - dataset=dataset, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - EntityLookupPhase(lookups), - SavePhase( - default_output_path("harmonised", input_path), - 
fieldnames=intermediate_fieldnames, - enabled=save_harmonised, - ), - EntityPrunePhase( - issue_log=issue_log, dataset_resource_log=dataset_resource_log - ), - PivotPhase(), - FactCombinePhase(issue_log=issue_log, fields=combine_fields), - FactorPhase(), - FactReferencePhase( - field_typology_map=specification.get_field_typology_map(), - field_prefix_map=specification.get_field_prefix_map(), - ), - FactLookupPhase(lookups), - FactPrunePhase(), - SavePhase( - output_path, - fieldnames=specification.factor_fieldnames(), - ), - ) - - issue_log.save(os.path.join(issue_dir, resource + ".csv")) - column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) - dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) - - -# -# build dataset from processed resources -# -def dataset_create( - input_paths, - output_path, - organisation_path, - pipeline, - dataset, - specification, - issue_dir="issue", -): - if not output_path: - print("missing output path", file=sys.stderr) - sys.exit(2) - organisation = Organisation(organisation_path, Path(pipeline.path)) - package = DatasetPackage( - dataset, - organisation=organisation, - path=output_path, - specification_dir=None, # TBD: package should use this specification object - ) - package.create() - for path in input_paths: - package.load_transformed(path) - package.load_entities() - - old_entity_path = os.path.join(pipeline.path, "old-entity.csv") - if os.path.exists(old_entity_path): - package.load_old_entities(old_entity_path) - - issue_paths = os.path.join(issue_dir, dataset) - if os.path.exists(issue_paths): - for issue_path in os.listdir(issue_paths): - package.load_issues(os.path.join(issue_paths, issue_path)) - else: - logging.warning("No directory for this dataset in the provided issue_directory") - - package.add_counts() - - -def dataset_dump(input_path, output_path): - cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" - logging.info(cmd) - os.system(cmd) - - -def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): - if isinstance(csv_path, str): - path = Path(csv_path) - dataset_name = path.stem - elif isinstance(csv_path, Path): - dataset_name = csv_path.stem - else: - logging.error(f"Can't extract datapackage name from {csv_path}") - sys.exit(-1) - - flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") - with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: - reader = csv.DictReader(read_file) - - spec_field_names = [ - field - for field in itertools.chain( - *[ - specification.current_fieldnames(schema) - for schema in specification.dataset_schema[dataset] - ] - ) - ] - reader_fieldnames = [ - field.replace("_", "-") - for field in list(reader.fieldnames) - if field != "json" - ] - - flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) - # Make sure we put flattened fieldnames last - field_names = reader_fieldnames + sorted(list(flattened_field_names)) - - writer = csv.DictWriter(write_file, fieldnames=field_names) - writer.writeheader() - entities = [] - for row in reader: - row.pop("geojson", None) - row = OrderedDict(row) - json_string = row.pop("json") or "{}" - row.update(json.loads(json_string)) - kebab_case_row = dict( - [(key.replace("_", "-"), val) for key, val in row.items()] - ) - writer.writerow(kebab_case_row) - entities.append(kebab_case_row) - - # write the entities to json file as well - flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") - 
with open(flattened_json_path, "w") as out_json: - out_json.write(json.dumps({"entities": entities})) - batch_size = 100000 - temp_geojson_files = [] - geography_entities = [e for e in entities if e["typology"] == "geography"] - for i in range(0, len(geography_entities), batch_size): - batch = geography_entities[i : i + batch_size] - feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) - - geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") - temp_geojson_files.append(geojson_path) - try: - with open(geojson_path, "w", encoding="utf-8") as out_geojson: - out_geojson.write(geojson.dumps(feature_collection)) - except Exception as e: - logging.error(f"Error writing to GeoJSON file: {e}") - - if all(os.path.isfile(path) for path in temp_geojson_files): - rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") - for temp_path in temp_geojson_files: - responseCode, _, _ = execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-lco", - "RFC7946=YES", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - - if responseCode != 0: - logging.error( - "Could not generate rfc7946 compliant geojson. Use existing file." - ) - execute( - [ - "ogr2ogr", - "-f", - "GeoJSON", - "-append", - rfc7946_geojson_path, - temp_path, - ] - ) - # clear up input geojson file - if os.path.isfile(temp_path): - os.remove(temp_path) - - -# -# configuration commands -# -def collection_add_source(entry, collection, endpoint_url, collection_dir): - """ - followed by a sequence of optional name and value pairs including the following names: - "attribution", "licence", "pipelines", "status", "plugin", - "parameters", "start-date", "end-date" - """ - entry["collection"] = collection - entry["endpoint-url"] = endpoint_url - allowed_names = set( - list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) - ) - for key in entry.keys(): - if key not in allowed_names: - logging.error(f"unrecognised argument '{key}'") - sys.exit(2) - add_source_endpoint(entry, directory=collection_dir) - - -def add_endpoints_and_lookups( - csv_file_path, - collection_name, - collection_dir, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", -): - """ - :param csv_file_path: - :param collection_name: - :param collection_dir: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - "endpoint-url", - "start-date", - "licence", - ] - - licence_csv_path = os.path.join(specification_dir, "licence.csv") - valid_licenses = [] - with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - valid_licenses = [row["licence"] for row in reader] - - # need to get collection name from somewhere - # collection name is NOT the dataset name - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - - # read and process each record of the new endpoints csv at csv_file_path - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - # this is not perfect we should riase validation errors in our code and below should include a try and except statement - endpoints = [] - for row in reader: - if 
row["licence"] not in valid_licenses: - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - if not row["documentation-url"].strip(): - raise ValueError( - "The 'documentation-url' must be populated for each row." - ) - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - - # endpoints have been added now lets collect the resources using the endpoint information - collector = Collector(collection_dir=collection_dir) - - for endpoint in endpoints: - collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - ) - # reload log items - collection.load_log_items() - - dataset_resource_map = collection.dataset_resource_map() - - # searching for the specific resources that we have downloaded - for dataset in dataset_resource_map: - resources_to_assign = [] - for resource in dataset_resource_map[dataset]: - resource_endpoints = collection.resource_endpoints(resource) - if any( - endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] - for endpoint in resource_endpoints - ): - resource_file_path = Path(collection_dir) / "resource" / resource - resources_to_assign.append(resource_file_path) - assign_entities( - resource_file_paths=resources_to_assign, - collection=collection, - pipeline_dir=pipeline_dir, - specification_dir=specification_dir, - organisation_path=organisation_path, - tmp_dir=tmp_dir, - dataset=dataset, - ) - - -def resource_from_path(path): - return Path(path).stem - - -def default_output_path(command, input_path): - directory = "" if command in ["harmonised", "transformed"] else "var/" - return f"{directory}{command}/{resource_from_path(input_path)}.csv" - - -def assign_entities( - resource_file_paths, - collection, - pipeline_dir, - specification_dir, - organisation_path, - tmp_dir="./var/cache", - dataset=None, -): - """ - Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection - :param resource_file_paths: - :param collection: - :param pipeline_dir: - :param specification_dir: - :param organisation_path: - :param tmp_dir: - :return: - """ - - specification = Specification(specification_dir) - - print("") - print("======================================================================") - print("New Lookups") - print("======================================================================") - - dataset_resource_map = collection.dataset_resource_map() - new_lookups = [] - - pipeline_name = None - # establish pipeline if dataset is known - else have to find dataset for each resource - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - - for resource_file_path in resource_file_paths: - resource = os.path.splitext(os.path.basename(resource_file_path))[0] - # Find dataset for resource if not given - if dataset is None: - for dataset_key, resources in dataset_resource_map.items(): - if resource in list(resources): - dataset = dataset_key - continue - # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline - if dataset is not None: - pipeline = Pipeline(pipeline_dir, dataset) - pipeline_name = pipeline.name - else: - logging.error( - "Resource '%s' has not been processed by pipeline - no lookups added" - % (resource) - ) - break - - resource_lookups = get_resource_unidentified_lookups( - input_path=Path(resource_file_path), - dataset=dataset, - organisations=collection.resource_organisations(resource), - pipeline=pipeline, - specification=specification, - tmp_dir=Path(tmp_dir).absolute(), - org_csv_path=organisation_path, - ) - new_lookups.append(resource_lookups) - - if pipeline_name is not None: - # save new lookups to file - lookups = Lookups(pipeline_dir) - # Check if the lookups file exists, create it if not - if not os.path.exists(lookups.lookups_path): - with open(lookups.lookups_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(list(lookups.schema.fieldnames)) - - lookups.load_csv() - for new_lookup in new_lookups: - for idx, entry in enumerate(new_lookup): - lookups.add_entry(entry[0]) - - # save edited csvs - max_entity_num = lookups.get_max_entity(pipeline_name) - lookups.entity_num_gen.state["current"] = max_entity_num - lookups.entity_num_gen.state["range_max"] = ( - specification.get_dataset_entity_max(pipeline_name) - ) - lookups.entity_num_gen.state["range_min"] = ( - specification.get_dataset_entity_min(pipeline_name) - ) - - # TO DO: Currently using pipeline_name to find dataset min, max, current - # This would not function properly if each resource had a different dataset - - collection.save_csv() - new_lookups = lookups.save_csv() - - for entity in new_lookups: - print( - entity["prefix"], - ",", - entity["organisation"], - ",", - entity["reference"], - ",", - entity["entity"], - ) - - -def get_resource_unidentified_lookups( - input_path: Path, - dataset: str, - pipeline: Pipeline, - specification: Specification, - organisations: list = [], - tmp_dir: Path = None, - org_csv_path: Path = None, -): - # convert phase inputs - # could alter resource_from_path to file from path and promote to a utils folder - resource = resource_from_path(input_path) - dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) - custom_temp_dir = tmp_dir # './var' - - print("") - print("----------------------------------------------------------------------") - 
print(f">>> organisations:{organisations}") - print(f">>> resource:{resource}") - print("----------------------------------------------------------------------") - - # normalise phase inputs - skip_patterns = pipeline.skip_patterns(resource) - null_path = None - - # concat field phase - concats = pipeline.concatenations(resource) - column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) - - # map phase - intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) - columns = pipeline.columns(resource) - - # patch phase - patches = pipeline.patches(resource=resource) - - # harmonize phase - issue_log = IssueLog(dataset=dataset, resource=resource) - - # default phase - default_fields = pipeline.default_fields(resource=resource) - default_values = pipeline.default_values(endpoints=[]) - - if len(organisations) == 1: - default_values["organisation"] = organisations[0] - - # migrate phase - schema = specification.pipeline[pipeline.name]["schema"] - - # organisation phase - organisation = Organisation(org_csv_path, Path(pipeline.path)) - - # print lookups phase - pipeline_lookups = pipeline.lookups() - redirect_lookups = pipeline.redirect_lookups() - print_lookup_phase = PrintLookupPhase( - lookups=pipeline_lookups, redirect_lookups=redirect_lookups - ) - - run_pipeline( - ConvertPhase( - path=input_path, - dataset_resource_log=dataset_resource_log, - custom_temp_dir=custom_temp_dir, - ), - NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), - ParsePhase(), - ConcatFieldPhase(concats=concats, log=column_field_log), - MapPhase( - fieldnames=intermediate_fieldnames, - columns=columns, - log=column_field_log, - ), - FilterPhase(filters=pipeline.filters(resource)), - PatchPhase( - issues=issue_log, - patches=patches, - ), - HarmonisePhase( - field_datatype_map=specification.get_field_datatype_map(), - issues=issue_log, - ), - DefaultPhase( - default_fields=default_fields, - default_values=default_values, - issues=issue_log, - ), - # TBD: move migrating columns to fields to be immediately after map - # this will simplify harmonisation and remove intermediate_fieldnames - # but effects brownfield-land and other pipelines which operate on columns - MigratePhase( - fields=specification.schema_field[schema], - migrations=pipeline.migrations(), - ), - OrganisationPhase(organisation=organisation, issues=issue_log), - FieldPrunePhase(fields=specification.current_fieldnames(schema)), - EntityReferencePhase( - dataset=dataset, - prefix=specification.dataset_prefix(dataset), - ), - EntityPrefixPhase(dataset=dataset), - print_lookup_phase, - ) - - return print_lookup_phase.new_lookup_entries - - -def process_data_in_batches(entities, flattened_dir, dataset_name): - features = [] - feature_collection = "" - for entity in entities: - geom = entity.pop("geometry") - point = entity.pop("point") - if geom: - try: - geometry = shapely.wkt.loads(geom) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - elif point: - try: - geometry = shapely.wkt.loads(point) - feature = geojson.Feature(geometry=geometry, properties=entity) - features.append(feature) - except Exception as e: - logging.error(f"Error loading wkt from entity {entity['entity']}") - logging.error(e) - else: - logging.error( - f"No geometry or point data for entity {entity['entity']} with typology 'geography'" - ) - - if features: - feature_collection = 
geojson.FeatureCollection( - features=features, name=dataset_name - ) - - return feature_collection - - -def add_redirections(csv_file_path, pipeline_dir): - """ - :param csv_file_path: - :param pipeline_dir: - :return: - """ - expected_cols = [ - "entity_source", - "entity_destination", - ] - - old_entity_path = Path(pipeline_dir) / "old-entity.csv" - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") - - fieldnames = ["old-entity", "status", "entity"] - - f = open(old_entity_path, "a", newline="") - writer = csv.DictWriter(f, fieldnames=fieldnames) - if f.tell() == 0: - writer.writeheader() - - for row in reader: - if row["entity_source"] == "" or row["entity_destination"] == "": - print( - "Missing entity number for", - ( - row["entity_destination"] - if row["entity_source"] == "" - else row["entity_source"] - ), - ) - else: - writer.writerow( - { - "old-entity": row["entity_source"], - "status": "301", - "entity": row["entity_destination"], - } - ) - print("Redirections added to old-entity.csv") +from collections import OrderedDict +import csv +import itertools +import os +import sys +import json +import logging +from pathlib import Path + +import geojson +import shapely + +from digital_land.specification import Specification +from digital_land.collect import Collector +from digital_land.collection import Collection, resource_path +from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog +from digital_land.organisation import Organisation +from digital_land.package.dataset import DatasetPackage +from digital_land.phase.combine import FactCombinePhase +from digital_land.phase.concat import ConcatFieldPhase +from digital_land.phase.convert import ConvertPhase, execute +from digital_land.phase.post_conversion import PostConversionPhase +from digital_land.phase.default import DefaultPhase +from digital_land.phase.dump import DumpPhase +from digital_land.phase.factor import FactorPhase +from digital_land.phase.filter import FilterPhase +from digital_land.phase.harmonise import HarmonisePhase +from digital_land.phase.lookup import ( + EntityLookupPhase, + FactLookupPhase, + PrintLookupPhase, +) +from digital_land.phase.map import MapPhase +from digital_land.phase.migrate import MigratePhase +from digital_land.phase.normalise import NormalisePhase +from digital_land.phase.organisation import OrganisationPhase +from digital_land.phase.parse import ParsePhase +from digital_land.phase.patch import PatchPhase +from digital_land.phase.pivot import PivotPhase +from digital_land.phase.prefix import EntityPrefixPhase +from digital_land.phase.prune import FieldPrunePhase, EntityPrunePhase, FactPrunePhase +from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase +from digital_land.phase.save import SavePhase +from digital_land.pipeline import run_pipeline, Lookups, Pipeline +from digital_land.schema import Schema +from digital_land.update import add_source_endpoint +from .register import hash_value + +logger = logging.getLogger(__name__) + + +def fetch(url, pipeline): + """fetch a single source endpoint URL, and add it to the collection""" + collector = Collector(pipeline.name) + collector.fetch(url) + + +def collect(endpoint_path, collection_dir, pipeline): + """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH 
CSV file""" + collector = Collector(pipeline.name, Path(collection_dir)) + collector.collect(endpoint_path) + + +# +# collection commands +# TBD: make sub commands +# +def collection_list_resources(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + for resource in sorted(collection.resource.records): + print(resource_path(resource, directory=collection_dir)) + + +def collection_pipeline_makerules(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.pipeline_makerules() + + +def collection_save_csv(collection_dir): + collection = Collection(name=None, directory=collection_dir) + collection.load() + collection.update() + collection.save_csv() + + +# +# pipeline commands +# +def convert(input_path, output_path, custom_temp_dir=None): + if not output_path: + output_path = default_output_path("converted", input_path) + dataset_resource_log = DatasetResourceLog() + run_pipeline( + ConvertPhase( + input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + DumpPhase(output_path), + ) + dataset_resource_log.save(f=sys.stdout) + + +def pipeline_run( + dataset, + pipeline, + specification, + input_path, + output_path, + collection_dir="./collection", # TBD: remove, replaced by endpoints, organisations and entry_date + null_path=None, # TBD: remove this + issue_dir=None, + organisation_path=None, + save_harmonised=False, + column_field_dir=None, + dataset_resource_dir=None, + custom_temp_dir=None, # TBD: rename to "tmpdir" + endpoints=[], + organisations=[], + entry_date="", +): + resource = resource_from_path(input_path) + dataset = dataset + schema = specification.pipeline[pipeline.name]["schema"] + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + issue_log = IssueLog(dataset=dataset, resource=resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + + # load pipeline configuration + skip_patterns = pipeline.skip_patterns(resource) + columns = pipeline.columns(resource, endpoints=endpoints) + concats = pipeline.concatenations(resource, endpoints=endpoints) + patches = pipeline.patches(resource=resource) + lookups = pipeline.lookups(resource=resource) + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=endpoints) + combine_fields = pipeline.combine_fields(endpoints=endpoints) + + # load organisations + organisation = Organisation(organisation_path, Path(pipeline.path)) + + # load the resource default values from the collection + if not endpoints: + collection = Collection(name=None, directory=collection_dir) + collection.load() + endpoints = collection.resource_endpoints(resource) + organisations = collection.resource_organisations(resource) + entry_date = collection.resource_start_date(resource) + + # resource specific default values + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + if entry_date: + default_values["entry-date"] = entry_date + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=DatasetResourceLog(), + custom_temp_dir=custom_temp_dir, + output_path=output_path, + ), + PostConversionPhase( + converted_resource_path=input_path, + output_dir=os.path.dirname(output_path), + dataset=dataset, + typology=specification.get_dataset_typology(dataset), + ), + 
NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + dataset=dataset, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + EntityLookupPhase(lookups), + SavePhase( + default_output_path("harmonised", input_path), + fieldnames=intermediate_fieldnames, + enabled=save_harmonised, + ), + EntityPrunePhase( + issue_log=issue_log, dataset_resource_log=dataset_resource_log + ), + PivotPhase(), + FactCombinePhase(issue_log=issue_log, fields=combine_fields), + FactorPhase(), + FactReferencePhase( + field_typology_map=specification.get_field_typology_map(), + field_prefix_map=specification.get_field_prefix_map(), + ), + FactLookupPhase(lookups), + FactPrunePhase(), + SavePhase( + output_path, + fieldnames=specification.factor_fieldnames(), + ), + ) + + issue_log.save(os.path.join(issue_dir, resource + ".csv")) + column_field_log.save(os.path.join(column_field_dir, resource + ".csv")) + dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv")) + + +# +# build dataset from processed resources +# +def dataset_create( + input_paths, + output_path, + organisation_path, + pipeline, + dataset, + specification, + issue_dir="issue", +): + if not output_path: + print("missing output path", file=sys.stderr) + sys.exit(2) + organisation = Organisation(organisation_path, Path(pipeline.path)) + package = DatasetPackage( + dataset, + organisation=organisation, + path=output_path, + specification_dir=None, # TBD: package should use this specification object + ) + package.create() + for path in input_paths: + package.load_transformed(path) + package.load_entities() + + old_entity_path = os.path.join(pipeline.path, "old-entity.csv") + if os.path.exists(old_entity_path): + package.load_old_entities(old_entity_path) + + issue_paths = os.path.join(issue_dir, dataset) + if os.path.exists(issue_paths): + for issue_path in os.listdir(issue_paths): + package.load_issues(os.path.join(issue_paths, issue_path)) + else: + logging.warning("No directory for this dataset in the provided issue_directory") + + package.add_counts() + + +def dataset_dump(input_path, output_path): + cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}" + logging.info(cmd) + os.system(cmd) + + +def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset): + if isinstance(csv_path, str): + path = Path(csv_path) + dataset_name = path.stem + elif isinstance(csv_path, Path): + dataset_name = csv_path.stem + else: + logging.error(f"Can't extract datapackage name from 
{csv_path}") + sys.exit(-1) + + flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv") + with open(csv_path, "r") as read_file, open(flattened_csv_path, "w+") as write_file: + reader = csv.DictReader(read_file) + + spec_field_names = [ + field + for field in itertools.chain( + *[ + specification.current_fieldnames(schema) + for schema in specification.dataset_schema[dataset] + ] + ) + ] + reader_fieldnames = [ + field.replace("_", "-") + for field in list(reader.fieldnames) + if field != "json" + ] + + flattened_field_names = set(spec_field_names).difference(set(reader_fieldnames)) + # Make sure we put flattened fieldnames last + field_names = reader_fieldnames + sorted(list(flattened_field_names)) + + writer = csv.DictWriter(write_file, fieldnames=field_names) + writer.writeheader() + entities = [] + for row in reader: + row.pop("geojson", None) + row = OrderedDict(row) + json_string = row.pop("json") or "{}" + row.update(json.loads(json_string)) + kebab_case_row = dict( + [(key.replace("_", "-"), val) for key, val in row.items()] + ) + writer.writerow(kebab_case_row) + entities.append(kebab_case_row) + + # write the entities to json file as well + flattened_json_path = os.path.join(flattened_dir, f"{dataset_name}.json") + with open(flattened_json_path, "w") as out_json: + out_json.write(json.dumps({"entities": entities})) + batch_size = 100000 + temp_geojson_files = [] + geography_entities = [e for e in entities if e["typology"] == "geography"] + for i in range(0, len(geography_entities), batch_size): + batch = geography_entities[i : i + batch_size] + feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name) + + geojson_path = os.path.join(flattened_dir, f"{dataset_name}-tmp-{i}.geojson") + temp_geojson_files.append(geojson_path) + try: + with open(geojson_path, "w", encoding="utf-8") as out_geojson: + out_geojson.write(geojson.dumps(feature_collection)) + except Exception as e: + logging.error(f"Error writing to GeoJSON file: {e}") + + if all(os.path.isfile(path) for path in temp_geojson_files): + rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson") + for temp_path in temp_geojson_files: + responseCode, _, _ = execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-lco", + "RFC7946=YES", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + + if responseCode != 0: + logging.error( + "Could not generate rfc7946 compliant geojson. Use existing file." 
+ ) + execute( + [ + "ogr2ogr", + "-f", + "GeoJSON", + "-append", + rfc7946_geojson_path, + temp_path, + ] + ) + # clear up input geojson file + if os.path.isfile(temp_path): + os.remove(temp_path) + + +# +# configuration commands +# +def collection_add_source(entry, collection, endpoint_url, collection_dir): + """ + followed by a sequence of optional name and value pairs including the following names: + "attribution", "licence", "pipelines", "status", "plugin", + "parameters", "start-date", "end-date" + """ + entry["collection"] = collection + entry["endpoint-url"] = endpoint_url + allowed_names = set( + list(Schema("endpoint").fieldnames) + list(Schema("source").fieldnames) + ) + for key in entry.keys(): + if key not in allowed_names: + logging.error(f"unrecognised argument '{key}'") + sys.exit(2) + add_source_endpoint(entry, directory=collection_dir) + + +def add_endpoints_and_lookups( + csv_file_path, + collection_name, + collection_dir, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", +): + """ + :param csv_file_path: + :param collection_name: + :param collection_dir: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + licence_csv_path = os.path.join(specification_dir, "licence.csv") + valid_licenses = [] + with open(licence_csv_path, mode="r", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + valid_licenses = [row["licence"] for row in reader] + + # need to get collection name from somewhere + # collection name is NOT the dataset name + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + + # read and process each record of the new endpoints csv at csv_file_path + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + # this is not perfect we should riase validation errors in our code and below should include a try and except statement + endpoints = [] + for row in reader: + if row["licence"] not in valid_licenses: + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + if not row["documentation-url"].strip(): + raise ValueError( + "The 'documentation-url' must be populated for each row." 
+ ) + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + + # endpoints have been added now lets collect the resources using the endpoint information + collector = Collector(collection_dir=collection_dir) + + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + # reload log items + collection.load_log_items() + + dataset_resource_map = collection.dataset_resource_map() + + # searching for the specific resources that we have downloaded + for dataset in dataset_resource_map: + resources_to_assign = [] + for resource in dataset_resource_map[dataset]: + resource_endpoints = collection.resource_endpoints(resource) + if any( + endpoint in [new_endpoint["endpoint"] for new_endpoint in endpoints] + for endpoint in resource_endpoints + ): + resource_file_path = Path(collection_dir) / "resource" / resource + resources_to_assign.append(resource_file_path) + assign_entities( + resource_file_paths=resources_to_assign, + collection=collection, + pipeline_dir=pipeline_dir, + specification_dir=specification_dir, + organisation_path=organisation_path, + tmp_dir=tmp_dir, + dataset=dataset, + ) + + +def resource_from_path(path): + return Path(path).stem + + +def default_output_path(command, input_path): + directory = "" if command in ["harmonised", "transformed"] else "var/" + return f"{directory}{command}/{resource_from_path(input_path)}.csv" + + +def assign_entities( + resource_file_paths, + collection, + pipeline_dir, + specification_dir, + organisation_path, + tmp_dir="./var/cache", + dataset=None, +): + """ + Assigns entities for the given resources in the given collection. 
The resources must have sources already added to the collection + :param resource_file_paths: + :param collection: + :param pipeline_dir: + :param specification_dir: + :param organisation_path: + :param tmp_dir: + :return: + """ + + specification = Specification(specification_dir) + + print("") + print("======================================================================") + print("New Lookups") + print("======================================================================") + + dataset_resource_map = collection.dataset_resource_map() + new_lookups = [] + + pipeline_name = None + # establish pipeline if dataset is known - else have to find dataset for each resource + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + + for resource_file_path in resource_file_paths: + resource = os.path.splitext(os.path.basename(resource_file_path))[0] + # Find dataset for resource if not given + if dataset is None: + for dataset_key, resources in dataset_resource_map.items(): + if resource in list(resources): + dataset = dataset_key + continue + # Check whether dataset was found in dataset resource map in case resource hasn't been run through pipeline + if dataset is not None: + pipeline = Pipeline(pipeline_dir, dataset) + pipeline_name = pipeline.name + else: + logging.error( + "Resource '%s' has not been processed by pipeline - no lookups added" + % (resource) + ) + break + + resource_lookups = get_resource_unidentified_lookups( + input_path=Path(resource_file_path), + dataset=dataset, + organisations=collection.resource_organisations(resource), + pipeline=pipeline, + specification=specification, + tmp_dir=Path(tmp_dir).absolute(), + org_csv_path=organisation_path, + ) + new_lookups.append(resource_lookups) + + if pipeline_name is not None: + # save new lookups to file + lookups = Lookups(pipeline_dir) + # Check if the lookups file exists, create it if not + if not os.path.exists(lookups.lookups_path): + with open(lookups.lookups_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(list(lookups.schema.fieldnames)) + + lookups.load_csv() + for new_lookup in new_lookups: + for idx, entry in enumerate(new_lookup): + lookups.add_entry(entry[0]) + + # save edited csvs + max_entity_num = lookups.get_max_entity(pipeline_name) + lookups.entity_num_gen.state["current"] = max_entity_num + lookups.entity_num_gen.state["range_max"] = ( + specification.get_dataset_entity_max(pipeline_name) + ) + lookups.entity_num_gen.state["range_min"] = ( + specification.get_dataset_entity_min(pipeline_name) + ) + + # TO DO: Currently using pipeline_name to find dataset min, max, current + # This would not function properly if each resource had a different dataset + + collection.save_csv() + new_lookups = lookups.save_csv() + + for entity in new_lookups: + print( + entity["prefix"], + ",", + entity["organisation"], + ",", + entity["reference"], + ",", + entity["entity"], + ) + + +def get_resource_unidentified_lookups( + input_path: Path, + dataset: str, + pipeline: Pipeline, + specification: Specification, + organisations: list = [], + tmp_dir: Path = None, + org_csv_path: Path = None, +): + # convert phase inputs + # could alter resource_from_path to file from path and promote to a utils folder + resource = resource_from_path(input_path) + dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource) + custom_temp_dir = tmp_dir # './var' + + print("") + print("----------------------------------------------------------------------") + 
print(f">>> organisations:{organisations}") + print(f">>> resource:{resource}") + print("----------------------------------------------------------------------") + + # normalise phase inputs + skip_patterns = pipeline.skip_patterns(resource) + null_path = None + + # concat field phase + concats = pipeline.concatenations(resource) + column_field_log = ColumnFieldLog(dataset=dataset, resource=resource) + + # map phase + intermediate_fieldnames = specification.intermediate_fieldnames(pipeline) + columns = pipeline.columns(resource) + + # patch phase + patches = pipeline.patches(resource=resource) + + # harmonize phase + issue_log = IssueLog(dataset=dataset, resource=resource) + + # default phase + default_fields = pipeline.default_fields(resource=resource) + default_values = pipeline.default_values(endpoints=[]) + + if len(organisations) == 1: + default_values["organisation"] = organisations[0] + + # migrate phase + schema = specification.pipeline[pipeline.name]["schema"] + + # organisation phase + organisation = Organisation(org_csv_path, Path(pipeline.path)) + + # print lookups phase + pipeline_lookups = pipeline.lookups() + redirect_lookups = pipeline.redirect_lookups() + print_lookup_phase = PrintLookupPhase( + lookups=pipeline_lookups, redirect_lookups=redirect_lookups + ) + + run_pipeline( + ConvertPhase( + path=input_path, + dataset_resource_log=dataset_resource_log, + custom_temp_dir=custom_temp_dir, + ), + NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), + ParsePhase(), + ConcatFieldPhase(concats=concats, log=column_field_log), + MapPhase( + fieldnames=intermediate_fieldnames, + columns=columns, + log=column_field_log, + ), + FilterPhase(filters=pipeline.filters(resource)), + PatchPhase( + issues=issue_log, + patches=patches, + ), + HarmonisePhase( + field_datatype_map=specification.get_field_datatype_map(), + issues=issue_log, + ), + DefaultPhase( + default_fields=default_fields, + default_values=default_values, + issues=issue_log, + ), + # TBD: move migrating columns to fields to be immediately after map + # this will simplify harmonisation and remove intermediate_fieldnames + # but effects brownfield-land and other pipelines which operate on columns + MigratePhase( + fields=specification.schema_field[schema], + migrations=pipeline.migrations(), + ), + OrganisationPhase(organisation=organisation, issues=issue_log), + FieldPrunePhase(fields=specification.current_fieldnames(schema)), + EntityReferencePhase( + dataset=dataset, + prefix=specification.dataset_prefix(dataset), + ), + EntityPrefixPhase(dataset=dataset), + print_lookup_phase, + ) + + return print_lookup_phase.new_lookup_entries + + +def process_data_in_batches(entities, flattened_dir, dataset_name): + features = [] + feature_collection = "" + for entity in entities: + geom = entity.pop("geometry") + point = entity.pop("point") + if geom: + try: + geometry = shapely.wkt.loads(geom) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + elif point: + try: + geometry = shapely.wkt.loads(point) + feature = geojson.Feature(geometry=geometry, properties=entity) + features.append(feature) + except Exception as e: + logging.error(f"Error loading wkt from entity {entity['entity']}") + logging.error(e) + else: + logging.error( + f"No geometry or point data for entity {entity['entity']} with typology 'geography'" + ) + + if features: + feature_collection = 
geojson.FeatureCollection( + features=features, name=dataset_name + ) + + return feature_collection + + +def add_redirections(csv_file_path, pipeline_dir): + """ + :param csv_file_path: + :param pipeline_dir: + :return: + """ + expected_cols = [ + "entity_source", + "entity_destination", + ] + + old_entity_path = Path(pipeline_dir) / "old-entity.csv" + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + fieldnames = ["old-entity", "status", "entity"] + + f = open(old_entity_path, "a", newline="") + writer = csv.DictWriter(f, fieldnames=fieldnames) + if f.tell() == 0: + writer.writeheader() + + for row in reader: + if row["entity_source"] == "" or row["entity_destination"] == "": + print( + "Missing entity number for", + ( + row["entity_destination"] + if row["entity_source"] == "" + else row["entity_source"] + ), + ) + else: + writer.writerow( + { + "old-entity": row["entity_source"], + "status": "301", + "entity": row["entity_destination"], + } + ) + print("Redirections added to old-entity.csv") diff --git a/digital_land/expectations/expectation_functions/resource_validations.py b/digital_land/expectations/expectation_functions/resource_validations.py index c6acae74..6f6d0520 100644 --- a/digital_land/expectations/expectation_functions/resource_validations.py +++ b/digital_land/expectations/expectation_functions/resource_validations.py @@ -1,53 +1,53 @@ -import csv - - -def check_for_duplicate_references(csv_path, **kwargs): - duplicates = {} - issues = [] - with csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: - issues.append( - { - "scope": "row-group", - "message": f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", - "dataset": "dataset", - "table_name": "resource", - "rows": rows, - "row_id": str(rows[0]), - "organisation": "organisation", - } - ) - - return True, "Checked for duplicate references.", issues - - -def validate_references(csv_path, **kwargs): - issues = [] - with csv_path.open(newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if not ref: # This will be True for both None and empty strings - issues.append( - { - "scope": "value", - "message": f"Reference is missing on row {row_number}.", - "dataset": "dataset", - "table_name": "resource", - "field_name": "reference", - "row_id": str(row_number), - "value": "Missing", - "organisation": "organisation", - } - ) - - return len(issues) == 0, "Checked for unpopulated references.", issues +import csv + + +def check_for_duplicate_references(csv_path, **kwargs): + duplicates = {} + issues = [] + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + issues.append( + { + "scope": "row-group", + "message": f"Duplicate reference '{ref}' found on rows: {', 
'.join(map(str, rows))}", + "dataset": "dataset", + "table_name": "resource", + "rows": rows, + "row_id": str(rows[0]), + "organisation": "organisation", + } + ) + + return True, "Checked for duplicate references.", issues + + +def validate_references(csv_path, **kwargs): + issues = [] + with csv_path.open(newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not ref: # This will be True for both None and empty strings + issues.append( + { + "scope": "value", + "message": f"Reference is missing on row {row_number}.", + "dataset": "dataset", + "table_name": "resource", + "field_name": "reference", + "row_id": str(row_number), + "value": "Missing", + "organisation": "organisation", + } + ) + + return len(issues) == 0, "Checked for unpopulated references.", issues diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 9fc1eec3..f0a072e1 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,34 +1,34 @@ -from ..expectations.commands import run_converted_resource_checkpoint - - -class PostConversionPhase: - def __init__( - self, - converted_resource_path, - output_dir, - dataset, - typology, - act_on_critical_error=False, - ): - self.converted_resource_path = converted_resource_path - self.output_dir = output_dir - self.dataset = dataset - self.typology = typology - self.act_on_critical_error = act_on_critical_error - - def process(self, stream=None): - self.run() - return stream - - def run(self): - """ - Executes the converted resource checkpoint using the provided parameters. - """ - # Run the checkpoint on the converted resource - run_converted_resource_checkpoint( - self.converted_resource_path, - self.output_dir, - self.dataset, - self.typology, - self.act_on_critical_error, - ) +from ..expectations.commands import run_converted_resource_checkpoint + + +class PostConversionPhase: + def __init__( + self, + converted_resource_path, + output_dir, + dataset, + typology, + act_on_critical_error=False, + ): + self.converted_resource_path = converted_resource_path + self.output_dir = output_dir + self.dataset = dataset + self.typology = typology + self.act_on_critical_error = act_on_critical_error + + def process(self, stream=None): + self.run() + return stream + + def run(self): + """ + Executes the converted resource checkpoint using the provided parameters. + """ + # Run the checkpoint on the converted resource + run_converted_resource_checkpoint( + self.converted_resource_path, + self.output_dir, + self.dataset, + self.typology, + self.act_on_critical_error, + ) From 2cfb7506e95f88e71850b77ecf092a4f09aa5e11 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Thu, 18 Apr 2024 15:51:23 +0100 Subject: [PATCH 53/58] Renamed dataset checkpoint test names to make them a bit clearer. 
--- tests/integration/expectations/test_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/expectations/test_checkpoint.py b/tests/integration/expectations/test_checkpoint.py index 13ab54c0..3927c896 100644 --- a/tests/integration/expectations/test_checkpoint.py +++ b/tests/integration/expectations/test_checkpoint.py @@ -63,7 +63,7 @@ def csv_path(tmp_path): return csv_file -def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): +def test_dataset_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict({"entity": [1], "name": ["test1"]}) test_old_entity_data = pd.DataFrame.from_dict({"old_entity": [100], "entity": [10]}) @@ -94,7 +94,7 @@ def test_run_checkpoint_success(tmp_path, sqlite3_with_entity_tables_path): assert len(issues) == 0 -def test_run_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): +def test_dataset_checkpoint_failure(tmp_path, sqlite3_with_entity_tables_path): # load data test_entity_data = pd.DataFrame.from_dict( { From f6c2ce09abd94ac08d573e4baeb7edd774c9dc0e Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Mon, 22 Apr 2024 13:56:24 +0100 Subject: [PATCH 54/58] WIP --- digital_land/commands.py | 6 ++---- digital_land/expectations/checkpoints/base.py | 18 ++++++++---------- .../checkpoints/converted_resource.py | 10 +++++++--- .../expectations/checkpoints/dataset.py | 4 ++-- digital_land/phase/convert.py | 3 --- digital_land/phase/post_conversion.py | 11 +++++------ 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index d7730d15..2c2a1059 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -160,13 +160,11 @@ def pipeline_run( run_pipeline( ConvertPhase( path=input_path, - dataset_resource_log=DatasetResourceLog(), + dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, - output_path=output_path, ), PostConversionPhase( - converted_resource_path=input_path, - output_dir=os.path.dirname(output_path), + output_dir=os.path.join("exxpectations", "post-conversion"), dataset=dataset, typology=specification.get_dataset_typology(dataset), ), diff --git a/digital_land/expectations/checkpoints/base.py b/digital_land/expectations/checkpoints/base.py index e0bbaabd..5553ffd9 100644 --- a/digital_land/expectations/checkpoints/base.py +++ b/digital_land/expectations/checkpoints/base.py @@ -16,7 +16,7 @@ def __init__(self, checkpoint, data_path): self.checkpoint = checkpoint self.data_path = data_path self.data_name = Path(data_path).stem - self.responses = [] + self.results = [] self.issues = [] # each issue is going to have different fields, so define here what all of them are # this will take some iterations to get right @@ -112,28 +112,26 @@ def run(self): self.failed_expectation_with_error_severity = 0 for expectation in self.expectations: - response = self.run_expectation(expectation) - self.responses.append(response) - self.issues.extend(response.issues) - self.failed_expectation_with_error_severity += response.act_on_failure() + result = self.run_expectation(expectation) + self.results.append(result) + self.issues.extend(result.issues) + self.failed_expectation_with_error_severity += result.act_on_failure() if self.failed_expectation_with_error_severity > 0: raise DataQualityException( "One or more expectations with severity RaiseError failed, see results for more details" ) - def save_responses(self, responses, file_path, 
format="csv"): + def save_results(self, results, file_path, format="csv"): os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "w") as f: if format == "csv": dictwriter = DictWriter(f, fieldnames=self.result_fieldnames) dictwriter.writeheader() - dictwriter.writerows( - [response.dict_for_export() for response in responses] - ) + dictwriter.writerows([result.dict_for_export() for result in results]) elif format == "json": - json.dump([response.to_dict() for response in responses], f) + json.dump([result.to_dict() for result in results], f) else: raise ValueError(f"format must be csv or json and cannot be {format}") diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index 14be3c21..b214912e 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -56,14 +56,18 @@ def load(self): def save(self, output_dir, format="csv"): responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-responses.csv" + output_dir, self.checkpoint, f"{self.dataset}-results.csv" ) issues_file_path = os.path.join( output_dir, self.checkpoint, f"{self.dataset}-issues.csv" ) - self.save_responses( - self.responses, + import pdb + + pdb.set_trace() + + self.save_results( + self.results, responses_file_path, format=format, ) diff --git a/digital_land/expectations/checkpoints/dataset.py b/digital_land/expectations/checkpoints/dataset.py index 2ff0d9c3..e7504cc3 100644 --- a/digital_land/expectations/checkpoints/dataset.py +++ b/digital_land/expectations/checkpoints/dataset.py @@ -71,13 +71,13 @@ def load(self): def save(self, output_dir, format="csv"): responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.data_name}-responses.csv" + output_dir, self.checkpoint, f"{self.data_name}-results.csv" ) issues_file_path = os.path.join( output_dir, self.checkpoint, f"{self.data_name}-issues.csv" ) - self.save_responses( + self.save_results( self.responses, responses_file_path, format=format, diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 9cd99f45..7a372125 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -125,9 +125,6 @@ def __init__( self.path = path self.log = dataset_resource_log self.charset = "" - self.converted_resource_path = ( - None # This will hold the path to the converted file - ) # Allows for custom temporary directory to be specified # This allows symlink creation in case of /tmp & path being on different partitions if custom_temp_dir: diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index f0a072e1..41dee0d1 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,32 +1,31 @@ +from digital_land.phase.phase import Phase from ..expectations.commands import run_converted_resource_checkpoint -class PostConversionPhase: +class PostConversionPhase(Phase): def __init__( self, - converted_resource_path, output_dir, dataset, typology, act_on_critical_error=False, ): - self.converted_resource_path = converted_resource_path self.output_dir = output_dir self.dataset = dataset self.typology = typology self.act_on_critical_error = act_on_critical_error def process(self, stream=None): - self.run() + self.run(stream.f.name) return stream - def run(self): + def run(self, converted_resource_path): """ Executes the converted resource checkpoint using 
the provided parameters. """ # Run the checkpoint on the converted resource run_converted_resource_checkpoint( - self.converted_resource_path, + converted_resource_path, self.output_dir, self.dataset, self.typology, From 979e6e4a225a5072747890084935c70d0f2ce671 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Mon, 22 Apr 2024 14:46:35 +0100 Subject: [PATCH 55/58] Post-merge fixes. --- digital_land/expectations/checkpoints/converted_resource.py | 4 ---- digital_land/expectations/checkpoints/dataset.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py index b214912e..4ba6bcc9 100644 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ b/digital_land/expectations/checkpoints/converted_resource.py @@ -62,10 +62,6 @@ def save(self, output_dir, format="csv"): output_dir, self.checkpoint, f"{self.dataset}-issues.csv" ) - import pdb - - pdb.set_trace() - self.save_results( self.results, responses_file_path, diff --git a/digital_land/expectations/checkpoints/dataset.py b/digital_land/expectations/checkpoints/dataset.py index e7504cc3..6c42a3cf 100644 --- a/digital_land/expectations/checkpoints/dataset.py +++ b/digital_land/expectations/checkpoints/dataset.py @@ -78,7 +78,7 @@ def save(self, output_dir, format="csv"): ) self.save_results( - self.responses, + self.results, responses_file_path, format=format, ) From 8284c8ad3940bc8cbfa7c2de82f1297d3c1d57e7 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 13:51:26 +0100 Subject: [PATCH 56/58] Updated PostConversionPhase to output to issues instead. --- digital_land/commands.py | 4 +- digital_land/phase/post_conversion.py | 65 +++++++++++++++++---------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 2c2a1059..5477a085 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -164,9 +164,7 @@ def pipeline_run( custom_temp_dir=custom_temp_dir, ), PostConversionPhase( - output_dir=os.path.join("exxpectations", "post-conversion"), - dataset=dataset, - typology=specification.get_dataset_typology(dataset), + issues=issue_log, ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index 41dee0d1..aa85707b 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,33 +1,52 @@ from digital_land.phase.phase import Phase -from ..expectations.commands import run_converted_resource_checkpoint +import csv class PostConversionPhase(Phase): def __init__( self, - output_dir, - dataset, - typology, - act_on_critical_error=False, + issues, ): - self.output_dir = output_dir - self.dataset = dataset - self.typology = typology - self.act_on_critical_error = act_on_critical_error + self.issues = issues - def process(self, stream=None): - self.run(stream.f.name) + def process(self, stream): + self.validate_references(stream.f.name) + self.check_for_duplicate_references(stream.f.name) return stream - def run(self, converted_resource_path): - """ - Executes the converted resource checkpoint using the provided parameters. 
- """ - # Run the checkpoint on the converted resource - run_converted_resource_checkpoint( - converted_resource_path, - self.output_dir, - self.dataset, - self.typology, - self.act_on_critical_error, - ) + def check_for_duplicate_references(self, csv_path): + duplicates = {} + with open(csv_path, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if ( + ref + ): # Don't check None or empty references, as these will be picked up by validate_references + if ref in duplicates: + duplicates[ref].append(row_number) + else: + duplicates[ref] = [row_number] + + for ref, rows in duplicates.items(): + if len(rows) > 1: + self.issues.log_issue( + "reference", + "duplicate-reference", + ref, + f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + ) + + def validate_references(self, csv_path): + with open(csv_path, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row_number, row in enumerate(reader, start=1): + ref = row.get("reference") + if not ref: # This will be True for both None and empty strings + self.issues.log_issue( + "reference", + "missing-reference", + ref, + f"Reference missing on row {row_number}", + row_number + 1, + ) From 2a19aaea55ae156f114abc32c868f0c08f4f1ac4 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 13:59:36 +0100 Subject: [PATCH 57/58] Removed converted resource expectation. --- digital_land/cli.py | 20 ------ .../checkpoints/converted_resource.py | 71 ------------------- digital_land/expectations/commands.py | 19 ----- digital_land/phase/convert.py | 2 - 4 files changed, 112 deletions(-) delete mode 100644 digital_land/expectations/checkpoints/converted_resource.py diff --git a/digital_land/cli.py b/digital_land/cli.py index 9a30f2df..9e47e9e5 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -238,26 +238,6 @@ def expectations_run_dataset_checkpoint(data_path, output_dir, specification_dir run_dataset_checkpoint(data_path, output_dir, dataset, typology) -@cli.command( - "expectations-converted-resource-checkpoint", - short_help="runs data quality expectations against a converted resource", -) -@click.option( - "--data-path", help="path to the converted resource to use", required=True -) -@click.option("--output-dir", help="path/name to sqlite3 dataset", required=True) -@click.option("--specification-dir", help="checkpoint to run", required=True) -@click.option("--dataset", help="checkpoint to run", required=True) -def expectations_run_converted_resource_checkpoint( - data_path, output_dir, specification_dir, dataset -): - from digital_land.expectations.commands import run_converted_resource_checkpoint - - spec = Specification(specification_dir) - typology = spec.get_dataset_typology(dataset) - run_converted_resource_checkpoint(data_path, output_dir, dataset, typology) - - # edit to add collection_name in @cli.command("add-endpoints-and-lookups") @click.argument("csv-path", nargs=1, type=click.Path()) diff --git a/digital_land/expectations/checkpoints/converted_resource.py b/digital_land/expectations/checkpoints/converted_resource.py deleted file mode 100644 index 4ba6bcc9..00000000 --- a/digital_land/expectations/checkpoints/converted_resource.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path -from .base import BaseCheckpoint -from ..utils import QueryRunner -import os -from ..expectation_functions.resource_validations import ( - check_for_duplicate_references, - validate_references, -) - -# 
Define BASE expectations which should always run -BASE = [ - { - "function": check_for_duplicate_references, - "name": "Check for Duplicate References", - "severity": "error", - "responsibility": "system", - "csv_path": None, - }, - { - "function": validate_references, - "name": "Validate References", - "severity": "error", - "responsibility": "system", - "csv_path": None, - }, -] - -# Empty TYPOLOGY and DATASET for now as per advice -TYPOLOGY = {} -DATASET = {} - - -class ConvertedResourceCheckpoint(BaseCheckpoint): - def __init__(self, dataset_path, typology, dataset=None): - super().__init__("converted_resource", dataset_path) - self.csv_path = Path(dataset_path) - self.dataset = dataset if dataset else self.csv_path.stem - self.typology = typology - - def load(self): - self.expectations = [] - self.expectations.extend(BASE) - typology_expectations = TYPOLOGY.get(self.typology, []) - dataset_expectations = DATASET.get(self.dataset, []) - - # Extend the expectations list with relevant typology and dataset-specific expectations - if typology_expectations: - self.expectations.extend(typology_expectations) - if dataset_expectations: - self.expectations.extend(dataset_expectations) - - # Assign a QueryRunner instance to each expectation - for expectation in self.expectations: - expectation["csv_path"] = self.csv_path - expectation["query_runner"] = QueryRunner(self.csv_path) - - def save(self, output_dir, format="csv"): - responses_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-results.csv" - ) - issues_file_path = os.path.join( - output_dir, self.checkpoint, f"{self.dataset}-issues.csv" - ) - - self.save_results( - self.results, - responses_file_path, - format=format, - ) - - self.save_issues(self.issues, issues_file_path, format=format) diff --git a/digital_land/expectations/commands.py b/digital_land/expectations/commands.py index 7b7f7922..d75cf729 100644 --- a/digital_land/expectations/commands.py +++ b/digital_land/expectations/commands.py @@ -1,5 +1,4 @@ from .checkpoints.dataset import DatasetCheckpoint -from .checkpoints.converted_resource import ConvertedResourceCheckpoint def run_dataset_checkpoint( @@ -18,21 +17,3 @@ def run_dataset_checkpoint( checkpoint.save(output_dir, format="csv") if act_on_critical_error: checkpoint.act_on_critical_error() - - -def run_converted_resource_checkpoint( - converted_resource_path, - output_dir, - dataset, - typology, - act_on_critical_error=False, -): - """ - Function to run the expectation checkpoint for a converted resource - """ - checkpoint = ConvertedResourceCheckpoint(converted_resource_path, dataset, typology) - checkpoint.load() - checkpoint.run() - checkpoint.save(output_dir, format="csv") - if act_on_critical_error: - checkpoint.act_on_critical_error() diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index 7a372125..b57c22c1 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -155,8 +155,6 @@ def process(self, stream=None): # raise StopIteration() reader = iter(()) - if self.output_path: - self.converted_resource_path = self.output_path return Stream(input_path, f=reader, log=self.log) From 77bbff5abb397330579140df2e2e853f4068f901 Mon Sep 17 00:00:00 2001 From: Christopher Johns Date: Tue, 23 Apr 2024 16:43:02 +0100 Subject: [PATCH 58/58] WIP: Run the ckecks on the pipeline data. 
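
The duplicate-reference and missing-reference checks now run against the blocks
flowing through the pipeline rather than re-reading the converted CSV, so the
phase only needs an issue log. A rough usage sketch follows (illustrative only,
not part of this change, and it assumes this patch is applied): StubIssues is a
hypothetical stand-in for the real IssueLog, whose interface is wider than the
two call shapes used by this phase.

    from digital_land.phase.post_conversion import PostConversionPhase

    class StubIssues:
        # Minimal stand-in for the pipeline's IssueLog: just records calls.
        def __init__(self):
            self.logged = []

        def log_issue(self, *args):
            self.logged.append(args)

    blocks = [
        {"row": {"reference": "REF-001"}, "line-number": 2},
        {"row": {"reference": "REF-002"}, "line-number": 3},
        {"row": {"reference": "REF-001"}, "line-number": 4},  # duplicate of line 2
    ]

    issues = StubIssues()
    phase = PostConversionPhase(issues=issues)
    out = list(phase.process(iter(blocks)))  # drain the generator so the duplicate summary runs

    assert len(out) == 3                      # blocks pass through unchanged
    assert issues.logged[0][1] == "duplicate-reference"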
--- digital_land/commands.py | 6 +-- digital_land/phase/post_conversion.py | 67 +++++++++++++-------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 5477a085..e0aaee14 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -163,9 +163,6 @@ def pipeline_run( dataset_resource_log=dataset_resource_log, custom_temp_dir=custom_temp_dir, ), - PostConversionPhase( - issues=issue_log, - ), NormalisePhase(skip_patterns=skip_patterns, null_path=null_path), ParsePhase(), ConcatFieldPhase(concats=concats, log=column_field_log), @@ -198,6 +195,9 @@ def pipeline_run( ), OrganisationPhase(organisation=organisation, issues=issue_log), FieldPrunePhase(fields=specification.current_fieldnames(schema)), + PostConversionPhase( # Now badly named... + issues=issue_log, + ), EntityReferencePhase( dataset=dataset, prefix=specification.dataset_prefix(dataset), diff --git a/digital_land/phase/post_conversion.py b/digital_land/phase/post_conversion.py index aa85707b..5198d112 100644 --- a/digital_land/phase/post_conversion.py +++ b/digital_land/phase/post_conversion.py @@ -1,5 +1,4 @@ from digital_land.phase.phase import Phase -import csv class PostConversionPhase(Phase): @@ -8,45 +7,43 @@ def __init__( issues, ): self.issues = issues + self.duplicates = {} def process(self, stream): - self.validate_references(stream.f.name) - self.check_for_duplicate_references(stream.f.name) - return stream - - def check_for_duplicate_references(self, csv_path): - duplicates = {} - with open(csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if ( - ref - ): # Don't check None or empty references, as these will be picked up by validate_references - if ref in duplicates: - duplicates[ref].append(row_number) - else: - duplicates[ref] = [row_number] - - for ref, rows in duplicates.items(): - if len(rows) > 1: + for block in stream: + row = block.get("row", None) + if not row: + return + + reference = row.get("reference", None) + line_number = block.get("line-number", None) + + if reference and line_number: + self.validate_references(reference, line_number) + self.check_for_duplicate_references(reference, line_number) + yield block + + for ref, lines in self.duplicates.items(): + if len(lines) > 1: self.issues.log_issue( "reference", "duplicate-reference", ref, - f"Duplicate reference '{ref}' found on rows: {', '.join(map(str, rows))}", + f"Duplicate reference '{ref}' found on lines: {', '.join(map(str, lines))}", ) - def validate_references(self, csv_path): - with open(csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row_number, row in enumerate(reader, start=1): - ref = row.get("reference") - if not ref: # This will be True for both None and empty strings - self.issues.log_issue( - "reference", - "missing-reference", - ref, - f"Reference missing on row {row_number}", - row_number + 1, - ) + def validate_references(self, reference, line_number): + if not reference: # This will be True for both None and empty strings + self.issues.log_issue( + "reference", + "missing-reference", + "", + "", + line_number, + ) + + def check_for_duplicate_references(self, reference, line_number): + if reference in self.duplicates: + self.duplicates[reference].append(line_number) + else: + self.duplicates[reference] = [line_number]
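
The per-reference bookkeeping in check_for_duplicate_references above is the
same accumulate-then-filter pattern that a defaultdict gives for free; a small
self-contained sketch of the idea (the function name and sample data here are
illustrative only, not part of the patch):

    from collections import defaultdict

    def find_duplicate_lines(rows):
        # Collect the line numbers seen for each non-blank reference,
        # then keep only references that appear on more than one line.
        seen = defaultdict(list)
        for line_number, reference in rows:
            if reference:
                seen[reference].append(line_number)
        return {ref: lines for ref, lines in seen.items() if len(lines) > 1}

    print(find_duplicate_lines([(2, "REF-001"), (3, "REF-002"), (4, "REF-001")]))
    # {'REF-001': [2, 4]}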