cytomining · d33bs · Jan 11, 2024 · Jan 10, 2024 · Jan 10, 2024 · Jan 10, 2024
diff --git a/.gitignore b/.gitignore
@@ -135,3 +135,6 @@ dmypy.json
 
 # parsl ignores
 runinfo
+
+# test data ignores
+tests/data/in-carta/colas-lab/data
diff --git a/CITATION.cff b/CITATION.cff
@@ -178,3 +178,16 @@ references:
     notes: >-
       MapReduce techniques are used via Parsl apps and workflow configuration
       to help achieve scalable data engineering for CytoTable.
+  - authors:
+      - name: "Colas Lab"
+    date-accessed: "2024-01-09"
+    title: Colas Lab Example IN Carta Dataset
+    type: data
+    notes: >-
+      Colas Lab provided access to a dataset created from IN Carta for
+      use within CytoTable tests for furthering development efforts.
+      A modified testing dataset appears within this project
+      under `tests/data/in-carta/colas-lab`.
+      See:
+      - https://sbpdiscovery.org/our-scientists/alexandre-colas-phd
+      - https://www.moleculardevices.com/products/cellular-imaging-systems/acquisition-and-analysis-software/in-carta-image-analysis-software
diff --git a/tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py b/tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py
@@ -0,0 +1,60 @@
+"""
+Shrink datasets from Colas Lab from IN Carta provided as a collection of CSVs.
+
+Note: built to be run from CytoTable poetry dev environment from project base, e.g.:
+`poetry run python tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py`
+"""
+
+import pathlib
+
+import duckdb
+from pyarrow import csv
+
+# set a path for local and target data dir
+SOURCE_DATA_DIR = "tests/data/in-carta/colas-lab/data"
+TARGET_DATA_DIR = "tests/data/in-carta/colas-lab"
+
+# gather the source csv files once (sorted for a deterministic order) so the
+# schema check and the export below operate on exactly the same files
+data_files = sorted(pathlib.Path(SOURCE_DATA_DIR).rglob("*.csv"))
+if not data_files:
+    raise FileNotFoundError(f"No csv files found under {SOURCE_DATA_DIR}")
+
+# reuse a single in-memory duckdb connection for every query
+with duckdb.connect() as ddb:
+    # read each csv file as a pyarrow table and keep its detected schema
+    schemas = {
+        data_file: ddb.execute(
+            f"""
+            SELECT *
+            FROM read_csv_auto('{data_file}')
+            """
+        )
+        .arrow()
+        .schema
+        for data_file in data_files
+    }
+
+    # equality is transitive: comparing every schema to the first is enough
+    reference_file = data_files[0]
+    for data_file in data_files[1:]:
+        if not schemas[data_file].equals(schemas[reference_file]):
+            raise TypeError(
+                f"Inequal schema detected: {data_file} vs {reference_file}"
+            )
+
+    for data_file in data_files:
+        # read the csv file as a pyarrow table and output to a new csv
+        csv.write_csv(
+            data=ddb.execute(
+                f"""
+                SELECT *
+                FROM read_csv_auto('{data_file}') as data_file
+                /* select only the first three objects to limit the dataset */
+                WHERE data_file."OBJECT ID" in (1,2,3)
+                /* select rows C and D to limit the dataset */
+                AND data_file."ROW" in ('C', 'D')
+                """
+            ).arrow(),
+            output_file=f"{TARGET_DATA_DIR}/test-{data_file.name}",
+        )
diff --git a/...olas-lab/test-Rep 4 iSMR Day0 Phall Mito Dapi_2023-Jun-06-14-14-54_Single_Target_Data.csv b/...olas-lab/test-Rep 4 iSMR Day0 Phall Mito Dapi_2023-Jun-06-14-14-54_Single_Target_Data.csv
diff --git a/...olas-lab/test-Rep 4 iSMR Day1 Phall Mito Dapi_2023-Jun-06-14-54-57_Single_Target_Data.csv b/...olas-lab/test-Rep 4 iSMR Day1 Phall Mito Dapi_2023-Jun-06-14-54-57_Single_Target_Data.csv
diff --git a/...olas-lab/test-Rep 4 iSMR Day3 Phall Mito Dapi_2023-Jun-06-15-26-09_Single_Target_Data.csv b/...olas-lab/test-Rep 4 iSMR Day3 Phall Mito Dapi_2023-Jun-06-15-26-09_Single_Target_Data.csv
diff --git a/...olas-lab/test-rep 4 iSMR Day5 Phall Mito Dapi_2023-Jun-07-10-52-08_Single_Target_Data.csv b/...olas-lab/test-rep 4 iSMR Day5 Phall Mito Dapi_2023-Jun-07-10-52-08_Single_Target_Data.csv