From 5f9145324a7e0a3514dab839b0b36f95d9f881ea Mon Sep 17 00:00:00 2001 From: dannda Date: Fri, 10 Nov 2023 12:59:52 +0000 Subject: [PATCH 01/46] add write_spatialdata script --- bin/process_h5ad.py | 74 ++++++++++++++++++------------ bin/process_spaceranger.py | 21 ++------- bin/process_xenium.py | 4 +- bin/write_spatialdata.py | 92 ++++++++++++++++++++++++++++++++++++++ envs/requirements.txt | 1 + main.nf | 2 + 6 files changed, 145 insertions(+), 49 deletions(-) create mode 100644 bin/write_spatialdata.py diff --git a/bin/process_h5ad.py b/bin/process_h5ad.py index aa2b26a..5a7ddc4 100755 --- a/bin/process_h5ad.py +++ b/bin/process_h5ad.py @@ -119,6 +119,47 @@ def h5ad_to_zarr( return zarr_file +def reindex_anndata_obs(adata: ad.AnnData) -> ad.AnnData: + # check if index is numerical, if not reindex + if not adata.obs.index.is_integer() and not ( + adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric()) + ): + IDX_NAME = "label_id" + adata.obs = adata.obs.reset_index(names=IDX_NAME) + adata.obs.index = ( + pd.Categorical(adata.obs[IDX_NAME]).codes + 1 + ) # avoid 0's for possible label images + adata.obs.index = adata.obs.index.astype(str) + + return adata + + +def subset_anndata( + adata: ad.AnnData, + obs_subset: tuple[str, T.Any] = None, + var_subset: tuple[str, T.Any] = None, +) -> ad.AnnData: + # Subset adata by obs + if obs_subset: + obs_subset[1] = ( + [obs_subset[1]] + if not isinstance(obs_subset[1], (list, tuple)) + else obs_subset[1] + ) + adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])] + + # Subset adata by var + if var_subset: + var_subset[1] = ( + [var_subset[1]] + if not isinstance(var_subset[1], (list, tuple)) + else var_subset[1] + ) + adata = adata[:, adata.var[var_subset[0]].isin(var_subset[1])] + + return adata + + def preprocess_anndata( adata: ad.AnnData, compute_embeddings: bool = False, @@ -140,23 +181,7 @@ def preprocess_anndata( to use to subset the AnnData object. Defaults to None. 
""" - # Subset adata by obs - if obs_subset: - obs_subset[1] = ( - [obs_subset[1]] - if not isinstance(obs_subset[1], (list, tuple)) - else obs_subset[1] - ) - adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])] - - # Subset adata by var - if var_subset: - var_subset[1] = ( - [var_subset[1]] - if not isinstance(var_subset[1], (list, tuple)) - else var_subset[1] - ) - adata = adata[:, adata.var[var_subset[0]].isin(var_subset[1])] + adata = subset_anndata(adata, obs_subset=obs_subset, var_subset=var_subset) # reindex var with a specified column if var_index and var_index in adata.var: @@ -165,14 +190,7 @@ def preprocess_anndata( adata.var.index = adata.var.index.astype(str) adata.var_names_make_unique() - # check if index is numerical, if not reindex - if not adata.obs.index.is_integer() and not ( - adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric()) - ): - adata.obs["label_id"] = adata.obs.index - adata.obs.index = pd.Categorical(adata.obs.index) - adata.obs.index = adata.obs.index.codes - adata.obs.index = adata.obs.index.astype(str) + adata = reindex_anndata_obs(adata) # turn obsm into a numpy array for k in adata.obsm_keys(): @@ -186,16 +204,14 @@ def preprocess_anndata( sc.pp.neighbors(adata) sc.tl.umap(adata) + # ensure data types for obs for col in adata.obs: - # anndata >= 0.8.0 - # if data type is categorical vitessce will throw "path obs/X contains a group" and won"t find .zarray - # if adata.obs[col].dtype == "category": - # adata.obs[col] = adata.obs[col].cat.codes if adata.obs[col].dtype in ["int8", "int64"]: adata.obs[col] = adata.obs[col].astype("int32") if adata.obs[col].dtype == "bool": adata.obs[col] = adata.obs[col].astype(str).astype("category") + # ensure data types for obsm for col in adata.obsm: if type(adata.obsm[col]).__name__ in ["DataFrame", "Series"]: adata.obsm[col] = adata.obsm[col].to_numpy() diff --git a/bin/process_spaceranger.py b/bin/process_spaceranger.py index afae27e..2476f52 100755 --- a/bin/process_spaceranger.py +++ b/bin/process_spaceranger.py @@ -16,7 +16,7 @@ import tifffile as tf from pathlib import Path from skimage.draw import disk -from process_h5ad import h5ad_to_zarr +from process_h5ad import h5ad_to_zarr, reindex_anndata_obs, subset_anndata def spaceranger_to_anndata( @@ -164,23 +164,9 @@ def visium_label( scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"] shape = [int(hires_shape[0] / scalef), int(hires_shape[1] / scalef)] - # Subset adata by obs - if obs_subset: - obs_subset[1] = ( - [obs_subset[1]] - if not isinstance(obs_subset[1], (list, tuple)) - else obs_subset[1] - ) - adata = adata[adata.obs[obs_subset[0]].isin(obs_subset[1])] + adata = subset_anndata(adata, obs_subset=obs_subset) - # check if index is numerical, if not reindex - if not adata.obs.index.is_integer() and not ( - adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric()) - ): - adata.obs["label_id"] = adata.obs.index - adata.obs.index = pd.Categorical(adata.obs.index) - adata.obs.index = adata.obs.index.codes - adata.obs.index = adata.obs.index.astype(str) + adata = reindex_anndata_obs(adata) # turn obsm into a numpy array for k in adata.obsm_keys(): @@ -189,7 +175,6 @@ def visium_label( spot_diameter_fullres = adata.uns["spatial"][sample_id]["scalefactors"][ "spot_diameter_fullres" ] - # hires_scalef = adata.uns["spatial"][sample_id]["scalefactors"]["tissue_hires_scalef"] spot_coords = adata.obsm["spatial"] assert adata.obs.shape[0] == spot_coords.shape[0] diff --git a/bin/process_xenium.py 
b/bin/process_xenium.py index 7aacd06..d96e395 100755 --- a/bin/process_xenium.py +++ b/bin/process_xenium.py @@ -97,7 +97,7 @@ def xenium_to_anndata( # pd.Categorical.codes converts them to int this is done manually at this step # instead of reindex_anndata so we control what matches the label image adata.obs = adata.obs.reset_index() - adata.obs.index = pd.Categorical(adata.obs["cell_id"]).codes.astype(str) + adata.obs.index = (pd.Categorical(adata.obs["cell_id"]).codes + 1).astype(str) return adata @@ -163,7 +163,7 @@ def xenium_label( # starting on v1.3 cell_id looks like "aaabinlp-1" # pd.Categorical.codes converts them to int # this is required so the label image matches the h5ad ids - ids = pd.Categorical(ids).codes + ids = pd.Categorical(ids).codes + 1 pols = z["polygon_vertices"][1] diff --git a/bin/write_spatialdata.py b/bin/write_spatialdata.py new file mode 100644 index 0000000..f214d68 --- /dev/null +++ b/bin/write_spatialdata.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +""" +write_spatialdata.py +==================================== +Processes H5AD and images into SpatialData +""" + +from __future__ import annotations +import typing as T +import os +import logging +import warnings +import fire +import zarr +import anndata as ad +import xarray as xr +import spatialdata as sd + +warnings.filterwarnings("ignore") +logging.getLogger().setLevel(logging.INFO) + + +def read_image(path: str, is_label: bool = False): + imarray = xr.open_dataarray(path, engine="rasterio", mask_and_scale=False) + if is_label: + imarray = imarray.squeeze() + return sd.models.Labels2DModel.parse(imarray) + else: + imarray = imarray.rename({"band": "c"}) + return sd.models.Image2DModel.parse(imarray) + + +def write_spatialdata( + adata_path: str, + stem: str = "", + raw_img_path: str = None, + label_img_path: str = None, +) -> str: + """This function takes an AnnData object and image files to + write to a SpatialData objetc + + Args: + path (str): Path to the h5ad file. + stem (str, optional): Prefix for the output file. Defaults to "". + raw_img_path (str, optional): Raw image to process. Defaults to None. + label_img_path (str, optional): Label image to process. Defaults to None. 
+ + Returns: + str: Output SpatialData filename + """ + if adata_path.endswith(".h5ad"): + adata = ad.read(adata_path, backed=True) + elif adata_path.endswith(".zarr"): + adata = ad.read_zarr(adata_path) + else: + raise SystemError("Path to AnnData not .h5ad nor .zarr") + + OBS_IDX_NAME = "webatlas_index" + adata.obs.reset_index(names=OBS_IDX_NAME).set_index( + OBS_IDX_NAME, drop=False, inplace=True + ) # have index as both index and column + adata.obs[OBS_IDX_NAME] = adata.obs[OBS_IDX_NAME].astype(int) + + # ensure library_id in obs + if "library_id" not in adata.obs: + adata.obs["library_id"] = 0 + adata.obs["library_id"] = adata.obs["library_id"].astype("category") + + region_key = "library_id" + region = adata.obs["library_id"].cat.categories.to_list() + + # `region_key`, `region` and `instance_key` are supposed to be optional + # but enforced by code + sd.models.TableModel.parse( + adata, region_key=region_key, region=region, instance_key=OBS_IDX_NAME + ) + + sdata = sd.SpatialData(table=adata) + + if raw_img_path: + sdata.add_image("raw", read_image(raw_img_path)) + if label_img_path: + sdata.add_labels("label", read_image(label_img_path, is_label=True)) + + zarr_file = f"{stem}-spatialdata.zarr" + sdata.write(zarr_file) + + return zarr_file + + +if __name__ == "__main__": + fire.Fire(write_spatialdata) diff --git a/envs/requirements.txt b/envs/requirements.txt index 1057634..46e95ab 100644 --- a/envs/requirements.txt +++ b/envs/requirements.txt @@ -74,6 +74,7 @@ session-info==1.0.0 setuptools==68.2.2 six==1.16.0 sortedcontainers==2.4.0 +spatialdata==0.0.15 Sphinx==7.2.6 sphinx-rtd-theme==1.3.0 statsmodels==0.14.0 diff --git a/main.nf b/main.nf index 1dc9bbf..8c3531a 100644 --- a/main.nf +++ b/main.nf @@ -15,6 +15,7 @@ params.max_n_worker = 30 params.outdir = "" params.args = [:] params.projects = [] +params.write_spatialdata = false params.vitessce_options = [:] params.layout = "minimal" @@ -173,6 +174,7 @@ process route_file { output: tuple val(stem), stdout, emit: out_file_paths + tuple val(stem), path("${stem_str}-anndata.zarr"), emit: converted_anndata, optional: true tuple val(stem), path("${stem_str}*"), emit: converted_files, optional: true tuple val(stem), path("tmp-${stem_str}*"), emit: extra_files, optional: true From 2145d2cbf0c78fd49331cb2cc8fbe7253034d15d Mon Sep 17 00:00:00 2001 From: Daniela Basurto Lozada Date: Fri, 10 Nov 2023 23:01:17 +0000 Subject: [PATCH 02/46] add process to write to spatialdata --- bin/write_spatialdata.py | 33 +++++++++-------- envs/dev/requirements.txt | 3 +- envs/environment.yaml | 5 ++- envs/requirements.txt | 7 ++-- main.nf | 74 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 98 insertions(+), 24 deletions(-) mode change 100644 => 100755 bin/write_spatialdata.py diff --git a/bin/write_spatialdata.py b/bin/write_spatialdata.py old mode 100644 new mode 100755 index f214d68..565e7c6 --- a/bin/write_spatialdata.py +++ b/bin/write_spatialdata.py @@ -6,12 +6,10 @@ """ from __future__ import annotations -import typing as T -import os +from typing import Union import logging import warnings import fire -import zarr import anndata as ad import xarray as xr import spatialdata as sd @@ -31,10 +29,10 @@ def read_image(path: str, is_label: bool = False): def write_spatialdata( - adata_path: str, + anndata_path: str, stem: str = "", - raw_img_path: str = None, - label_img_path: str = None, + raw_img_path: Union[str, list[str]] = [], + label_img_path: Union[str, list[str]] = [], ) -> str: """This function takes an AnnData object and image files 
to write to a SpatialData objetc @@ -48,16 +46,16 @@ def write_spatialdata( Returns: str: Output SpatialData filename """ - if adata_path.endswith(".h5ad"): - adata = ad.read(adata_path, backed=True) - elif adata_path.endswith(".zarr"): - adata = ad.read_zarr(adata_path) + if anndata_path.endswith(".h5ad"): + adata = ad.read(anndata_path, backed=True) + elif anndata_path.endswith(".zarr"): + adata = ad.read_zarr(anndata_path) else: raise SystemError("Path to AnnData not .h5ad nor .zarr") OBS_IDX_NAME = "webatlas_index" - adata.obs.reset_index(names=OBS_IDX_NAME).set_index( - OBS_IDX_NAME, drop=False, inplace=True + adata.obs = adata.obs.reset_index(names=OBS_IDX_NAME).set_index( + OBS_IDX_NAME, drop=False ) # have index as both index and column adata.obs[OBS_IDX_NAME] = adata.obs[OBS_IDX_NAME].astype(int) @@ -77,9 +75,14 @@ def write_spatialdata( sdata = sd.SpatialData(table=adata) - if raw_img_path: - sdata.add_image("raw", read_image(raw_img_path)) - if label_img_path: + if isinstance(raw_img_path, str): + raw_img_path = [raw_img_path] + for raw_img in raw_img_path: + sdata.add_image("raw", read_image(raw_img)) + + if isinstance(label_img_path, str): + label_img_path = [label_img_path] + for label_img in label_img_path: sdata.add_labels("label", read_image(label_img_path, is_label=True)) zarr_file = f"{stem}-spatialdata.zarr" diff --git a/envs/dev/requirements.txt b/envs/dev/requirements.txt index 3647721..b0a71bd 100644 --- a/envs/dev/requirements.txt +++ b/envs/dev/requirements.txt @@ -2,4 +2,5 @@ pytest==7.4.3 pytest-cov==4.1.0 pre-commit==3.5.0 Sphinx==7.2.6 -sphinx-rtd-theme==1.3.0 \ No newline at end of file +sphinx-rtd-theme==1.3.0 +vitessce==3.1.0 \ No newline at end of file diff --git a/envs/environment.yaml b/envs/environment.yaml index 9295618..e2a509b 100644 --- a/envs/environment.yaml +++ b/envs/environment.yaml @@ -21,6 +21,9 @@ dependencies: - scanpy==1.9.4 - ome-zarr==0.8 - python==3.10 - - anndata==0.8.0 + - anndata==0.9.1 - zarr==2.16.1 - h5py==3.10.0 + - rioxarray==0.15.0 + - pip: + - spatialdata==0.0.15 diff --git a/envs/requirements.txt b/envs/requirements.txt index 46e95ab..0b2f6c3 100644 --- a/envs/requirements.txt +++ b/envs/requirements.txt @@ -1,6 +1,6 @@ aiohttp==3.8.6 aiosignal==1.3.1 -anndata==0.8.0 +anndata==0.9.1 apeer-ometiff-library==1.9.0 asciitree==0.3.3 async-timeout==4.0.3 @@ -23,7 +23,7 @@ fasteners==0.17.3 fire==0.5.0 fonttools==4.43.1 frozenlist==1.4.0 -fsspec==2023.10.0 +fsspec==2023.6 h5py==3.10.0 idna==3.4 imagecodecs==2023.9.18 @@ -65,6 +65,7 @@ PyWavelets==1.4.1 PyYAML==6.0.1 regex==2023.8.8 requests==2.31.0 +rioxarray==0.15.0 scanpy==1.9.4 scikit-image==0.22.0 scikit-learn==1.3.2 @@ -90,7 +91,7 @@ typing_extensions==4.8.0 tzdata==2023.3 umap-learn==0.5.4 unicodedata2==15.1.0 -urllib3==2.0.7 +urllib3==1.25.4 wheel==0.41.3 xmlschema==2.5.0 yarl==1.9.2 diff --git a/main.nf b/main.nf index 8c3531a..e9662fc 100644 --- a/main.nf +++ b/main.nf @@ -15,7 +15,7 @@ params.max_n_worker = 30 params.outdir = "" params.args = [:] params.projects = [] -params.write_spatialdata = false +params.write_spatialdata = true params.vitessce_options = [:] params.layout = "minimal" @@ -167,14 +167,14 @@ process route_file { debug verbose_log cache "lenient" - publishDir outdir_with_version, mode:"copy" + publishDir outdir_with_version, mode: "copy" input: tuple val(stem), val(prefix), path(file), val(type), val(args) output: tuple val(stem), stdout, emit: out_file_paths - tuple val(stem), path("${stem_str}-anndata.zarr"), emit: converted_anndata, optional: true + 
tuple val(stem), path("${stem_str}-anndata.zarr"), emit: converted_anndatas, optional: true tuple val(stem), path("${stem_str}*"), emit: converted_files, optional: true tuple val(stem), path("tmp-${stem_str}*"), emit: extra_files, optional: true @@ -221,11 +221,34 @@ process Build_config { """ } +process write_spatialdata { + tag "${stem}" + debug verbose_log + + publishDir outdir_with_version, mode: "copy" + + input: + tuple val(stem), path(anndata_path), path(raw_img_path), path(label_img_path) + + output: + path("${stem_str}-spatialdata.zarr") + + script: + stem_str = stem.join("-") + """ + write_spatialdata.py \ + --stem ${stem_str} \ + --anndata_path ${anndata_path} \ + --raw_img_path ${raw_img_path} \ + --label_img_path ${label_img_path} + """ +} + process Generate_image { tag "${stem}, ${img_type}, ${file_path}" debug verbose_log - publishDir outdir_with_version, mode:"copy" + publishDir outdir_with_version, mode: "copy" input: tuple val(stem), val(prefix), val(img_type), path(file_path), val(file_type), path(ref_img), val(args) @@ -260,7 +283,14 @@ workflow Full_pipeline { Output_to_config( Process_files.out.file_paths, Process_images.out.img_zarrs + ) + + if (params.write_spatialdata) { + Output_to_spatialdata( + Process_files.out.anndata_files, + Process_images.out.img_tifs ) + } } @@ -294,10 +324,13 @@ workflow Process_files { file_paths = files.map { stem, it -> [ stem, it.name ] } + anndata_files = route_file.out.converted_anndatas +// .groupTuple(by:0) emit: files = files file_paths = file_paths + anndata_files = anndata_files } @@ -368,6 +401,7 @@ workflow Process_images { emit: img_zarrs = img_zarrs + img_tifs = all_tifs } @@ -408,4 +442,36 @@ workflow Output_to_config { Build_config( data_for_config ) +} + + +workflow Output_to_spatialdata { + take: anndata_files + take: img_tifs + main: + + img_tifs + .map { stem, prefix, type, img, k -> + [stem, [type: type, img: img]] + } + .branch { stem, data -> + raw: data.type == "raw" + label: data.type == "label" + } + .set{tif_files} + + anndata_files + .join(tif_files.raw, remainder: true) + .join(tif_files.label, remainder: true) + .map { stem, anndata, raw_tif, label_tif -> [ + stem, anndata, + raw_tif ? raw_tif.img : [], + label_tif ? 
label_tif.img : [] + ]} + .set{data_for_sd} + + write_spatialdata( + data_for_sd + ) + } \ No newline at end of file From fcf0606975187e2a8f2fbf38385d5158bed0bbd3 Mon Sep 17 00:00:00 2001 From: Daniela Basurto-Lozada Date: Mon, 13 Nov 2023 11:57:52 +0000 Subject: [PATCH 03/46] fix wrong path variable --- bin/write_spatialdata.py | 2 +- main.nf | 29 +++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/bin/write_spatialdata.py b/bin/write_spatialdata.py index 565e7c6..161fd38 100755 --- a/bin/write_spatialdata.py +++ b/bin/write_spatialdata.py @@ -83,7 +83,7 @@ def write_spatialdata( if isinstance(label_img_path, str): label_img_path = [label_img_path] for label_img in label_img_path: - sdata.add_labels("label", read_image(label_img_path, is_label=True)) + sdata.add_labels("label", read_image(label_img, is_label=True)) zarr_file = f"{stem}-spatialdata.zarr" sdata.write(zarr_file) diff --git a/main.nf b/main.nf index e9662fc..db09feb 100644 --- a/main.nf +++ b/main.nf @@ -15,7 +15,7 @@ params.max_n_worker = 30 params.outdir = "" params.args = [:] params.projects = [] -params.write_spatialdata = true +params.write_spatialdata = false params.vitessce_options = [:] params.layout = "minimal" @@ -235,12 +235,14 @@ process write_spatialdata { script: stem_str = stem.join("-") + raw_img_str = raw_img_path ? "--raw_img_path ${raw_img_path}" : "" + label_img_str = label_img_path ? "--label_img_path ${label_img_path}" : "" """ write_spatialdata.py \ --stem ${stem_str} \ --anndata_path ${anndata_path} \ - --raw_img_path ${raw_img_path} \ - --label_img_path ${label_img_path} + ${raw_img_str} \ + ${label_img_str} """ } @@ -325,7 +327,6 @@ workflow Process_files { [ stem, it.name ] } anndata_files = route_file.out.converted_anndatas -// .groupTuple(by:0) emit: files = files @@ -386,6 +387,7 @@ workflow Process_images { .set {label_tifs} all_tifs = img_tifs.mix(label_tifs) + all_tifs.tap{tifs} image_to_zarr(all_tifs) ome_zarr_metadata(image_to_zarr.out.ome_xml) @@ -401,13 +403,15 @@ workflow Process_images { emit: img_zarrs = img_zarrs - img_tifs = all_tifs + img_tifs = tifs } workflow Output_to_config { - take: out_file_paths - take: out_img_zarrs + take: + out_file_paths + out_img_zarrs + main: // Map workflows' outputs to: @@ -446,10 +450,11 @@ workflow Output_to_config { workflow Output_to_spatialdata { - take: anndata_files - take: img_tifs + take: + anndata_files + img_tifs + main: - img_tifs .map { stem, prefix, type, img, k -> [stem, [type: type, img: img]] @@ -459,7 +464,7 @@ workflow Output_to_spatialdata { label: data.type == "label" } .set{tif_files} - + anndata_files .join(tif_files.raw, remainder: true) .join(tif_files.label, remainder: true) @@ -469,7 +474,7 @@ workflow Output_to_spatialdata { label_tif ? label_tif.img : [] ]} .set{data_for_sd} - + write_spatialdata( data_for_sd ) From 6009b39cb032a6df03ea2aaa80a54baa3a2366c6 Mon Sep 17 00:00:00 2001 From: dannda Date: Mon, 13 Nov 2023 17:17:40 +0000 Subject: [PATCH 04/46] add write_spatialdata param in docs --- sphinx/configuration.rst | 4 ++++ sphinx/modules.rst | 3 +++ sphinx/run.rst | 1 + 3 files changed, 8 insertions(+) diff --git a/sphinx/configuration.rst b/sphinx/configuration.rst index a28159a..435adf8 100644 --- a/sphinx/configuration.rst +++ b/sphinx/configuration.rst @@ -23,6 +23,8 @@ Currently, the WebAtlas pipeline can process several types of data files as well Running the pipeline requires a YAML parameters file that lists the data to be processed. 
Templates of this parameters file are available in the `templates directory `__.
 
+Optionally, the pipeline can also write datasets (table data, raw images and label images) to `SpatialData `_.
+

.. _parameters_file:

@@ -127,6 +129,8 @@ The more detailed list of parameters is as follows:
      - value
    * - ``outdir``
      - the path to the directory to which output files will be written.
+   * - ``write_spatialdata``
+     - ``true`` or ``false``; whether each processed dataset should additionally be written to `SpatialData `_
    * - ``args``
      - a map of optional arguments per data type for the scripts that process them.
diff --git a/sphinx/modules.rst b/sphinx/modules.rst
index e2457dc..120466c 100644
--- a/sphinx/modules.rst
+++ b/sphinx/modules.rst
@@ -37,3 +37,6 @@ Modules

.. automodule:: integrate_image
   :members:
+
+.. automodule:: write_spatialdata
+   :members:
\ No newline at end of file
diff --git a/sphinx/run.rst b/sphinx/run.rst
index c854730..07b6c8d 100644
--- a/sphinx/run.rst
+++ b/sphinx/run.rst
@@ -17,6 +17,7 @@ pipeline through the command line.
- The ``Process_images`` workflow handles image files and/or label image data and their conversions.

Configurations and data are input through a :ref:`parameters yaml file `.
+If specified in the parameters file, the pipeline can additionally write datasets to `SpatialData `_.

To run the ``Full_pipeline`` use

From 5ea1817a83c67c3bd78de792449b3b970d5c0dbb Mon Sep 17 00:00:00 2001
From: Dave Horsfall 
Date: Fri, 17 Nov 2023 09:16:37 +0000
Subject: [PATCH 05/46] fixes #122

---
 sphinx/index.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sphinx/index.rst b/sphinx/index.rst
index d94a655..810b44c 100644
--- a/sphinx/index.rst
+++ b/sphinx/index.rst
@@ -2,8 +2,8 @@

.. |Tests| image:: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/tests-python.yml/badge.svg
   :target: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/tests-python.yml
-.. |Sphinx| image:: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/sphinx-build.yml/badge.svg
-   :target: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/sphinx-build.yml
+.. |Sphinx| image:: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/deploy-sphinx.yml/badge.svg
+   :target: https://github.com/haniffalab/webatlas-pipeline/actions/workflows/deploy-sphinx.yml
.. |Coverage| image:: https://codecov.io/gh/haniffalab/webatlas-pipeline/branch/main/graph/badge.svg?token=7HQVFH08WJ
   :target: https://app.codecov.io/gh/haniffalab/webatlas-pipeline
..
|DOI| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.7405818.svg From b98b95ef1299c64cb41bd667bd0404848b8362f9 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Fri, 17 Nov 2023 09:44:57 +0000 Subject: [PATCH 06/46] run examples with conda profile --- sphinx/examples/visium.rst | 3 ++- sphinx/examples/xenium.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sphinx/examples/visium.rst b/sphinx/examples/visium.rst index 1f0daac..8fb2ddf 100644 --- a/sphinx/examples/visium.rst +++ b/sphinx/examples/visium.rst @@ -78,7 +78,8 @@ example requires you to have already :ref:`setup your environment first Date: Fri, 17 Nov 2023 09:56:38 +0000 Subject: [PATCH 07/46] tweak SpatialData docs --- sphinx/configuration.rst | 7 +++++-- sphinx/run.rst | 37 +++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/sphinx/configuration.rst b/sphinx/configuration.rst index 435adf8..7b31e11 100644 --- a/sphinx/configuration.rst +++ b/sphinx/configuration.rst @@ -23,8 +23,10 @@ Currently, the WebAtlas pipeline can process several types of data files as well Running the pipeline requires a YAML parameters file that lists the data to be processed. Templates of this parameters file are available in the `templates directory `__. -Optionally, the pipeline can also write datasets (table data, raw images and label images) to `SpatialData `_. - +.. note:: + The pipeline can optionally write datasets to the + `SpatialData `_ format. Please see the ``write_spatialdata`` + flag in the :ref:`list of parameters ` for more information. .. _parameters_file: @@ -118,6 +120,7 @@ The YAML file has multiple configuration segements. This is a simple breakdown o mappings: obsm/X_umap: [0,1] +.. _parameters_table: The more detailed list of parameters is as follows: diff --git a/sphinx/run.rst b/sphinx/run.rst index cbab333..6450e44 100644 --- a/sphinx/run.rst +++ b/sphinx/run.rst @@ -16,8 +16,9 @@ pipeline through the command line. - The ``Process_files`` workflow handles data files and their conversions. - The ``Process_images`` workflow handles image files and/or label image data and their conversions. -Configurations and data are input through a :ref:`parameters yaml file `. -If specified in the parameters file, the pipeline can additionally write datasets to `SpatialData `_. +The pipeline configuration and the location of input data is specified through the +:ref:`parameters yaml file `. This file is passed to Nextflow when +running the pipeline. To run the ``Full_pipeline`` use @@ -32,49 +33,49 @@ defined. You can modify the entry point if you're interested in only getting the converted outputs. Use ``-entry Process_files`` or ``-entry Process_images`` as you need. -Running using Docker --------------------- +Running using Conda +------------------- -The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Docker containers use the ``-profile docker`` option: +The default pipeline will run on local executor without any type of environment creation. If you've already setup your conda environment you don't have to do anything else. + +However, if you are working on a compute cluster you will need to make sure the conda environment is avaiable and active in your worker nodes. To run the pipeline using a new conda environment use the ``-profile conda`` option: .. 
code-block:: shell

    nextflow run main.nf \
        -params-file /path/to/params.yaml \
        -entry Full_pipeline \
        -profile conda

-Pulling the containers when the pipline is launched may take a few minutes.
+Creating the environment when the pipeline is launched may take a few minutes.

-Running using Singularity
--------------------------
+Running using Docker
+--------------------

-The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Singularity containers use the ``-profile singularity`` option:
+The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Docker containers use the ``-profile docker`` option:

.. code-block:: shell

    nextflow run main.nf \
        -params-file /path/to/params.yaml \
        -entry Full_pipeline \
-        -profile singularity
+        -profile docker

Pulling the containers when the pipeline is launched may take a few minutes.

Running using Singularity
-------------------------

The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Singularity containers use the ``-profile singularity`` option:

.. code-block:: shell

    nextflow run main.nf \
        -params-file /path/to/params.yaml \
        -entry Full_pipeline \
-        -profile conda
+        -profile singularity

Pulling the containers when the pipeline is launched may take a few minutes.
Further reading --------------- From f6e7b66115adc23e5f21941190191fd3b57ed406 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Fri, 17 Nov 2023 09:57:14 +0000 Subject: [PATCH 08/46] yaml linting --- .../CytAssist_FFPE_Human_Breast_Cancer.yaml | 9 +++----- ...um_FFPE_Human_Breast_Cancer_Rep1_outs.yaml | 11 ++++----- templates/iss_template.yaml | 13 ++++------- templates/merscope_template.yaml | 13 ++++------- templates/multimodal-template.yaml | 13 ++++------- templates/scrnaseq_s3_template.yaml | 7 +++--- templates/scrnaseq_template.yaml | 7 +++--- templates/template.yaml | 23 +++++++------------ templates/visium_template.yaml | 13 ++++------- templates/xenium_template.yaml | 15 +++++------- 10 files changed, 47 insertions(+), 77 deletions(-) diff --git a/templates/examples/CytAssist_FFPE_Human_Breast_Cancer.yaml b/templates/examples/CytAssist_FFPE_Human_Breast_Cancer.yaml index b19259a..9afcb01 100644 --- a/templates/examples/CytAssist_FFPE_Human_Breast_Cancer.yaml +++ b/templates/examples/CytAssist_FFPE_Human_Breast_Cancer.yaml @@ -10,14 +10,11 @@ projects: - dataset: breast-cancer title: "Visium CytAssist - High resolution mapping of the breast cancer tumor microenvironment" data: - - - data_type: spaceranger + - data_type: spaceranger data_path: ./input/CytAssist_FFPE_Human_Breast_Cancer/ - - - data_type: raw_image + - data_type: raw_image data_path: ./input/CytAssist_FFPE_Human_Breast_Cancer/tissue_image.tif - - - data_type: label_image_data + - data_type: label_image_data data_path: ./input/CytAssist_FFPE_Human_Breast_Cancer/ file_type: visium ref_img: ./input/CytAssist_FFPE_Human_Breast_Cancer/tissue_image.tif diff --git a/templates/examples/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs.yaml b/templates/examples/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs.yaml index 80664f3..b59a93f 100644 --- a/templates/examples/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs.yaml +++ b/templates/examples/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs.yaml @@ -10,14 +10,11 @@ projects: - dataset: breast-cancer title: "Xenium - High resolution mapping of the breast cancer tumor microenvironment" data: - - - data_type: xenium + - data_type: xenium data_path: ./input/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/outs/ - - - data_type: raw_image + - data_type: raw_image data_path: ./input/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/outs/morphology.ome.tif - - - data_type: label_image_data + - data_type: label_image_data data_path: ./input/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/outs/ file_type: xenium ref_img: ./input/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/outs/morphology.ome.tif @@ -26,6 +23,6 @@ vitessce_options: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] + obsm/X_umap: [0, 1] matrix: "X" layout: advanced diff --git a/templates/iss_template.yaml b/templates/iss_template.yaml index 6e75f32..4db4f69 100755 --- a/templates/iss_template.yaml +++ b/templates/iss_template.yaml @@ -10,22 +10,19 @@ projects: - dataset: iss title: "Test ISS Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: /path/to/test/iss/anndata.h5ad - - - data_type: raw_image + - data_type: raw_image data_path: /path/to/test/iss/raw_image.tif - - - data_type: label_image + - data_type: label_image data_path: /path/to/iss/label_image.tif vitessce_options: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] factors: - "obs/sample" sets: diff --git a/templates/merscope_template.yaml b/templates/merscope_template.yaml index 3c134f8..d6fc065 100644 --- 
a/templates/merscope_template.yaml +++ b/templates/merscope_template.yaml @@ -10,15 +10,12 @@ projects: - dataset: merscope title: "Test MERSCOPE Dataset" data: - - - data_type: merscope + - data_type: merscope data_path: /path/to/test/merscope/ - - - data_type: raw_image_data + - data_type: raw_image_data data_path: /path/to/merscope/ file_type: merscope - - - data_type: label_image_data + - data_type: label_image_data data_path: /path/to/merscope/ file_type: merscope shape: [10000, 10000] @@ -27,8 +24,8 @@ vitessce_options: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] factors: - "obs/sample" sets: diff --git a/templates/multimodal-template.yaml b/templates/multimodal-template.yaml index 9fc3b48..c0b39e9 100644 --- a/templates/multimodal-template.yaml +++ b/templates/multimodal-template.yaml @@ -6,8 +6,7 @@ outdir: ./output/ extend_feature_name: "celltype" data: - - - dataset: iss_demo + - dataset: iss_demo obs_type: "cell" anndata: test-iss-anndata.zarr offset: 0 @@ -19,14 +18,13 @@ data: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] + obsm/X_umap: [0, 1] factors: - "obs/celltype" sets: - "obs/celltype" matrix: "X" - - - dataset: visium_demo + - dataset: visium_demo obs_type: "spot" anndata: test-visium-anndata.zarr offset: 1000000 @@ -38,8 +36,7 @@ data: spatial: xy: "obsm/spatial" matrix: "X" - - - dataset: scrnaseq_demo + - dataset: scrnaseq_demo obs_type: "cell" anndata: test-scrnaseq-anndata.zarr offset: 2000000 @@ -49,7 +46,7 @@ data: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] + obsm/X_umap: [0, 1] factors: - "obs/celltype" sets: diff --git a/templates/scrnaseq_s3_template.yaml b/templates/scrnaseq_s3_template.yaml index d557522..9a1dca5 100755 --- a/templates/scrnaseq_s3_template.yaml +++ b/templates/scrnaseq_s3_template.yaml @@ -10,14 +10,13 @@ projects: - dataset: scRNAseq title: "Test scRNAseq Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: s3://bucket/path/to/anndata.h5ad vitessce_options: mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] sets: - "obs/tissue" - name: "obs/celltype" diff --git a/templates/scrnaseq_template.yaml b/templates/scrnaseq_template.yaml index c27dc2e..e653950 100755 --- a/templates/scrnaseq_template.yaml +++ b/templates/scrnaseq_template.yaml @@ -10,14 +10,13 @@ projects: - dataset: scRNAseq title: "Test scRNAseq Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: /path/to/test/scrnaseq/anndata.h5ad vitessce_options: mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] sets: - "obs/tissue" - name: "obs/celltype" diff --git a/templates/template.yaml b/templates/template.yaml index 1c7d72d..eaa7c0d 100644 --- a/templates/template.yaml +++ b/templates/template.yaml @@ -10,22 +10,18 @@ projects: - dataset: scRNAseq title: "Project 1 scRNAseq Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: /path/to/project_1/scrnaseq/anndata.h5ad - project: project_2 datasets: - dataset: visium title: "Project 2 Visium Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: /path/to/project_2/visium/anndata.h5ad - - - data_type: raw_image + - data_type: raw_image data_path: /path/to/project_2/visium/raw_image.tif - - - data_type: label_image_data + - data_type: label_image_data data_path: /path/to/project_2/visium/anndata.h5ad file_type: visium ref_img: /path/to/project_2/visium/raw_image.tif @@ -34,19 +30,16 @@ 
projects: - dataset: iss title: "Project 3 ISS Dataset" data: - - - data_type: h5ad + - data_type: h5ad data_path: /path/to/project_3/iss/anndata.h5ad - - - data_type: raw_image + - data_type: raw_image data_path: /path/to/project_3/iss/raw_image.tif - - - data_type: label_image + - data_type: label_image data_path: /path/to/project_3/iss/label_image.tif vitessce_options: mappings: - obsm/X_umap: [0,1] + obsm/X_umap: [0, 1] factors: - "obs/sample" sets: diff --git a/templates/visium_template.yaml b/templates/visium_template.yaml index 7648da6..54bcb1b 100755 --- a/templates/visium_template.yaml +++ b/templates/visium_template.yaml @@ -12,14 +12,11 @@ projects: - dataset: visium title: "Test Visium Dataset" data: - - - data_type: spaceranger + - data_type: spaceranger data_path: /path/to/test/visium/slide_1/ - - - data_type: raw_image + - data_type: raw_image data_path: /path/to/visium/slide_1/raw_image.tif - - - data_type: label_image_data + - data_type: label_image_data data_path: /path/to/visium/slide_1/ file_type: visium ref_img: /path/to/visium/slide_1/raw_image.tif @@ -28,8 +25,8 @@ vitessce_options: spatial: xy: "obsm/spatial" mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] factors: - "obs/sample" sets: diff --git a/templates/xenium_template.yaml b/templates/xenium_template.yaml index 341bf5c..930e7ae 100644 --- a/templates/xenium_template.yaml +++ b/templates/xenium_template.yaml @@ -9,16 +9,13 @@ projects: datasets: - dataset: xenium title: "Test Xenium Dataset" - url: '' + url: "" data: - - - data_type: xenium + - data_type: xenium data_path: /path/to/test/xenium/ - - - data_type: raw_image + - data_type: raw_image data_path: /path/to/xenium/raw_image.tif - - - data_type: label_image_data + - data_type: label_image_data data_path: /path/to/xenium/ file_type: xenium ref_img: /path/to/xenium/raw_image.tif @@ -27,8 +24,8 @@ vitessce_options: spatial: xy: "obsm/X_spatial" mappings: - obsm/X_umap: [0,1] - obsm/X_pca: [0,1] + obsm/X_umap: [0, 1] + obsm/X_pca: [0, 1] sets: - "obs/graphclust" matrix: "X" From a24f3eba9fb00e45046541ee243c32c3e3ecfad0 Mon Sep 17 00:00:00 2001 From: dannda Date: Fri, 17 Nov 2023 14:13:57 +0000 Subject: [PATCH 09/46] read tif with dask instead of rioxarray --- bin/write_spatialdata.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/write_spatialdata.py b/bin/write_spatialdata.py index 161fd38..29b14bc 100755 --- a/bin/write_spatialdata.py +++ b/bin/write_spatialdata.py @@ -10,21 +10,24 @@ import logging import warnings import fire +import tifffile as tf import anndata as ad import xarray as xr import spatialdata as sd +from dask_image.imread import imread warnings.filterwarnings("ignore") logging.getLogger().setLevel(logging.INFO) def read_image(path: str, is_label: bool = False): - imarray = xr.open_dataarray(path, engine="rasterio", mask_and_scale=False) + tif = tf.TiffFile(path) + dims = list(tif.series[0].axes.lower().replace("s", "c")) + image = imread(path).squeeze() + imarray = xr.DataArray(image, dims=dims) if is_label: - imarray = imarray.squeeze() return sd.models.Labels2DModel.parse(imarray) else: - imarray = imarray.rename({"band": "c"}) return sd.models.Image2DModel.parse(imarray) From 92f4c1aec5ffe02da67eb107114407fa020b27fb Mon Sep 17 00:00:00 2001 From: Daniela Basurto-Lozada Date: Fri, 17 Nov 2023 20:13:44 +0000 Subject: [PATCH 10/46] auto rechunk of images for spatialdata rename 'i' axes --- bin/write_spatialdata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 
deletions(-) diff --git a/bin/write_spatialdata.py b/bin/write_spatialdata.py index 29b14bc..eb18530 100755 --- a/bin/write_spatialdata.py +++ b/bin/write_spatialdata.py @@ -22,9 +22,12 @@ def read_image(path: str, is_label: bool = False): tif = tf.TiffFile(path) - dims = list(tif.series[0].axes.lower().replace("s", "c")) + dims = list(tif.series[0].axes.lower() + .replace("s", "c") + .replace("i", "c") + ) image = imread(path).squeeze() - imarray = xr.DataArray(image, dims=dims) + imarray = xr.DataArray(image, dims=dims).chunk(chunks="auto") if is_label: return sd.models.Labels2DModel.parse(imarray) else: From d548b59f57e6a4359e7f467340c65ad63d976e1a Mon Sep 17 00:00:00 2001 From: dannda Date: Mon, 20 Nov 2023 16:11:54 +0000 Subject: [PATCH 11/46] add option for extend_feature args in multimodal yaml config add cell2location filtering by obs --- bin/integrate_anndata.py | 28 ++++++++++++++++---------- multimodal.nf | 32 +++++++++++++++++++++++++++--- templates/multimodal-template.yaml | 5 ++++- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/bin/integrate_anndata.py b/bin/integrate_anndata.py index 88188ce..0ab4eda 100755 --- a/bin/integrate_anndata.py +++ b/bin/integrate_anndata.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from typing import Union +import typing as T import os import fire import zarr @@ -13,17 +14,19 @@ from pathlib import Path -def reindex_and_concat(path: str, offset: int, features: str = None, **kwargs): +def reindex_and_concat( + path: str, offset: int, features: str = None, args: dict[str, T.Any] = {}, **kwargs +): adata = read_anndata(path) - adata = reindex_anndata(adata, offset, no_save=True) + adata = reindex_anndata(adata, offset, **args, **kwargs) if features: - adata = concat_features(adata, features, no_save=True) + adata = concat_features(adata, features, **args, **kwargs) out_filename = "reindexed-concat-{}".format( os.path.splitext(os.path.basename(path))[0] ) - write_anndata(adata, out_filename, **kwargs) + write_anndata(adata, out_filename, **args, **kwargs) return @@ -31,7 +34,7 @@ def reindex_and_concat(path: str, offset: int, features: str = None, **kwargs): def reindex_anndata( data: Union[ad.AnnData, str], offset: int, - no_save: bool = False, + no_save: bool = True, out_filename: str = None, **kwargs, ): @@ -55,7 +58,7 @@ def reindex_anndata( def concat_features( data: Union[ad.AnnData, str], features: str, - no_save: bool = False, + no_save: bool = True, out_filename: str = None, **kwargs, ): @@ -68,11 +71,11 @@ def concat_features( ) if features.endswith(".h5ad") and os.path.isfile(features): - adata = concat_matrix_from_cell2location(adata, features) + adata = concat_matrix_from_cell2location(adata, features, **kwargs) elif features.startswith("obs/"): - adata = concat_matrix_from_obs(adata, features.split("/")[1]) + adata = concat_matrix_from_obs(adata, features.split("/")[1], **kwargs) elif features.startswith("obsm/"): - adata = concat_matrix_from_obsm(adata, features.split("/")[1]) + adata = concat_matrix_from_obsm(adata, features.split("/")[1], **kwargs) if no_save: return adata @@ -134,9 +137,10 @@ def concat_matrix_from_cell2location( data: Union[ad.AnnData, str], c2l_file: str, q: str = "q05_cell_abundance_w_sf", - sample: str = None, + sample: tuple[str, str] = None, feature_name: str = "gene", obs_feature_name: str = None, + **kwargs, ): if isinstance(data, ad.AnnData): adata = data @@ -162,7 +166,9 @@ def concat_matrix_from_cell2location( dtype="float32", ) - return concat_matrices(adata, c2l_df, "celltype", 
feature_name, obs_feature_name) + return concat_matrices( + adata, c2l_df, "celltype", feature_name, obs_feature_name, **kwargs + ) def concat_matrices( diff --git a/multimodal.nf b/multimodal.nf index dc99924..a5baab0 100644 --- a/multimodal.nf +++ b/multimodal.nf @@ -33,6 +33,30 @@ def copyFile (inputFile, outdir) { return inputFile } +def parseExtendFeature (extend_feature){ + if (!extend_feature){ + return [ file("NO_FT"), [:] ] + } + else if (extend_feature instanceof String ){ + return [ + file(extend_feature.endsWith(".h5ad") ? extend_feature : "NO_FT"), + [:] + ] + } + else if (extend_feature instanceof Map){ + if (extend_feature["path"]){ + if (!(extend_feature.path instanceof String && extend_feature.path.endsWith(".h5ad"))){ + error "Invalid value for `extend_feature.path`. Expecting an .h5ad file." + } + return [ file(extend_feature.path), extend_feature.args ?: [:] ] + } + else { + error "Invalid map for `extend_feature`. Expecting key `path`." + } + } + error "Invalid value for `extend_feature`" +} + process process_label { tag "${label_image}" debug verbose_log @@ -64,7 +88,7 @@ process process_anndata { publishDir outdir_with_version, mode:"copy" input: - tuple val(dataset), path(anndata), val(offset), val(features), path(features_file) + tuple val(dataset), path(anndata), val(offset), val(features), path(features_file), val(features_args) output: tuple val(dataset), path("*") @@ -73,11 +97,13 @@ process process_anndata { features_str = features ? "--features ${features_file.name != 'NO_FT' ? features_file : features}" : "" + args_str = features_file.name != 'NO_FT' && features_args + ? "--args '" + new JsonBuilder(features_args).toString() + "'" : "" """ integrate_anndata.py reindex_and_concat \ --path ${anndata} \ --offset ${offset} \ - ${features_str} + ${features_str} ${args_str} """ } @@ -140,7 +166,7 @@ workflow { raws : [it.dataset, it.raw_image] // not processed but necessary for writing config labels : [it.dataset, it.label_image, it.offset] adatas : [it.dataset, file(it.anndata), it.offset, it.extend_feature, - file(it.extend_feature && it.extend_feature.endsWith(".h5ad") ? 
it.extend_feature : "NO_FT") + *parseExtendFeature(it.extend_feature) ] } .set{data} diff --git a/templates/multimodal-template.yaml b/templates/multimodal-template.yaml index 9fc3b48..b8a07b7 100644 --- a/templates/multimodal-template.yaml +++ b/templates/multimodal-template.yaml @@ -33,7 +33,10 @@ data: is_spatial: true raw_image: test-visium-raw.zarr label_image: test-visium-label.zarr - extend_feature: /path/to/c2l_output.h5ad # obsm/celltype or obsm/q05_cell_abundance_w_sf + extend_feature: + path: /path/to/c2l_output.h5ad + args: + sample: ["library_id", "sample_1"] vitessce_options: spatial: xy: "obsm/spatial" From 1e6204540ab03171cacb12899b0e0628a1a88f00 Mon Sep 17 00:00:00 2001 From: dannda Date: Tue, 21 Nov 2023 12:05:08 +0000 Subject: [PATCH 12/46] add c2l output default sorting option --- bin/integrate_anndata.py | 18 ++++++++++++++++++ bin/process_h5ad.py | 3 +++ 2 files changed, 21 insertions(+) diff --git a/bin/integrate_anndata.py b/bin/integrate_anndata.py index 0ab4eda..aef4be7 100755 --- a/bin/integrate_anndata.py +++ b/bin/integrate_anndata.py @@ -140,8 +140,11 @@ def concat_matrix_from_cell2location( sample: tuple[str, str] = None, feature_name: str = "gene", obs_feature_name: str = None, + sort: bool = True, + sort_index: str = None, **kwargs, ): + sort = sort or sort_index is not None if isinstance(data, ad.AnnData): adata = data else: @@ -157,6 +160,21 @@ def concat_matrix_from_cell2location( if sample: c2l_adata = c2l_adata[c2l_adata.obs[sample[0]] == sample[1]] + if sort: + if not sort_index and adata.uns.get("webatlas_reindexed"): + sort_index = "label_id" + if sort_index: + idx = c2l_adata.obs.index.get_indexer(adata.obs[sort_index].tolist()) + else: + idx = c2l_adata.obs.index.get_indexer(adata.obs.index.tolist()) + if -1 in idx: + raise SystemError( + "Values do not match between AnnData object's" + f" `{sort_index or 'index'}`" + " and cell2location output index." + ) + c2l_adata = c2l_adata[idx,] + c2l_df = pd.DataFrame( c2l_adata.obsm[q].to_numpy(), index=c2l_adata.obs.index, diff --git a/bin/process_h5ad.py b/bin/process_h5ad.py index 5a7ddc4..e525d38 100755 --- a/bin/process_h5ad.py +++ b/bin/process_h5ad.py @@ -125,10 +125,13 @@ def reindex_anndata_obs(adata: ad.AnnData) -> ad.AnnData: adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric()) ): IDX_NAME = "label_id" + if IDX_NAME in adata.obs: + adata.obs.rename(columns={IDX_NAME: f"_{IDX_NAME}"}) adata.obs = adata.obs.reset_index(names=IDX_NAME) adata.obs.index = ( pd.Categorical(adata.obs[IDX_NAME]).codes + 1 ) # avoid 0's for possible label images + adata.uns["webatlas_reindexed"] = True adata.obs.index = adata.obs.index.astype(str) return adata From 59163a2ff437a17e66908d070d4478e063915dc3 Mon Sep 17 00:00:00 2001 From: dannda Date: Tue, 21 Nov 2023 13:28:38 +0000 Subject: [PATCH 13/46] add 'extend_feature' args to multimodal docs --- sphinx/multimodal/configuration.rst | 38 ++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/sphinx/multimodal/configuration.rst b/sphinx/multimodal/configuration.rst index 47e3054..dd45ed8 100644 --- a/sphinx/multimodal/configuration.rst +++ b/sphinx/multimodal/configuration.rst @@ -134,4 +134,40 @@ The full parameters file will then look like this xy: obsm/spatial matrix: X -With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. 
\ No newline at end of file
+With these parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal.
+
+In the case of providing a *cell2location* output file, you can further configure ``extend_feature`` with arguments for how the file should be processed.
+Instead of only setting the path to the file, you would need to define ``extend_feature`` as a map containing ``path`` and optional ``args``.
+
+.. code-block:: yaml
+
+    extend_feature_name: celltype
+    data:
+      -
+        dataset: visium
+        obs_type: spot
+        anndata: /path/to/main/output/visium-anndata.zarr
+        extend_feature:
+          path: /path/to/c2l.h5ad
+          args:
+            sample: ["library_id", "sample_1"] # tuple containing the obs column name and value to filter the object. By default the object is not filtered.
+            q: "q05_cell_abundance_w_sf" # matrix in obsm to use. Defaults to "q05_cell_abundance_w_sf".
+            sort_index: "index_column" # column in the AnnData object that contains an index that matches the index in cell2location.
+            sort: True # can be set to False to skip ordering the cell2location matrix, but observations might then not match in order between files. Defaults to True.
+
+For example, ``sample`` can be used when a *cell2location* output file contains predictions for multiple samples.
+Setting ``sample`` to filter the output file enables the pipeline to obtain the appropriate prediction matrix for the data being processed,
+without having to split the *cell2location* output file per sample. Otherwise, if a file with predictions for multiple samples is input,
+its number of observations will not match that of the AnnData object and the process will throw an error.
+
+``q`` can be set to use a different prediction matrix from the *cell2location* output file.
+It defaults to ``"q05_cell_abundance_w_sf"``.
+
+``sort`` and ``sort_index`` can be used to define how a *cell2location* output file is matched to the AnnData object.
+By default, the pipeline will try to ensure the order of observations in the prediction matrix and the AnnData object match
+so values are correctly concatenated.
+The pipeline will attempt to order the prediction matrix by the index of the AnnData object
+(or by the original index if the main pipeline re-indexed it).
+Alternatively, ``sort_index`` lets you specify which observations column of the AnnData object contains the index that the prediction matrix should match.
+``sort`` can be set to ``False`` to disable any re-ordering. If disabled, the prediction matrix is concatenated as-is into the AnnData object
+without checking whether observations' IDs match.
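+
+As a rough sketch of what this matching does (illustrative object names only, not pipeline API; the pipeline's own logic lives in ``integrate_anndata.py``), the re-ordering boils down to a pandas index lookup:
+
+.. code-block:: python
+
+    import anndata as ad
+    import numpy as np
+    import pandas as pd
+
+    # Toy AnnData whose obs keep the original IDs in a "label_id" column
+    adata = ad.AnnData(
+        X=np.zeros((3, 1)),
+        obs=pd.DataFrame({"label_id": ["c", "a", "b"]}, index=["1", "2", "3"]),
+    )
+    # Toy cell2location output indexed by the same IDs, in a different order
+    c2l = ad.AnnData(X=np.zeros((3, 1)), obs=pd.DataFrame(index=["a", "b", "c"]))
+
+    # Position of each AnnData observation within the cell2location index
+    idx = c2l.obs.index.get_indexer(adata.obs["label_id"].tolist())
+    if -1 in idx:  # an observation has no counterpart in the prediction file
+        raise ValueError("indices do not match")
+    c2l = c2l[idx]  # rows now line up with adata, ready to be concatenated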
From 6e839654c9584f455bbed13a94f143542b3b140a Mon Sep 17 00:00:00 2001
From: dannda 
Date: Tue, 7 Nov 2023 14:54:25 +0000
Subject: [PATCH 14/46] add documentation on multimodal pipeline

---
 bin/build_config_multimodal.py      |   5 +-
 sphinx/index.rst                    |   1 +
 sphinx/modules.rst                  |   2 +-
 sphinx/multimodal_configuration.rst | 149 ++++++++++++++++++++++++++++
 sphinx/run.rst                      |  16 ++-
 5 files changed, 168 insertions(+), 5 deletions(-)
 create mode 100644 sphinx/multimodal_configuration.rst

diff --git a/bin/build_config_multimodal.py b/bin/build_config_multimodal.py
index 2b5acdf..16b6be7 100755
--- a/bin/build_config_multimodal.py
+++ b/bin/build_config_multimodal.py
@@ -42,8 +42,8 @@ def write_json(
            "file_paths" : [],
            "images": {"raw": [], "label": []},
            "options": {},
-            "obs_type": "cell"
-            "is_spatial": True // if has images should be enough
+            "obs_type": "cell",
+            "is_spatial": True
        }
    }
    Defaults to {}.
@@ -62,7 +62,6 @@ def write_json(
        outdir (str, optional): Directory in which the config file
        will be written to. Defaults to "./".
    """
-
    config = VitessceConfig(
        "1.0.15",
        name=str(title) if len(title) else str(project),
diff --git a/sphinx/index.rst b/sphinx/index.rst
index 1a250c9..8c1685e 100644
--- a/sphinx/index.rst
+++ b/sphinx/index.rst
@@ -26,6 +26,7 @@ The pipeline generates data files for `supported data types`_, and builds a `vie
   installation
   configuration
+   multimodal_configuration
   run
   visualise
   Demos
diff --git a/sphinx/modules.rst b/sphinx/modules.rst
index 120466c..ddfe0f5 100644
--- a/sphinx/modules.rst
+++ b/sphinx/modules.rst
@@ -38,5 +38,5 @@ Modules
.. automodule:: integrate_image
   :members:

-.. automodule:: write_spatialdata
+.. automodule:: build_config_multimodal
   :members:
\ No newline at end of file
diff --git a/sphinx/multimodal_configuration.rst b/sphinx/multimodal_configuration.rst
new file mode 100644
index 0000000..3e905a4
--- /dev/null
+++ b/sphinx/multimodal_configuration.rst
@@ -0,0 +1,149 @@
+.. _multimodal_configuration:
+
+########################################
+Configuration for multimodal integration
+########################################
+
+The WebAtlas pipeline can process a group of datasets that share common features to be visualised together.
+This requires a YAML parameters file similar to :ref:`the one used to run the main pipeline `.
+The outputs generated by running the main conversion pipeline serve as inputs for this multimodal integration pipeline.
+
+The multimodal integration pipeline performs several tasks:
+
+1. Reindex each dataset by a user-specified offset so IDs do not clash between them.
+2. *Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories.
+3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations).
+   **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index.
+
+After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline.
+
+
+..
_multimodal_parameters_file: + +*************** +Parameters file +*************** + +The parameters file looks like this: + +.. code-block:: yaml + + outdir: "/path/to/output/" + + url: http://localhost:3000/ + project: my_project + title: "My Project" + + data: + - + dataset: scrnaseq + obs_type: cell + anndata: /path/to/main/output/scrnaseq-anndata.zarr + offset: 0 + is_spatial: false + vitessce_options: + spatial: + xy: obsm/spatial + mappings: + obsm/X_umap: [0,1] + matrix: X + - + dataset: visium + obs_type: spot + anndata: /path/to/main/output/visium-anndata.zarr + offset: 1000000 + is_spatial: true + raw_image: /path/to/main/output/visium-raw.zarr + label_image: /path/to/main/output/visium-label.zarr + vitessce_options: + spatial: + xy: obsm/spatial + matrix: X + +In contrast to the main conversion pipeline's parameters file, this file includes a single `project` to which multiple `datasets` belong to. + +Each ``dataset`` block defines the name of the dataset and paths to the converted data and image files (if any). + +Each ``dataset`` also requires a set of ``vitessce_options`` that specify the location of certain data (spatial coordinates, embeddings, expression matrix, etc.) within the AnnData object that is processed/generated. +This follows the same structure as in the :ref:`main conversion's vitessce_options `. + +Additionally, each ``dataset`` requires: + +* ``obs_type``, the type of observation of the dataset. For example, "cell" or "spot". +* ``offset``, an integer offset to add to the dataset's ID's so they don't clash with the other datasets. +* ``is_spatial``, whether the dataset contains spatial information and has associated image files (raw and/or label images) + +Given that raw images are only read but not modified the pipeline does not generate new output from them. +In order for the output directory (defined by ``outdir``) to contain all necessary files that need to be served for the web application to consume, +by default, the pipeline copies the raw images to the output directory. +This process can take a long time depending on the size of the image. +You may want to manually copy or move the image or serve it from its own directory separate from the rest of the output. +The default copying can be disabled by setting ``copy_raw: false`` as a project-wide parameter (at the same level as ``outdir``, ``project``, etc). +For example, + +.. code-block:: yaml + + outdir: "/path/to/output/" + url: http://localhost:3000/ + project: my_project + title: "My Project" + copy_raw: false + + +With additional features +======================== + +Using the above example parameters file to run the multimodal integration pipeline will run the reindexing and intersection steps. +To perform the concatenation of additional features (like celltypes) to visualise them as continuous values, some extra parameters need to be added. + +As a project-wide parameter (at the same level as ``outdir``, ``project``, etc.): + +* ``extend_feature_name``, the name of the additional feature. For example, "celltype" + +And at a ``dataset`` level: + +* ``extend_feature``, the location of the additional feature information. + This can be either the path to a *cell2location* output file, or the location within the AnnData object where the feature is stored as a categorical within ``obs``. + For example, ``/path/to/c2l.h5ad`` containing predicted continuous values, or ``obs/celltype`` containing categoricals. + +The full parameters file will then look like this + +.. 
code-block:: yaml

    outdir: "/path/to/output/"

    url: http://localhost:3000/
    project: my_project
    title: "My Project"

    extend_feature_name: celltype

    data:
      -
        dataset: scrnaseq
        obs_type: cell
        anndata: /path/to/main/output/scrnaseq-anndata.zarr
        extend_feature: obs/celltype
        offset: 0
        is_spatial: false
        vitessce_options:
          spatial:
            xy: obsm/spatial
          mappings:
            obsm/X_umap: [0,1]
          matrix: X
      -
        dataset: visium
        obs_type: spot
        anndata: /path/to/main/output/visium-anndata.zarr
        extend_feature: /path/to/c2l.h5ad
        offset: 1000000
        is_spatial: true
        raw_image: /path/to/main/output/visium-raw.zarr
        label_image: /path/to/main/output/visium-label.zarr
        vitessce_options:
          spatial:
            xy: obsm/spatial
          matrix: X

With these parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal.
\ No newline at end of file
diff --git a/sphinx/run.rst b/sphinx/run.rst
index 07b6c8d..0813262 100644
--- a/sphinx/run.rst
+++ b/sphinx/run.rst
@@ -32,6 +32,20 @@ defined.
 You can modify the entry point if you're interested in only getting the converted outputs.
 Use ``-entry Process_files`` or ``-entry Process_images`` as you need.
 
+Multimodal integration
+----------------------
+
+Additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features to be able to visualise and query across all of them in a single portal.
+This pipeline can also process extra features (e.g. celltypes) to visualise them across datasets in addition to their expression matrices.
+
+Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline.)
+
+To run this pipeline use
+
+.. code-block:: shell
+
+    nextflow run multimodal.nf -params-file /path/to/multimodal-params.yaml
+
 Running using Docker
 --------------------
 
@@ -61,7 +75,7 @@ The default pipeline will run on local executor without any type of environment
 Pulling the containers when the pipline is launched may take a few minutes.
 
 Running using Conda
--------------------------
+-------------------
 
 The default pipeline will run on local executor without any type of environment creation. If you've already setup your conda environment you don't have to do anything else.
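
The two multimodal operations these docs describe — shifting observation IDs by a per-dataset ``offset`` and subsetting every dataset to the shared features — can be illustrated with a short standalone sketch. This is not code from the pipeline; it is a minimal illustration using ``anndata``, with hypothetical toy objects standing in for the converted outputs and a ``reindex_with_offset`` helper invented for the example:

.. code-block:: python

    # Minimal sketch (not pipeline code): per-dataset offsets keep observation
    # IDs unique across modalities, and subsetting to the intersection of var
    # names keeps only the features present in every dataset.
    import anndata as ad
    import numpy as np
    import pandas as pd

    def reindex_with_offset(adata: ad.AnnData, offset: int) -> ad.AnnData:
        # Assumes the main pipeline already gave obs an integer-like string index.
        adata.obs.index = (adata.obs.index.astype(int) + offset).astype(str)
        return adata

    def intersect_features(adatas: list) -> list:
        # Intersect on the var index; as the docs note, all datasets must index
        # var the same way (all names, or all IDs) for this to be meaningful.
        shared = sorted(set.intersection(*(set(a.var_names) for a in adatas)))
        return [a[:, shared].copy() for a in adatas]

    # Hypothetical toy stand-ins for the converted scrnaseq and visium outputs
    scrnaseq = ad.AnnData(np.zeros((3, 3)), var=pd.DataFrame(index=["GeneA", "GeneB", "GeneC"]))
    visium = ad.AnnData(np.zeros((2, 2)), var=pd.DataFrame(index=["GeneB", "GeneC"]))

    scrnaseq = reindex_with_offset(scrnaseq, offset=0)
    visium = reindex_with_offset(visium, offset=1000000)
    assert not set(scrnaseq.obs_names) & set(visium.obs_names)

    scrnaseq, visium = intersect_features([scrnaseq, visium])
    assert list(scrnaseq.var_names) == list(visium.var_names) == ["GeneB", "GeneC"]

This mirrors only the behaviour described in the documentation above; the actual implementation lives in the pipeline's integration scripts.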
From 3bb30417aa06739f03dd745954ab3f30b6846924 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Tue, 7 Nov 2023 16:45:56 +0000 Subject: [PATCH 15/46] restructure multimodal docs --- sphinx/_static/css/custom.css | 37 +++++------ sphinx/index.rst | 12 +++- .../configuration.rst} | 20 ++---- sphinx/multimodal/overview.rst | 31 ++++++++++ sphinx/multimodal/run.rst | 61 +++++++++++++++++++ sphinx/multimodal/visualise.rst | 37 +++++++++++ sphinx/run.rst | 14 ----- 7 files changed, 162 insertions(+), 50 deletions(-) rename sphinx/{multimodal_configuration.rst => multimodal/configuration.rst} (78%) create mode 100644 sphinx/multimodal/overview.rst create mode 100644 sphinx/multimodal/run.rst create mode 100644 sphinx/multimodal/visualise.rst diff --git a/sphinx/_static/css/custom.css b/sphinx/_static/css/custom.css index d820eda..bcf0326 100644 --- a/sphinx/_static/css/custom.css +++ b/sphinx/_static/css/custom.css @@ -1,27 +1,28 @@ .rst-content .code-block-caption { - text-align: left; - font-weight: 700; - display: block; - color: #fff; - background: #6ab0de; - padding: 6px 12px; + text-align: left; + font-weight: 700; + display: block; + color: #fff; + background: #6ab0de; + padding: 6px 12px; } -.rst-content div[class^=highlight] { - margin-top: 0; +.rst-content div[class^="highlight"] { + margin-top: 0; } .admonition.shell-output { - font-size: 85%; - line-height: 1; -} + font-size: 85%; + line-height: 1; +} .admonition.shell-output .admonition-title:before { - content: None; + content: None; } .admonition.shell-output .admonition-title { - font-style: italic; + font-style: italic; } - - .citation { - font-weight: bold; - font-style: italic; -} \ No newline at end of file + font-weight: bold; + font-style: italic; +} +code.docutils.literal.notranslate { + white-space: nowrap; +} diff --git a/sphinx/index.rst b/sphinx/index.rst index 8c1685e..d94a655 100644 --- a/sphinx/index.rst +++ b/sphinx/index.rst @@ -9,7 +9,7 @@ .. |DOI| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.7405818.svg :target: https://doi.org/10.5281/zenodo.7405818 -WebAtlas Pipeline +WebAtlas pipeline ================= This Nextflow pipeline processes spatial and single-cell experiment data for visualisation in `WebAtlas App`_. @@ -26,7 +26,6 @@ The pipeline generates data files for `supported data types`_, and builds a `vie installation configuration - multimodal_configuration run visualise Demos @@ -39,6 +38,15 @@ Indices and tables * :ref:`modindex` * :ref:`search` +.. toctree:: + :maxdepth: 2 + :caption: Multimodal + + multimodal/overview + multimodal/configuration + multimodal/run + multimodal/visualise + .. toctree:: :maxdepth: 2 :hidden: diff --git a/sphinx/multimodal_configuration.rst b/sphinx/multimodal/configuration.rst similarity index 78% rename from sphinx/multimodal_configuration.rst rename to sphinx/multimodal/configuration.rst index 3e905a4..3899ddc 100644 --- a/sphinx/multimodal_configuration.rst +++ b/sphinx/multimodal/configuration.rst @@ -1,22 +1,10 @@ .. _multimodal_configuration: -######################################## -Configuration for multimodal integration -######################################## - -The WebAtlas pipeline can process a group of datasets that share common features to be visualised together. -This requires a YAML parameters file similar to :ref:`the one used to run the main pipeline. `. -The outputs generated by running the main conversion pipeline serve as inputs for this multimodal integration pipeline. - -The multimodal integration pipeline performs several tasks: - -1. 
Reindex each dataset by a user-inputed offset so ID's do not clash between them. -2. *Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories. -3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations.) - **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. - -After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline +###################### +Multimodal configuration +###################### +After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline. .. _multimodal_parameters_file: diff --git a/sphinx/multimodal/overview.rst b/sphinx/multimodal/overview.rst new file mode 100644 index 0000000..46fb273 --- /dev/null +++ b/sphinx/multimodal/overview.rst @@ -0,0 +1,31 @@ +.. _multimodal_overview: + +################### +Multimodal overview +################### + +After the ``main.nf`` pipeline has been successfully run, WebAtlas can optionally process a group of multimodal datasets that +share common features. This step will prepare the unified multimodal visualision for the web app. + +The data outputs generated by running the ``main.nf`` conversion pipeline serve as inputs for this multimodal integration pipeline. + +*************** +Running the multimodal pipeline +*************** + +Follow the instructions below to run the multimodal pipeline. + +1. :ref:`Configure ` the parameters file for the ``multimodal.nf`` pipeline +2. :ref:`Run ` the ``multimodal.nf`` pipeline +3. :ref:`Visualise ` the multimodal data in a web browser + +*************** +Tasks completed by the pipeline +*************** + +The multimodal integration pipeline performs several tasks: + +1. Reindex each dataset by a user-inputed offset so ID's do not clash between modalities. +2. *Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories. +3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations.) + **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. \ No newline at end of file diff --git a/sphinx/multimodal/run.rst b/sphinx/multimodal/run.rst new file mode 100644 index 0000000..2d57141 --- /dev/null +++ b/sphinx/multimodal/run.rst @@ -0,0 +1,61 @@ +.. _multimodal_run: + +Multimodal run +=========== + +In additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features. 
This allows the users to
+visualise and query all common features such as genes and cell types across all modalities from a single web portal.
+
+Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline).
+
+To run this pipeline use
+
+.. code-block:: shell
+
+    nextflow run multimodal.nf -params-file /path/to/multimodal-params.yaml
+
+Running using Docker
+--------------------
+
+The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Docker containers use the ``-profile docker`` option:
+
+.. code-block:: shell
+
+    nextflow run multimodal.nf \
+        -params-file /path/to/multimodal-params.yaml \
+        -profile docker
+
+Pulling the containers when the pipeline is launched may take a few minutes.
+
+Running using Singularity
+-------------------------
+
+The default pipeline will run on local executor without any type of environment creation. To run the pipeline using Singularity containers use the ``-profile singularity`` option:
+
+.. code-block:: shell
+
+    nextflow run multimodal.nf \
+        -params-file /path/to/multimodal-params.yaml \
+        -profile singularity
+
+Pulling the containers when the pipeline is launched may take a few minutes.
+
+Running using Conda
+-------------------
+
+The default pipeline will run on local executor without any type of environment creation. If you've already set up your conda environment you don't have to do anything else.
+
+However, if you are working on a compute cluster you will need to make sure the conda environment is available and active in your worker nodes. To run the pipeline using a new conda environment use the ``-profile conda`` option:
+
+.. code-block:: shell
+
+    nextflow run multimodal.nf \
+        -params-file /path/to/multimodal-params.yaml \
+        -profile conda
+
+Creating the environment when the pipeline is launched may take a few minutes.
+
+Further reading
+---------------
+
+For more information about Docker image pulling/local conda env creation in Nextflow please refer to Nextflow's official docs for `containers `__ and `conda `__.
\ No newline at end of file
diff --git a/sphinx/multimodal/visualise.rst b/sphinx/multimodal/visualise.rst
new file mode 100644
index 0000000..88115a0
--- /dev/null
+++ b/sphinx/multimodal/visualise.rst
@@ -0,0 +1,37 @@
+.. _multimodal_visualise:
+
+Multimodal visualision
+===========
+
+The pipeline generates a Vitessce view config file for each processed dataset.
+This file can then be used to load the views and data as configured in the parameters files.
+
+You can locally serve and visualize the data in a few steps.
+
+By default, the base ``url`` used within the view config is ``http://localhost:3000/``
+(this can be changed in the :ref:`parameters file `).
+This ``url`` indicates Vitessce to look for data at that location.
+
+You can set up a ``http`` server locally to serve the processed files so a Vitessce instance can load them.
+There are several tools that can set up a ``http`` server.
+We recommend using `serve `__ (requires `Node.js `__),
+but you can use any tool that can enable CORS.
+
+You can serve the view config file and data by specifying the output directory
+(note that the pipeline adds its version to the ``outdir`` defined in the :ref:`parameters file ` file).
+
+.. parsed-literal::
+
+    serve -C -p 3000 /path/to/outdir/|release|/
+
+Make sure to enable CORS and set the appropriate port number.
+In this case, using `serve `__, this is done through the ``-C`` and ``-p`` flags respectively. + +Your view configs should then be accessible at ``http://localhost:3000/{project}-{dataset}-config.json``. + +You can then load them in a Vitessce instance like the `WebAtlas app `__ +deployed at ``__. + +Specify your locally served view config through the ``config`` parameter like +``https://webatlas.cog.sanger.ac.uk/latest/index.html?config=http://localhost:3000/{project}-{dataset}-config.json`` +and load this URL in your browser to visualize your data in a Vitessce viewer. \ No newline at end of file diff --git a/sphinx/run.rst b/sphinx/run.rst index 0813262..cbab333 100644 --- a/sphinx/run.rst +++ b/sphinx/run.rst @@ -32,20 +32,6 @@ defined. You can modify the entry point if you're interested in only getting the converted outputs. Use ``-entry Process_files`` or ``-entry Process_images`` as you need. -Multimodal integration ----------------------- - -Additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features to be able to visualise and query across all of them in a single portal. -This pipeline can also process extra features (e.g. celltypes) to visualise them across datasets in addition to their expression matrices. - -Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline.) - -To run this pipeline use - -.. code-block:: shell - - nextflow run multimodal.nf -params-file /path/to/multimodal-params.yaml - Running using Docker -------------------- From d54c2be20e3063662b668b83a0457213e7a1342c Mon Sep 17 00:00:00 2001 From: dannda Date: Wed, 8 Nov 2023 11:24:50 +0000 Subject: [PATCH 16/46] add info about multimodal visualisation to docs --- bin/build_config_multimodal.py | 12 ++++-------- sphinx/multimodal/configuration.rst | 14 +++++++------- sphinx/multimodal/overview.rst | 28 ++++++++++++++-------------- sphinx/multimodal/run.rst | 6 +++--- sphinx/multimodal/visualise.rst | 15 +++++++++++++-- 5 files changed, 41 insertions(+), 34 deletions(-) diff --git a/bin/build_config_multimodal.py b/bin/build_config_multimodal.py index 16b6be7..57a17b5 100755 --- a/bin/build_config_multimodal.py +++ b/bin/build_config_multimodal.py @@ -38,14 +38,10 @@ def write_json( Args: project (str, optional): Project name. Defaults to "". datasets (dict[str, dict[str]], optional): Dictionary of datasets. - Expected structure: { dataset_name: { - "file_paths" : [], - "images": {"raw": [], "label": []}, - "options": {}, - "obs_type": "cell", - "is_spatial": True - } - } + Expected structure: { dataset_name: { "file_paths" : [], + "images": {"raw": [], "label": []}, + "options": {}, "obs_type": "cell", + "is_spatial": True } } Defaults to {}. extended_features (Union[list[str], str], optional): List of features or string of single feature on which the expression matrix was extended diff --git a/sphinx/multimodal/configuration.rst b/sphinx/multimodal/configuration.rst index 3899ddc..47e3054 100644 --- a/sphinx/multimodal/configuration.rst +++ b/sphinx/multimodal/configuration.rst @@ -1,8 +1,8 @@ .. _multimodal_configuration: -###################### +######################## Multimodal configuration -###################### +######################## After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline. 
@@ -48,22 +48,22 @@ The parameters file looks like this: xy: obsm/spatial matrix: X -In contrast to the main conversion pipeline's parameters file, this file includes a single `project` to which multiple `datasets` belong to. +In contrast to the main conversion pipeline's parameters file, this file includes a single `project` to which multiple `datasets` belong. Each ``dataset`` block defines the name of the dataset and paths to the converted data and image files (if any). Each ``dataset`` also requires a set of ``vitessce_options`` that specify the location of certain data (spatial coordinates, embeddings, expression matrix, etc.) within the AnnData object that is processed/generated. -This follows the same structure as in the :ref:`main conversion's vitessce_options `. +This follows the same structure as in the :ref:`main pipeline's vitessce_options `. Additionally, each ``dataset`` requires: -* ``obs_type``, the type of observation of the dataset. For example, "cell" or "spot". +* ``obs_type``, a string indicating the type of observation of the dataset. For example, "cell" or "spot". * ``offset``, an integer offset to add to the dataset's ID's so they don't clash with the other datasets. -* ``is_spatial``, whether the dataset contains spatial information and has associated image files (raw and/or label images) +* ``is_spatial``, a boolean indicating whether the dataset contains spatial information and has associated image files (raw and/or label images) Given that raw images are only read but not modified the pipeline does not generate new output from them. In order for the output directory (defined by ``outdir``) to contain all necessary files that need to be served for the web application to consume, -by default, the pipeline copies the raw images to the output directory. +by default, the pipeline copies the raw images to the output directory (unless a file with the same name already exists in the output directory). This process can take a long time depending on the size of the image. You may want to manually copy or move the image or serve it from its own directory separate from the rest of the output. The default copying can be disabled by setting ``copy_raw: false`` as a project-wide parameter (at the same level as ``outdir``, ``project``, etc). diff --git a/sphinx/multimodal/overview.rst b/sphinx/multimodal/overview.rst index 46fb273..7b8ce29 100644 --- a/sphinx/multimodal/overview.rst +++ b/sphinx/multimodal/overview.rst @@ -9,23 +9,23 @@ share common features. This step will prepare the unified multimodal visualision The data outputs generated by running the ``main.nf`` conversion pipeline serve as inputs for this multimodal integration pipeline. -*************** -Running the multimodal pipeline -*************** - -Follow the instructions below to run the multimodal pipeline. - -1. :ref:`Configure ` the parameters file for the ``multimodal.nf`` pipeline -2. :ref:`Run ` the ``multimodal.nf`` pipeline -3. :ref:`Visualise ` the multimodal data in a web browser - -*************** +******************************* Tasks completed by the pipeline -*************** +******************************* The multimodal integration pipeline performs several tasks: -1. Reindex each dataset by a user-inputed offset so ID's do not clash between modalities. +1. Reindex each dataset by a user-inputted offset so ID's do not clash between modalities. 2. 
*Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories. 3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations.) - **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. \ No newline at end of file + **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. + +******************************* +Running the multimodal pipeline +******************************* + +Follow the instructions below to run the multimodal pipeline. + +1. :ref:`Configure the parameters file ` for the ``multimodal.nf`` pipeline +2. :ref:`Run ` the ``multimodal.nf`` pipeline +3. :ref:`Visualise ` the multimodal data in a web browser diff --git a/sphinx/multimodal/run.rst b/sphinx/multimodal/run.rst index 2d57141..418cd16 100644 --- a/sphinx/multimodal/run.rst +++ b/sphinx/multimodal/run.rst @@ -1,12 +1,12 @@ .. _multimodal_run: Multimodal run -=========== +============== -In additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features. This allows the users to +In addition to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features. This allows the users to visualise and query all common features such as genes and cell types across all modalities from a single web portal. -Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline). +Configurations and data are inputted through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline). To run this pipeline use diff --git a/sphinx/multimodal/visualise.rst b/sphinx/multimodal/visualise.rst index 88115a0..a381970 100644 --- a/sphinx/multimodal/visualise.rst +++ b/sphinx/multimodal/visualise.rst @@ -1,10 +1,21 @@ .. _multimodal_visualise: Multimodal visualision -=========== +====================== -The pipeline generates a Vitessce view config file for each processed dataset. +The pipeline generates a Vitessce view config file for each ``project`` to visualise its multiple ``datasets`` together. This file can then be used to load the views and data as configured in the parameters files. +Unlike the main conversion pipeline, the layout of the view config file generated for multimodal datasets cannot be configured. + +For each spatial ``dataset`` (has ``is_spatial: true`` and contains either a ``raw_image`` or ``label_image``) the pipeline adds an spatial image component and a layer controller to the view config. +For each non-spatial ``dataset`` (has ``is_spatial: false`` or contains no images) the pipeline adds a scatterplot component to the view config to visualise an embedding. 
+As datasets have been subsetted to the shared features only, the pipeline adds one feature selection component to the view config from which the user can query across datasets. +If an additional feature has been specified, the pipeline adds another feature selection component to the view config for that particular feature type. + +To set the layout of the view config the pipeline just concatenates components in a recursive manner. +Thus, components might end up with sizes that are not the most ideal. +During visualisation the user can resize and reorganise the components as needed. +Alternatively, the generated view config file can be manually modified to set a more appropriate layout. You can locally serve and visualize the data in a few steps. From eac3b9ddcc776403d92448931acc8b95db5d7b74 Mon Sep 17 00:00:00 2001 From: dannda Date: Wed, 8 Nov 2023 11:45:48 +0000 Subject: [PATCH 17/46] fix multimodal visualisation ref --- sphinx/multimodal/visualise.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sphinx/multimodal/visualise.rst b/sphinx/multimodal/visualise.rst index a381970..99e1203 100644 --- a/sphinx/multimodal/visualise.rst +++ b/sphinx/multimodal/visualise.rst @@ -20,7 +20,7 @@ Alternatively, the generated view config file can be manually modified to set a You can locally serve and visualize the data in a few steps. By default, the base ``url`` used within the view config is ``http://localhost:3000/`` -(this can be changed in the :ref:`parameters file `). +(this can be changed in the :ref:`parameters file `). This ``url`` indicates Vitessce to look for data at that location. You can set up a ``http`` server locally to serve the processed files so a Vitessce instance can load them. @@ -29,7 +29,7 @@ We recommend using `serve `__ (requires `No but you can use any tool that can enable CORS. You can serve the view config file and data by specifying the output directory -(note that the pipeline adds its version to the ``outdir`` defined in the :ref:`parameters file ` file). +(note that the pipeline adds its version to the ``outdir`` defined in the :ref:`parameters file ` file). .. parsed-literal:: From fad3b458594eb6c9f75476008eb1e258fdcaf113 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:14:18 +0000 Subject: [PATCH 18/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/docker-builds.yml diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml new file mode 100644 index 0000000..fb0abef --- /dev/null +++ b/.github/workflows/docker-builds.yml @@ -0,0 +1,40 @@ +name: Docker Builds + +on: + push: + branches: [dev] + pull_request: + branches: [dev] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Set up Python 3.10 + uses: actions/setup-python@v2 + - name: LatestTag + id: Latest Tag + uses: "WyriHaximus/github-action-get-previous-tag@v1" + with: + fallback: 1.0.0 # Optional fallback tag to use when no tag can be found + + - name: docker login + env: + DOCKER_USER: ${{ secrets.ARC_USERNAME }} + DOCKER_PASSWORD: ${{ secrets.ARC_SECRET }} + run: | + docker login fintrust.azurecr.io -u $DOCKER_USER -p $DOCKER_PASSWORD + - name: docker build + run: | + docker build . -f ./deployment/app.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-app:${GITHUB_REF##*/} + docker build . 
-f ./deployment/app.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-app:latest + docker build . -f ./deployment/web.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-web:${GITHUB_REF##*/} + docker build . -f ./deployment/web.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-web:latest + - name: docker push + run: | + docker push fintrust.azurecr.io/fintrust-bot-app:${GITHUB_REF##*/} + docker push fintrust.azurecr.io/fintrust-bot-app:latest + docker push fintrust.azurecr.io/fintrust-bot-web:${GITHUB_REF##*/} + docker push fintrust.azurecr.io/fintrust-bot-web:latest From 83d68eb84e62564607fc4a551c8117fd261252a0 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:15:03 +0000 Subject: [PATCH 19/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index fb0abef..b354201 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -14,27 +14,25 @@ jobs: uses: actions/checkout@v2 - name: Set up Python 3.10 uses: actions/setup-python@v2 - - name: LatestTag - id: Latest Tag + - name: GetTag + id: previoustag uses: "WyriHaximus/github-action-get-previous-tag@v1" with: fallback: 1.0.0 # Optional fallback tag to use when no tag can be found - - name: docker login env: DOCKER_USER: ${{ secrets.ARC_USERNAME }} DOCKER_PASSWORD: ${{ secrets.ARC_SECRET }} run: | - docker login fintrust.azurecr.io -u $DOCKER_USER -p $DOCKER_PASSWORD + docker login -u $DOCKER_USER -p $DOCKER_PASSWORD - name: docker build run: | - docker build . -f ./deployment/app.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-app:${GITHUB_REF##*/} - docker build . -f ./deployment/app.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-app:latest - docker build . -f ./deployment/web.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-web:${GITHUB_REF##*/} - docker build . -f ./deployment/web.prod.dockerfile -t fintrust.azurecr.io/fintrust-bot-web:latest + docker build . -f ./envs/Dockerfile -t latest -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} + docker build . 
-f ./envs/build_config/Dockerfile -t latest -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} + - name: docker push run: | - docker push fintrust.azurecr.io/fintrust-bot-app:${GITHUB_REF##*/} - docker push fintrust.azurecr.io/fintrust-bot-app:latest - docker push fintrust.azurecr.io/fintrust-bot-web:${GITHUB_REF##*/} - docker push fintrust.azurecr.io/fintrust-bot-web:latest + docker push webatlas-pipeline:${{ steps.previoustag.outputs.tag }} + docker push webatlas-pipeline:latest + docker push webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} + docker push webatlas-pipeline-build-config:latest From 941064705da2d9157cf25c789c8fd112658c0477 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:18:52 +0000 Subject: [PATCH 20/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b354201..bbad85f 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -21,8 +21,8 @@ jobs: fallback: 1.0.0 # Optional fallback tag to use when no tag can be found - name: docker login env: - DOCKER_USER: ${{ secrets.ARC_USERNAME }} - DOCKER_PASSWORD: ${{ secrets.ARC_SECRET }} + DOCKER_USER: ${{ secrets.DOCKER_USER }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} run: | docker login -u $DOCKER_USER -p $DOCKER_PASSWORD - name: docker build From a3cdebb6f226aa0a305d86157cefb3fed1950bc2 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:36:14 +0000 Subject: [PATCH 21/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 13 ++++--------- envs/build_config/Dockerfile | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index bbad85f..5e59832 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -12,9 +12,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 - - name: Set up Python 3.10 - uses: actions/setup-python@v2 - - name: GetTag + - name: Get tag id: previoustag uses: "WyriHaximus/github-action-get-previous-tag@v1" with: @@ -25,14 +23,11 @@ jobs: DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} run: | docker login -u $DOCKER_USER -p $DOCKER_PASSWORD - - name: docker build + - name: Build webatlas-pipeline docker + working-directory: ./envs run: | - docker build . -f ./envs/Dockerfile -t latest -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} - docker build . -f ./envs/build_config/Dockerfile -t latest -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} - + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:latest -t haniffalab/haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . 
- name: docker push run: | docker push webatlas-pipeline:${{ steps.previoustag.outputs.tag }} docker push webatlas-pipeline:latest - docker push webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} - docker push webatlas-pipeline-build-config:latest diff --git a/envs/build_config/Dockerfile b/envs/build_config/Dockerfile index aecdb58..30af41a 100644 --- a/envs/build_config/Dockerfile +++ b/envs/build_config/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.10 -COPY ./requirements-build_config.txt /requirements.txt +COPY ./requirements.txt /requirements.txt RUN apt-get update && \ apt-get install -y --no-install-recommends && \ apt-get clean && \ From 9a8b7e814782b930a20b53271adf73e994a73203 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:40:28 +0000 Subject: [PATCH 22/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 5e59832..865a39a 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -1,4 +1,4 @@ -name: Docker Builds +name: docker-builds on: push: @@ -17,17 +17,23 @@ jobs: uses: "WyriHaximus/github-action-get-previous-tag@v1" with: fallback: 1.0.0 # Optional fallback tag to use when no tag can be found - - name: docker login + - name: Docker login env: DOCKER_USER: ${{ secrets.DOCKER_USER }} DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} run: | docker login -u $DOCKER_USER -p $DOCKER_PASSWORD - - name: Build webatlas-pipeline docker + - name: Build docker (webatlas-pipeline) working-directory: ./envs run: | docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:latest -t haniffalab/haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + - name: Build docker (webatlas-pipeline-build-config) + working-directory: ./envs/build_config + run: | + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:latest -t haniffalab/haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . - name: docker push run: | - docker push webatlas-pipeline:${{ steps.previoustag.outputs.tag }} - docker push webatlas-pipeline:latest + docker push haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} + docker push haniffalab/webatlas-pipeline:latest + docker push haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} + docker push haniffalab/webatlas-pipeline-build-config:latest From 18fa8cdaa7981be1c35eebd411ed0987feebba9b Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:46:32 +0000 Subject: [PATCH 23/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 865a39a..6329637 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -13,10 +13,11 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Get tag - id: previoustag - uses: "WyriHaximus/github-action-get-previous-tag@v1" + uses: oprypin/find-latest-tag@v1 with: - fallback: 1.0.0 # Optional fallback tag to use when no tag can be found + repository: haniffalab/webatlas-pipeline # The repository to scan. + releases-only: true # We know that all relevant tags have a GitHub release for them. + id: previoustag # The step ID to refer to later. 
- name: Docker login env: DOCKER_USER: ${{ secrets.DOCKER_USER }} @@ -31,7 +32,7 @@ jobs: working-directory: ./envs/build_config run: | docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:latest -t haniffalab/haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . - - name: docker push + - name: Push Docker images run: | docker push haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} docker push haniffalab/webatlas-pipeline:latest From 88c59bad729528825a8e8520e0dd979619b6caa1 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 13:54:00 +0000 Subject: [PATCH 24/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 6329637..a7a34a2 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -15,9 +15,9 @@ jobs: - name: Get tag uses: oprypin/find-latest-tag@v1 with: - repository: haniffalab/webatlas-pipeline # The repository to scan. - releases-only: true # We know that all relevant tags have a GitHub release for them. - id: previoustag # The step ID to refer to later. + repository: haniffalab/webatlas-pipeline + releases-only: true + id: previoustag - name: Docker login env: DOCKER_USER: ${{ secrets.DOCKER_USER }} @@ -27,14 +27,12 @@ jobs: - name: Build docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:latest -t haniffalab/haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline:latest -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . - name: Build docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:latest -t haniffalab/haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:latest -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . 
- name: Push Docker images run: | - docker push haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} - docker push haniffalab/webatlas-pipeline:latest - docker push haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} - docker push haniffalab/webatlas-pipeline-build-config:latest + docker push webatlas-pipeline:${{ steps.previoustag.outputs.tag }} + docker push webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} From ae2b099022e9dfcbebe39cce2e19b427b24a4324 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:00:52 +0000 Subject: [PATCH 25/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index a7a34a2..035da99 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -17,7 +17,11 @@ jobs: with: repository: haniffalab/webatlas-pipeline releases-only: true - id: previoustag + id: find-latest-tag + - name: Set version + run: | + TAG=${{ steps.find-latest-tag.outputs.tag }} + echo "find-latest-tag=${TAG#v}" >> $GITHUB_ENV - name: Docker login env: DOCKER_USER: ${{ secrets.DOCKER_USER }} @@ -27,12 +31,12 @@ jobs: - name: Build docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t webatlas-pipeline:latest -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . - name: Build docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:latest -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . - name: Push Docker images run: | - docker push webatlas-pipeline:${{ steps.previoustag.outputs.tag }} - docker push webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} + docker push haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} + docker push haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} From 646fe19cd75bf91d750cce55b9042c802ee7bf3b Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:04:13 +0000 Subject: [PATCH 26/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 035da99..27da3d7 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -31,12 +31,12 @@ jobs: - name: Build docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t webatlas-pipeline:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . - name: Build docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . 
- name: Push Docker images run: | - docker push haniffalab/webatlas-pipeline:${{ steps.previoustag.outputs.tag }} - docker push haniffalab/webatlas-pipeline-build-config:${{ steps.previoustag.outputs.tag }} + docker push haniffalab/webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} + docker push haniffalab/webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} From 826fbf404218272af9c82a84b26c7ec35e0cf727 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:15:14 +0000 Subject: [PATCH 27/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 27da3d7..d9a5547 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -22,20 +22,19 @@ jobs: run: | TAG=${{ steps.find-latest-tag.outputs.tag }} echo "find-latest-tag=${TAG#v}" >> $GITHUB_ENV - - name: Docker login - env: - DOCKER_USER: ${{ secrets.DOCKER_USER }} - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} - run: | - docker login -u $DOCKER_USER -p $DOCKER_PASSWORD - - name: Build docker (webatlas-pipeline) + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Build Docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . - - name: Build docker (webatlas-pipeline-build-config) + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . + - name: Build Docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . - name: Push Docker images run: | docker push haniffalab/webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} From 7072586ff0f7ed787db51294ca086842d21bc6ba Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:20:54 +0000 Subject: [PATCH 28/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index d9a5547..3ec31fb 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -21,7 +21,7 @@ jobs: - name: Set version run: | TAG=${{ steps.find-latest-tag.outputs.tag }} - echo "find-latest-tag=${TAG#v}" >> $GITHUB_ENV + echo "VERSION=${TAG#v}" >> $GITHUB_ENV - name: Login to Docker Hub uses: docker/login-action@v3 with: @@ -30,12 +30,12 @@ jobs: - name: Build Docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${{ VERSION }} -f ./Dockerfile . 
- name: Build Docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${{ VERSION }} -f ./Dockerfile . - name: Push Docker images run: | - docker push haniffalab/webatlas-pipeline:${{ steps.find-latest-tag.outputs.tag }} - docker push haniffalab/webatlas-pipeline-build-config:${{ steps.find-latest-tag.outputs.tag }} + docker push haniffalab/webatlas-pipeline:${{ VERSION }} + docker push haniffalab/webatlas-pipeline-build-config:${{ VERSION }} From 8e0aa51a251273c4cc05dbfeb89f7deb18f19c33 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:23:12 +0000 Subject: [PATCH 29/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 3ec31fb..53344a6 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -30,12 +30,12 @@ jobs: - name: Build Docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${{ VERSION }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${ VERSION } -f ./Dockerfile . - name: Build Docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${{ VERSION }} -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${ VERSION } -f ./Dockerfile . - name: Push Docker images run: | - docker push haniffalab/webatlas-pipeline:${{ VERSION }} - docker push haniffalab/webatlas-pipeline-build-config:${{ VERSION }} + docker push haniffalab/webatlas-pipeline:${ VERSION } + docker push haniffalab/webatlas-pipeline-build-config:${ VERSION } From 8f76e7d4a3f33443b1c503a3a7878fa9fb69a07b Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:24:57 +0000 Subject: [PATCH 30/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 53344a6..b9aae26 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -21,7 +21,9 @@ jobs: - name: Set version run: | TAG=${{ steps.find-latest-tag.outputs.tag }} - echo "VERSION=${TAG#v}" >> $GITHUB_ENV + VERSION=${TAG#v} + echo Version: $VERSION + echo "VERSION=$VERSION" >> $GITHUB_ENV - name: Login to Docker Hub uses: docker/login-action@v3 with: From c702e6c9a9d0b601c20ef42d7df6ba9574aac3f2 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:26:23 +0000 Subject: [PATCH 31/46] test workflow for #120 --- .github/workflows/docker-builds.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b9aae26..1a0478f 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -32,12 +32,12 @@ jobs: - name: Build Docker (webatlas-pipeline) working-directory: ./envs run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${ VERSION } -f ./Dockerfile . 
+ docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${VERSION} -f ./Dockerfile . - name: Build Docker (webatlas-pipeline-build-config) working-directory: ./envs/build_config run: | - docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${ VERSION } -f ./Dockerfile . + docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${VERSION} -f ./Dockerfile . - name: Push Docker images run: | - docker push haniffalab/webatlas-pipeline:${ VERSION } - docker push haniffalab/webatlas-pipeline-build-config:${ VERSION } + docker push haniffalab/webatlas-pipeline:${VERSION} + docker push haniffalab/webatlas-pipeline-build-config:${VERSION} From afb8ecf011203f143405ab1a99f1daff577b04bf Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:33:46 +0000 Subject: [PATCH 32/46] close #120 --- .github/workflows/docker-builds.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 1a0478f..e60d636 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -2,9 +2,7 @@ name: docker-builds on: push: - branches: [dev] - pull_request: - branches: [dev] + tags: ["v*"] jobs: build: @@ -20,7 +18,8 @@ jobs: id: find-latest-tag - name: Set version run: | - TAG=${{ steps.find-latest-tag.outputs.tag }} + # TAG=${{ steps.find-latest-tag.outputs.tag }} + TAG=${GITHUB_REF#refs/*/} VERSION=${TAG#v} echo Version: $VERSION echo "VERSION=$VERSION" >> $GITHUB_ENV From 382dd620e32fa7f03f3633b730c7f2e231c5df72 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:42:39 +0000 Subject: [PATCH 33/46] fix issue with docker build --- envs/build-docker-imgs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/envs/build-docker-imgs.sh b/envs/build-docker-imgs.sh index 3f6e917..34b711f 100755 --- a/envs/build-docker-imgs.sh +++ b/envs/build-docker-imgs.sh @@ -2,4 +2,5 @@ VERSION=0.4.0 docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${VERSION} -f ./Dockerfile . -docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build_config:${VERSION} -f ./build_config/Dockerfile . +cd build_config/ +docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build_config:${VERSION} -f ./Dockerfile . From 69b447321b044fcc7d62cfa78831150a1ed22b67 Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:46:43 +0000 Subject: [PATCH 34/46] bump version --- docs/citing.html | 4 ++-- envs/build-docker-imgs.sh | 2 +- main.nf | 2 +- multimodal.nf | 2 +- nextflow.config | 4 ++-- sphinx/citing.rst | 2 +- sphinx/conf.py | 2 +- sphinx/examples/visium.rst | 4 ++-- sphinx/examples/xenium.rst | 4 ++-- sphinx/installation.rst | 24 ++++++++++++------------ 10 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/citing.html b/docs/citing.html index 81b4d19..a62b4e2 100644 --- a/docs/citing.html +++ b/docs/citing.html @@ -4,7 +4,7 @@ - Citation — WebAtlas 0.4.0 documentation + Citation — WebAtlas 0.4.1 documentation @@ -107,7 +107,7 @@

[docs/citing.html body hunk — HTML markup lost in extraction; the recoverable page text is the "Citation" heading, the DOI badge, and the line "If you use this software in a scientific publication, please cite using the following Zenodo reference." The hunk updates the citation:]
-Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.4.0). Zenodo. https://doi.org/10.5281/zenodo.7863308
+Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.4.1). Zenodo. https://doi.org/10.5281/zenodo.7863308
diff --git a/envs/build-docker-imgs.sh b/envs/build-docker-imgs.sh index 34b711f..0a53fc6 100755 --- a/envs/build-docker-imgs.sh +++ b/envs/build-docker-imgs.sh @@ -1,5 +1,5 @@ #! /bin/sh -VERSION=0.4.0 +VERSION=0.4.1 docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${VERSION} -f ./Dockerfile . cd build_config/ diff --git a/main.nf b/main.nf index db09feb..54c3f61 100644 --- a/main.nf +++ b/main.nf @@ -5,7 +5,7 @@ import groovy.json.* nextflow.enable.dsl=2 verbose_log = true -version = "0.4.0" +version = "0.4.1" ////////////////////////////////////////////////////// diff --git a/multimodal.nf b/multimodal.nf index 76d1561..dc99924 100644 --- a/multimodal.nf +++ b/multimodal.nf @@ -11,7 +11,7 @@ params.outdir = "" params.copy_raw = true params.description = "" -version="0.4.0" +version="0.4.1" verbose_log=true outdir_with_version = "${params.outdir.replaceFirst(/\/*$/, "")}\/${version}" diff --git a/nextflow.config b/nextflow.config index 0312339..534983c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,9 +13,9 @@ profiles { docker { docker.enabled = true process { - container = 'haniffalab/webatlas-pipeline:0.4.0' + container = 'haniffalab/webatlas-pipeline:0.4.1' withLabel: build_config { - container = 'haniffalab/webatlas-pipeline-build_config:0.4.0' + container = 'haniffalab/webatlas-pipeline-build_config:0.4.1' } } } diff --git a/sphinx/citing.rst b/sphinx/citing.rst index 2253110..e0404cd 100644 --- a/sphinx/citing.rst +++ b/sphinx/citing.rst @@ -11,4 +11,4 @@ Citation If you use this software in a scientific publication, please cite using the following Zenodo reference. -**Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.4.0). Zenodo. https://doi.org/10.5281/zenodo.7863308** +**Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.4.1). Zenodo. https://doi.org/10.5281/zenodo.7863308** diff --git a/sphinx/conf.py b/sphinx/conf.py index b5d58d8..55f8950 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -23,7 +23,7 @@ author = "Haniffa Lab" # The full version, including alpha/beta/rc tags -release = "0.4.0" +release = "0.4.1" # -- General configuration --------------------------------------------------- diff --git a/sphinx/examples/visium.rst b/sphinx/examples/visium.rst index c6fce7b..1f0daac 100644 --- a/sphinx/examples/visium.rst +++ b/sphinx/examples/visium.rst @@ -99,7 +99,7 @@ verify the expected directories are created. .. code-block:: shell :caption: Input - ls -l output/CytAssist_FFPE_Human_Breast_Cancer/0.4.0 + ls -l output/CytAssist_FFPE_Human_Breast_Cancer/0.4.1 .. code-block:: shell :caption: Output @@ -121,7 +121,7 @@ at http://localhost:3000, and that CORS is enabled via the Access-Control-Allow- .. code-block:: shell :caption: Input - npx http-server output/CytAssist_FFPE_Human_Breast_Cancer/0.4.0 --port 3000 --cors + npx http-server output/CytAssist_FFPE_Human_Breast_Cancer/0.4.1 --port 3000 --cors .. code-block:: shell :caption: Output diff --git a/sphinx/examples/xenium.rst b/sphinx/examples/xenium.rst index d2bcbd0..c9ee248 100644 --- a/sphinx/examples/xenium.rst +++ b/sphinx/examples/xenium.rst @@ -102,7 +102,7 @@ verify the expected directories are created. .. code-block:: shell :caption: Input - ls -l output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.0 + ls -l output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.1 .. 
code-block:: shell :caption: Output @@ -123,7 +123,7 @@ at http://localhost:3000, and that CORS is enabled via the Access-Control-Allow- .. code-block:: shell :caption: Input - npx http-server output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.0 --port 3000 --cors + npx http-server output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.1 --port 3000 --cors .. code-block:: shell :caption: Output diff --git a/sphinx/installation.rst b/sphinx/installation.rst index 068796a..2af9301 100644 --- a/sphinx/installation.rst +++ b/sphinx/installation.rst @@ -14,7 +14,7 @@ Download the WebAtlas Pipeline release. You can look for previous `releases on G .. code-block:: shell :caption: Input - wget https://github.com/haniffalab/webatlas-pipeline/archive/refs/tags/v0.4.0.tar.gz + wget https://github.com/haniffalab/webatlas-pipeline/archive/refs/tags/v0.4.1.tar.gz .. code-block:: shell :caption: Expected Output @@ -22,35 +22,35 @@ Download the WebAtlas Pipeline release. You can look for previous `releases on G Resolving github.com (github.com)... 140.82.121.3 Connecting to github.com (github.com)|140.82.121.3|:443... connected. HTTP request sent, awaiting response... 302 Found - Location: https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.0 [following] - --2023-05-18 09:30:15-- https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.0 + Location: https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.1 [following] + --2023-05-18 09:30:15-- https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.1 Resolving codeload.github.com (codeload.github.com)... 140.82.121.9 Connecting to codeload.github.com (codeload.github.com)|140.82.121.9|:443... connected. HTTP request sent, awaiting response... 200 OK Length: unspecified [application/x-gzip] - Saving to: ‘v0.4.0.tar.gz’ + Saving to: ‘v0.4.1.tar.gz’ - v0.4.0.tar.gz [ <=> ] 2.70M 9.12MB/s in 0.3s + v0.4.1.tar.gz [ <=> ] 2.70M 9.12MB/s in 0.3s - 2023-05-18 09:30:16 (9.12 MB/s) - ‘v0.4.0.tar.gz’ saved [2835534] + 2023-05-18 09:30:16 (9.12 MB/s) - ‘v0.4.1.tar.gz’ saved [2835534] Extract the WebAtlas compressed tag and change directory into the new repo. .. code-block:: shell :caption: Input - tar -xzvf ./v0.4.0.tar.gz - cd webatlas-pipeline-0.4.0 + tar -xzvf ./v0.4.1.tar.gz + cd webatlas-pipeline-0.4.1 .. code-block:: shell :caption: Expected Output - webatlas-pipeline-0.4.0/ - webatlas-pipeline-0.4.0/.github/ + webatlas-pipeline-0.4.1/ + webatlas-pipeline-0.4.1/.github/ ... ... - webatlas-pipeline-0.4.0/tests/input/simple_config.json - webatlas-pipeline-0.4.0/tests/test_class.py + webatlas-pipeline-0.4.1/tests/input/simple_config.json + webatlas-pipeline-0.4.1/tests/test_class.py .. _environment: From b2d542dcd891101db71c5a6c252fe2b2566789cd Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Thu, 9 Nov 2023 14:47:52 +0000 Subject: [PATCH 35/46] update docker image name --- envs/build-docker-imgs.sh | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/envs/build-docker-imgs.sh b/envs/build-docker-imgs.sh index 0a53fc6..ccfeb7c 100755 --- a/envs/build-docker-imgs.sh +++ b/envs/build-docker-imgs.sh @@ -3,4 +3,4 @@ VERSION=0.4.1 docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${VERSION} -f ./Dockerfile . cd build_config/ -docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build_config:${VERSION} -f ./Dockerfile . 
+docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline-build-config:${VERSION} -f ./Dockerfile . diff --git a/nextflow.config b/nextflow.config index 534983c..188fd4c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ profiles { process { container = 'haniffalab/webatlas-pipeline:0.4.1' withLabel: build_config { - container = 'haniffalab/webatlas-pipeline-build_config:0.4.1' + container = 'haniffalab/webatlas-pipeline-build-config:0.4.1' } } } From 30575b89f45117f3e6761cb1c2f5390523d9a19b Mon Sep 17 00:00:00 2001 From: dannda Date: Mon, 20 Nov 2023 16:11:54 +0000 Subject: [PATCH 36/46] add option for extend_feature args in multimodal yaml config add cell2location filtering by obs --- bin/integrate_anndata.py | 28 ++++++++++++++++---------- multimodal.nf | 32 +++++++++++++++++++++++++++--- templates/multimodal-template.yaml | 5 ++++- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/bin/integrate_anndata.py b/bin/integrate_anndata.py index 88188ce..0ab4eda 100755 --- a/bin/integrate_anndata.py +++ b/bin/integrate_anndata.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from typing import Union +import typing as T import os import fire import zarr @@ -13,17 +14,19 @@ from pathlib import Path -def reindex_and_concat(path: str, offset: int, features: str = None, **kwargs): +def reindex_and_concat( + path: str, offset: int, features: str = None, args: dict[str, T.Any] = {}, **kwargs +): adata = read_anndata(path) - adata = reindex_anndata(adata, offset, no_save=True) + adata = reindex_anndata(adata, offset, **args, **kwargs) if features: - adata = concat_features(adata, features, no_save=True) + adata = concat_features(adata, features, **args, **kwargs) out_filename = "reindexed-concat-{}".format( os.path.splitext(os.path.basename(path))[0] ) - write_anndata(adata, out_filename, **kwargs) + write_anndata(adata, out_filename, **args, **kwargs) return @@ -31,7 +34,7 @@ def reindex_and_concat(path: str, offset: int, features: str = None, **kwargs): def reindex_anndata( data: Union[ad.AnnData, str], offset: int, - no_save: bool = False, + no_save: bool = True, out_filename: str = None, **kwargs, ): @@ -55,7 +58,7 @@ def reindex_anndata( def concat_features( data: Union[ad.AnnData, str], features: str, - no_save: bool = False, + no_save: bool = True, out_filename: str = None, **kwargs, ): @@ -68,11 +71,11 @@ def concat_features( ) if features.endswith(".h5ad") and os.path.isfile(features): - adata = concat_matrix_from_cell2location(adata, features) + adata = concat_matrix_from_cell2location(adata, features, **kwargs) elif features.startswith("obs/"): - adata = concat_matrix_from_obs(adata, features.split("/")[1]) + adata = concat_matrix_from_obs(adata, features.split("/")[1], **kwargs) elif features.startswith("obsm/"): - adata = concat_matrix_from_obsm(adata, features.split("/")[1]) + adata = concat_matrix_from_obsm(adata, features.split("/")[1], **kwargs) if no_save: return adata @@ -134,9 +137,10 @@ def concat_matrix_from_cell2location( data: Union[ad.AnnData, str], c2l_file: str, q: str = "q05_cell_abundance_w_sf", - sample: str = None, + sample: tuple[str, str] = None, feature_name: str = "gene", obs_feature_name: str = None, + **kwargs, ): if isinstance(data, ad.AnnData): adata = data @@ -162,7 +166,9 @@ def concat_matrix_from_cell2location( dtype="float32", ) - return concat_matrices(adata, c2l_df, "celltype", feature_name, obs_feature_name) + return concat_matrices( + adata, c2l_df, "celltype", feature_name, obs_feature_name, **kwargs + ) def 
concat_matrices( diff --git a/multimodal.nf b/multimodal.nf index dc99924..a5baab0 100644 --- a/multimodal.nf +++ b/multimodal.nf @@ -33,6 +33,30 @@ def copyFile (inputFile, outdir) { return inputFile } +def parseExtendFeature (extend_feature){ + if (!extend_feature){ + return [ file("NO_FT"), [:] ] + } + else if (extend_feature instanceof String ){ + return [ + file(extend_feature.endsWith(".h5ad") ? extend_feature : "NO_FT"), + [:] + ] + } + else if (extend_feature instanceof Map){ + if (extend_feature["path"]){ + if (!(extend_feature.path instanceof String && extend_feature.path.endsWith(".h5ad"))){ + error "Invalid value for `extend_feature.path`. Expecting an .h5ad file." + } + return [ file(extend_feature.path), extend_feature.args ?: [:] ] + } + else { + error "Invalid map for `extend_feature`. Expecting key `path`." + } + } + error "Invalid value for `extend_feature`" +} + process process_label { tag "${label_image}" debug verbose_log @@ -64,7 +88,7 @@ process process_anndata { publishDir outdir_with_version, mode:"copy" input: - tuple val(dataset), path(anndata), val(offset), val(features), path(features_file) + tuple val(dataset), path(anndata), val(offset), val(features), path(features_file), val(features_args) output: tuple val(dataset), path("*") @@ -73,11 +97,13 @@ process process_anndata { features_str = features ? "--features ${features_file.name != 'NO_FT' ? features_file : features}" : "" + args_str = features_file.name != 'NO_FT' && features_args + ? "--args '" + new JsonBuilder(features_args).toString() + "'" : "" """ integrate_anndata.py reindex_and_concat \ --path ${anndata} \ --offset ${offset} \ - ${features_str} + ${features_str} ${args_str} """ } @@ -140,7 +166,7 @@ workflow { raws : [it.dataset, it.raw_image] // not processed but necessary for writing config labels : [it.dataset, it.label_image, it.offset] adatas : [it.dataset, file(it.anndata), it.offset, it.extend_feature, - file(it.extend_feature && it.extend_feature.endsWith(".h5ad") ? 
it.extend_feature : "NO_FT") + *parseExtendFeature(it.extend_feature) ] } .set{data} diff --git a/templates/multimodal-template.yaml b/templates/multimodal-template.yaml index 9fc3b48..b8a07b7 100644 --- a/templates/multimodal-template.yaml +++ b/templates/multimodal-template.yaml @@ -33,7 +33,10 @@ data: is_spatial: true raw_image: test-visium-raw.zarr label_image: test-visium-label.zarr - extend_feature: /path/to/c2l_output.h5ad # obsm/celltype or obsm/q05_cell_abundance_w_sf + extend_feature: + path: /path/to/c2l_output.h5ad + args: + sample: ["library_id", "sample_1"] vitessce_options: spatial: xy: "obsm/spatial" From bdc0731f4ffb1fcf2e034803f22d8c9ac349a60b Mon Sep 17 00:00:00 2001 From: dannda Date: Tue, 21 Nov 2023 12:05:08 +0000 Subject: [PATCH 37/46] add c2l output default sorting option --- bin/integrate_anndata.py | 18 ++++++++++++++++++ bin/process_h5ad.py | 3 +++ 2 files changed, 21 insertions(+) diff --git a/bin/integrate_anndata.py b/bin/integrate_anndata.py index 0ab4eda..aef4be7 100755 --- a/bin/integrate_anndata.py +++ b/bin/integrate_anndata.py @@ -140,8 +140,11 @@ def concat_matrix_from_cell2location( sample: tuple[str, str] = None, feature_name: str = "gene", obs_feature_name: str = None, + sort: bool = True, + sort_index: str = None, **kwargs, ): + sort = sort or sort_index is not None if isinstance(data, ad.AnnData): adata = data else: @@ -157,6 +160,21 @@ def concat_matrix_from_cell2location( if sample: c2l_adata = c2l_adata[c2l_adata.obs[sample[0]] == sample[1]] + if sort: + if not sort_index and adata.uns.get("webatlas_reindexed"): + sort_index = "label_id" + if sort_index: + idx = c2l_adata.obs.index.get_indexer(adata.obs[sort_index].tolist()) + else: + idx = c2l_adata.obs.index.get_indexer(adata.obs.index.tolist()) + if -1 in idx: + raise SystemError( + "Values do not match between AnnData object's" + f" `{sort_index or 'index'}`" + " and cell2location output index." + ) + c2l_adata = c2l_adata[idx,] + c2l_df = pd.DataFrame( c2l_adata.obsm[q].to_numpy(), index=c2l_adata.obs.index, diff --git a/bin/process_h5ad.py b/bin/process_h5ad.py index 5a7ddc4..e525d38 100755 --- a/bin/process_h5ad.py +++ b/bin/process_h5ad.py @@ -125,10 +125,13 @@ def reindex_anndata_obs(adata: ad.AnnData) -> ad.AnnData: adata.obs.index.is_object() and all(adata.obs.index.str.isnumeric()) ): IDX_NAME = "label_id" + if IDX_NAME in adata.obs: + adata.obs.rename(columns={IDX_NAME: f"_{IDX_NAME}"}) adata.obs = adata.obs.reset_index(names=IDX_NAME) adata.obs.index = ( pd.Categorical(adata.obs[IDX_NAME]).codes + 1 ) # avoid 0's for possible label images + adata.uns["webatlas_reindexed"] = True adata.obs.index = adata.obs.index.astype(str) return adata From 22c538e1c4bb7bd666ce62c5620d3dc9da4d46c9 Mon Sep 17 00:00:00 2001 From: dannda Date: Tue, 21 Nov 2023 13:28:38 +0000 Subject: [PATCH 38/46] add 'extend_feature' args to multimodal docs --- sphinx/multimodal/configuration.rst | 38 ++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/sphinx/multimodal/configuration.rst b/sphinx/multimodal/configuration.rst index 47e3054..dd45ed8 100644 --- a/sphinx/multimodal/configuration.rst +++ b/sphinx/multimodal/configuration.rst @@ -134,4 +134,40 @@ The full parameters file will then look like this xy: obsm/spatial matrix: X -With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. 
\ No newline at end of file
+With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal.
+
+In the case of providing a *cell2location* output file, you can further configure ``extend_feature`` with arguments for how the file should be processed.
+Instead of only setting the path to the file you would need to define ``extend_feature`` as a map containing ``path`` and optional ``args``.
+
+.. code-block:: yaml
+
+    extend_feature_name: celltype
+    data:
+      -
+        dataset: visium
+        obs_type: spot
+        anndata: /path/to/main/output/visium-anndata.zarr
+        extend_feature:
+          path: /path/to/c2l.h5ad
+          args:
+            sample: ["library_id", "sample_1"] # tuple containing the obs column name and value to filter the object. By default the object is not filtered.
+            q: "q05_cell_abundance_w_sf" # matrix in obsm to use. Defaults to "q05_cell_abundance_w_sf".
+            sort_index: "index_column" # column in the AnnData object that contains an index that matches the index in cell2location.
+            sort: True # can be set to False to skip ordering the cell2location matrix but observations might not match in order between files. Defaults to True.
+
+For example, ``sample`` can be used when a *cell2location* output file contains predictions for multiple samples.
+Setting ``sample`` to filter the output file enables the pipeline to obtain the appropriate prediction matrix for the data being processed,
+without having to split the *cell2location* output file for each sample. Otherwise, if a file with predictions for multiple samples is input,
+it will not match the number of observations of the AnnData object and the process will throw an error.
+
+``q`` can be set to use a different prediction matrix from the *cell2location* output file.
+It defaults to ``"q05_cell_abundance_w_sf"``.
+
+``sort`` and ``sort_index`` can be used to define how a *cell2location* output file matches the AnnData object.
+By default the pipeline will try to ensure the order of observations between the prediction matrix and AnnData object match
+so values are correctly concatenated.
+The pipeline will attempt to order the prediction matrix given the index of the AnnData object
+(or the original index if the main pipeline re-indexed it).
+However you can override the observations column of the AnnData object that contains the index that the prediction matrix should match.
+``sort`` can be set to ``False`` to disable any re-ordering. If disabled, the prediction matrix would be concatenated as-is into the AnnData object
+without checking if observations' IDs match.

From cd05e0d2f772eacaa21d2d51e946ec5840bfbaf2 Mon Sep 17 00:00:00 2001
From: dannda
Date: Tue, 7 Nov 2023 14:54:25 +0000
Subject: [PATCH 39/46] add documentation on multimodal pipeline

---
 bin/build_config_multimodal.py      |  12 ++-
 sphinx/index.rst                    |   1 +
 sphinx/multimodal_configuration.rst | 149 ++++++++++++++++++++++++++++
 sphinx/run.rst                      |  14 +++
 4 files changed, 172 insertions(+), 4 deletions(-)
 create mode 100644 sphinx/multimodal_configuration.rst

diff --git a/bin/build_config_multimodal.py b/bin/build_config_multimodal.py
index 57a17b5..16b6be7 100755
--- a/bin/build_config_multimodal.py
+++ b/bin/build_config_multimodal.py
@@ -38,10 +38,14 @@ def write_json(
     Args:
         project (str, optional): Project name. Defaults to "".
         datasets (dict[str, dict[str]], optional): Dictionary of datasets.
- Expected structure: { dataset_name: { "file_paths" : [], - "images": {"raw": [], "label": []}, - "options": {}, "obs_type": "cell", - "is_spatial": True } } + Expected structure: { dataset_name: { + "file_paths" : [], + "images": {"raw": [], "label": []}, + "options": {}, + "obs_type": "cell", + "is_spatial": True + } + } Defaults to {}. extended_features (Union[list[str], str], optional): List of features or string of single feature on which the expression matrix was extended diff --git a/sphinx/index.rst b/sphinx/index.rst index d94a655..21d7f84 100644 --- a/sphinx/index.rst +++ b/sphinx/index.rst @@ -26,6 +26,7 @@ The pipeline generates data files for `supported data types`_, and builds a `vie installation configuration + multimodal_configuration run visualise Demos diff --git a/sphinx/multimodal_configuration.rst b/sphinx/multimodal_configuration.rst new file mode 100644 index 0000000..3e905a4 --- /dev/null +++ b/sphinx/multimodal_configuration.rst @@ -0,0 +1,149 @@ +.. _multimodal_configuration: + +######################################## +Configuration for multimodal integration +######################################## + +The WebAtlas pipeline can process a group of datasets that share common features to be visualised together. +This requires a YAML parameters file similar to :ref:`the one used to run the main pipeline. `. +The outputs generated by running the main conversion pipeline serve as inputs for this multimodal integration pipeline. + +The multimodal integration pipeline performs several tasks: + +1. Reindex each dataset by a user-inputed offset so ID's do not clash between them. +2. *Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories. +3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations.) + **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. + +After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline + + +.. _multimodal_parameters_file: + +*************** +Parameters file +*************** + +The parameters file looks like this: + +.. code-block:: yaml + + outdir: "/path/to/output/" + + url: http://localhost:3000/ + project: my_project + title: "My Project" + + data: + - + dataset: scrnaseq + obs_type: cell + anndata: /path/to/main/output/scrnaseq-anndata.zarr + offset: 0 + is_spatial: false + vitessce_options: + spatial: + xy: obsm/spatial + mappings: + obsm/X_umap: [0,1] + matrix: X + - + dataset: visium + obs_type: spot + anndata: /path/to/main/output/visium-anndata.zarr + offset: 1000000 + is_spatial: true + raw_image: /path/to/main/output/visium-raw.zarr + label_image: /path/to/main/output/visium-label.zarr + vitessce_options: + spatial: + xy: obsm/spatial + matrix: X + +In contrast to the main conversion pipeline's parameters file, this file includes a single `project` to which multiple `datasets` belong to. + +Each ``dataset`` block defines the name of the dataset and paths to the converted data and image files (if any). 
+ +Each ``dataset`` also requires a set of ``vitessce_options`` that specify the location of certain data (spatial coordinates, embeddings, expression matrix, etc.) within the AnnData object that is processed/generated. +This follows the same structure as in the :ref:`main conversion's vitessce_options `. + +Additionally, each ``dataset`` requires: + +* ``obs_type``, the type of observation of the dataset. For example, "cell" or "spot". +* ``offset``, an integer offset to add to the dataset's ID's so they don't clash with the other datasets. +* ``is_spatial``, whether the dataset contains spatial information and has associated image files (raw and/or label images) + +Given that raw images are only read but not modified the pipeline does not generate new output from them. +In order for the output directory (defined by ``outdir``) to contain all necessary files that need to be served for the web application to consume, +by default, the pipeline copies the raw images to the output directory. +This process can take a long time depending on the size of the image. +You may want to manually copy or move the image or serve it from its own directory separate from the rest of the output. +The default copying can be disabled by setting ``copy_raw: false`` as a project-wide parameter (at the same level as ``outdir``, ``project``, etc). +For example, + +.. code-block:: yaml + + outdir: "/path/to/output/" + url: http://localhost:3000/ + project: my_project + title: "My Project" + copy_raw: false + + +With additional features +======================== + +Using the above example parameters file to run the multimodal integration pipeline will run the reindexing and intersection steps. +To perform the concatenation of additional features (like celltypes) to visualise them as continuous values, some extra parameters need to be added. + +As a project-wide parameter (at the same level as ``outdir``, ``project``, etc.): + +* ``extend_feature_name``, the name of the additional feature. For example, "celltype" + +And at a ``dataset`` level: + +* ``extend_feature``, the location of the additional feature information. + This can be either the path to a *cell2location* output file, or the location within the AnnData object where the feature is stored as a categorical within ``obs``. + For example, ``/path/to/c2l.h5ad`` containing predicted continuous values, or ``obs/celltype`` containing categoricals. + +The full parameters file will then look like this + +.. code-block:: yaml + + outdir: "/path/to/output/" + + url: http://localhost:3000/ + project: my_project + title: "My Project" + + extend_feature_name: celltype + + data: + - + dataset: scrnaseq + obs_type: cell + anndata: /path/to/main/output/scrnaseq-anndata.zarr + extend_feature: obs/celltype + offset: 0 + is_spatial: false + vitessce_options: + spatial: + xy: obsm/spatial + mappings: + obsm/X_umap: [0,1] + matrix: X + - + dataset: visium + obs_type: spot + anndata: /path/to/main/output/visium-anndata.zarr + extend_feature: /path/to/c2l.h5ad + offset: 1000000 + is_spatial: true + raw_image: /path/to/main/output/visium-raw.zarr + label_image: /path/to/main/output/visium-label.zarr + vitessce_options: + spatial: + xy: obsm/spatial + matrix: X + +With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. 
\ No newline at end of file diff --git a/sphinx/run.rst b/sphinx/run.rst index cbab333..0813262 100644 --- a/sphinx/run.rst +++ b/sphinx/run.rst @@ -32,6 +32,20 @@ defined. You can modify the entry point if you're interested in only getting the converted outputs. Use ``-entry Process_files`` or ``-entry Process_images`` as you need. +Multimodal integration +---------------------- + +Additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features to be able to visualise and query across all of them in a single portal. +This pipeline can also process extra features (e.g. celltypes) to visualise them across datasets in addition to their expression matrices. + +Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline.) + +To run this pipeline use + +.. code-block:: shell + + nextflow run multimodal.nf -params-file /path/to/multimodal-params.yaml + Running using Docker -------------------- From 8f6fad8859979a29df05fe6a9ff8e3e1583df47b Mon Sep 17 00:00:00 2001 From: Dave Horsfall Date: Tue, 7 Nov 2023 16:45:56 +0000 Subject: [PATCH 40/46] restructure multimodal docs --- sphinx/index.rst | 1 - sphinx/multimodal_configuration.rst | 149 ---------------------------- sphinx/run.rst | 14 --- 3 files changed, 164 deletions(-) delete mode 100644 sphinx/multimodal_configuration.rst diff --git a/sphinx/index.rst b/sphinx/index.rst index 21d7f84..d94a655 100644 --- a/sphinx/index.rst +++ b/sphinx/index.rst @@ -26,7 +26,6 @@ The pipeline generates data files for `supported data types`_, and builds a `vie installation configuration - multimodal_configuration run visualise Demos diff --git a/sphinx/multimodal_configuration.rst b/sphinx/multimodal_configuration.rst deleted file mode 100644 index 3e905a4..0000000 --- a/sphinx/multimodal_configuration.rst +++ /dev/null @@ -1,149 +0,0 @@ -.. _multimodal_configuration: - -######################################## -Configuration for multimodal integration -######################################## - -The WebAtlas pipeline can process a group of datasets that share common features to be visualised together. -This requires a YAML parameters file similar to :ref:`the one used to run the main pipeline. `. -The outputs generated by running the main conversion pipeline serve as inputs for this multimodal integration pipeline. - -The multimodal integration pipeline performs several tasks: - -1. Reindex each dataset by a user-inputed offset so ID's do not clash between them. -2. *Optionally*, concatenate other observation-by-feature matrices or categorical values to the expression matrix to enable their visualisation as continuous values. For example, a celltype prediction matrix and/or celltype categories. -3. Find the intersection of features between all datasets and subset them to visualise only the intersection (as including features not present in all datasets can produce misleading visualisations.) - **Note** the features are intersected using their index in the AnnData objects (``var`` table). All datasets must use the same type of data as index for the intersection to be correctly computed. For example, all datasets use names as index, or all datasets use IDs as index. - -After running the main conversion pipeline you can populate the required YAML parameters file to run the multimodal integration pipeline - - -.. 
_multimodal_parameters_file: - -*************** -Parameters file -*************** - -The parameters file looks like this: - -.. code-block:: yaml - - outdir: "/path/to/output/" - - url: http://localhost:3000/ - project: my_project - title: "My Project" - - data: - - - dataset: scrnaseq - obs_type: cell - anndata: /path/to/main/output/scrnaseq-anndata.zarr - offset: 0 - is_spatial: false - vitessce_options: - spatial: - xy: obsm/spatial - mappings: - obsm/X_umap: [0,1] - matrix: X - - - dataset: visium - obs_type: spot - anndata: /path/to/main/output/visium-anndata.zarr - offset: 1000000 - is_spatial: true - raw_image: /path/to/main/output/visium-raw.zarr - label_image: /path/to/main/output/visium-label.zarr - vitessce_options: - spatial: - xy: obsm/spatial - matrix: X - -In contrast to the main conversion pipeline's parameters file, this file includes a single `project` to which multiple `datasets` belong to. - -Each ``dataset`` block defines the name of the dataset and paths to the converted data and image files (if any). - -Each ``dataset`` also requires a set of ``vitessce_options`` that specify the location of certain data (spatial coordinates, embeddings, expression matrix, etc.) within the AnnData object that is processed/generated. -This follows the same structure as in the :ref:`main conversion's vitessce_options `. - -Additionally, each ``dataset`` requires: - -* ``obs_type``, the type of observation of the dataset. For example, "cell" or "spot". -* ``offset``, an integer offset to add to the dataset's ID's so they don't clash with the other datasets. -* ``is_spatial``, whether the dataset contains spatial information and has associated image files (raw and/or label images) - -Given that raw images are only read but not modified the pipeline does not generate new output from them. -In order for the output directory (defined by ``outdir``) to contain all necessary files that need to be served for the web application to consume, -by default, the pipeline copies the raw images to the output directory. -This process can take a long time depending on the size of the image. -You may want to manually copy or move the image or serve it from its own directory separate from the rest of the output. -The default copying can be disabled by setting ``copy_raw: false`` as a project-wide parameter (at the same level as ``outdir``, ``project``, etc). -For example, - -.. code-block:: yaml - - outdir: "/path/to/output/" - url: http://localhost:3000/ - project: my_project - title: "My Project" - copy_raw: false - - -With additional features -======================== - -Using the above example parameters file to run the multimodal integration pipeline will run the reindexing and intersection steps. -To perform the concatenation of additional features (like celltypes) to visualise them as continuous values, some extra parameters need to be added. - -As a project-wide parameter (at the same level as ``outdir``, ``project``, etc.): - -* ``extend_feature_name``, the name of the additional feature. For example, "celltype" - -And at a ``dataset`` level: - -* ``extend_feature``, the location of the additional feature information. - This can be either the path to a *cell2location* output file, or the location within the AnnData object where the feature is stored as a categorical within ``obs``. - For example, ``/path/to/c2l.h5ad`` containing predicted continuous values, or ``obs/celltype`` containing categoricals. - -The full parameters file will then look like this - -.. 
code-block:: yaml - - outdir: "/path/to/output/" - - url: http://localhost:3000/ - project: my_project - title: "My Project" - - extend_feature_name: celltype - - data: - - - dataset: scrnaseq - obs_type: cell - anndata: /path/to/main/output/scrnaseq-anndata.zarr - extend_feature: obs/celltype - offset: 0 - is_spatial: false - vitessce_options: - spatial: - xy: obsm/spatial - mappings: - obsm/X_umap: [0,1] - matrix: X - - - dataset: visium - obs_type: spot - anndata: /path/to/main/output/visium-anndata.zarr - extend_feature: /path/to/c2l.h5ad - offset: 1000000 - is_spatial: true - raw_image: /path/to/main/output/visium-raw.zarr - label_image: /path/to/main/output/visium-label.zarr - vitessce_options: - spatial: - xy: obsm/spatial - matrix: X - -With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. \ No newline at end of file diff --git a/sphinx/run.rst b/sphinx/run.rst index 0813262..cbab333 100644 --- a/sphinx/run.rst +++ b/sphinx/run.rst @@ -32,20 +32,6 @@ defined. You can modify the entry point if you're interested in only getting the converted outputs. Use ``-entry Process_files`` or ``-entry Process_images`` as you need. -Multimodal integration ----------------------- - -Additional to the main conversion pipeline, we offer a subsequent pipeline to process multiple datasets with matching features to be able to visualise and query across all of them in a single portal. -This pipeline can also process extra features (e.g. celltypes) to visualise them across datasets in addition to their expression matrices. - -Configurations and data are input through a :ref:`parameters yaml file ` (slightly different from the parameters file required by the main pipeline.) - -To run this pipeline use - -.. code-block:: shell - - nextflow run multimodal.nf -params-file /path/to/multimodal-params.yaml - Running using Docker -------------------- From 870f4dd1905f95ac5cf92d7156e8be690d649392 Mon Sep 17 00:00:00 2001 From: dannda Date: Wed, 8 Nov 2023 11:24:50 +0000 Subject: [PATCH 41/46] add info about multimodal visualisation to docs --- bin/build_config_multimodal.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bin/build_config_multimodal.py b/bin/build_config_multimodal.py index 16b6be7..57a17b5 100755 --- a/bin/build_config_multimodal.py +++ b/bin/build_config_multimodal.py @@ -38,14 +38,10 @@ def write_json( Args: project (str, optional): Project name. Defaults to "". datasets (dict[str, dict[str]], optional): Dictionary of datasets. - Expected structure: { dataset_name: { - "file_paths" : [], - "images": {"raw": [], "label": []}, - "options": {}, - "obs_type": "cell", - "is_spatial": True - } - } + Expected structure: { dataset_name: { "file_paths" : [], + "images": {"raw": [], "label": []}, + "options": {}, "obs_type": "cell", + "is_spatial": True } } Defaults to {}. 
extended_features (Union[list[str], str], optional): List of features or string of single feature on which the expression matrix was extended From 604c54ebb4de2b834d7ed3e9494e43dd7d96442b Mon Sep 17 00:00:00 2001 From: dannda Date: Wed, 22 Nov 2023 10:03:54 +0000 Subject: [PATCH 42/46] fix docs --- sphinx/modules.rst | 5 ++++- sphinx/multimodal/configuration.rst | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sphinx/modules.rst b/sphinx/modules.rst index ddfe0f5..ab7050e 100644 --- a/sphinx/modules.rst +++ b/sphinx/modules.rst @@ -39,4 +39,7 @@ Modules :members: .. automodule:: build_config_multimodal - :members: \ No newline at end of file + :members: + +.. automodule:: write_spatialdata + :members: diff --git a/sphinx/multimodal/configuration.rst b/sphinx/multimodal/configuration.rst index dd45ed8..c345da1 100644 --- a/sphinx/multimodal/configuration.rst +++ b/sphinx/multimodal/configuration.rst @@ -134,7 +134,7 @@ The full parameters file will then look like this xy: obsm/spatial matrix: X -With this parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. +With these parameters the multimodal integration pipeline will concatenate the expression matrix with the additional feature values so both can be queried and visualised across datasets within the same portal. In the case of providing a *cell2location* output file, you can further configure ``extend_feature`` with arguments for how the file should be processed. Instead of only setting the path to the file you would need to define ``extend_feature`` as a map containing ``path`` and optional ``args``. From f8b54fcbe4593991b99a580c47ac7445b1fbba7c Mon Sep 17 00:00:00 2001 From: dannda Date: Tue, 12 Dec 2023 15:05:09 +0000 Subject: [PATCH 43/46] add attempt to match c2l indices with substrings --- bin/integrate_anndata.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/bin/integrate_anndata.py b/bin/integrate_anndata.py index aef4be7..e65d153 100755 --- a/bin/integrate_anndata.py +++ b/bin/integrate_anndata.py @@ -6,6 +6,7 @@ import fire import zarr import h5py +import logging import numpy as np import pandas as pd import anndata as ad @@ -164,15 +165,32 @@ def concat_matrix_from_cell2location( if not sort_index and adata.uns.get("webatlas_reindexed"): sort_index = "label_id" if sort_index: - idx = c2l_adata.obs.index.get_indexer(adata.obs[sort_index].tolist()) + data_idx = adata.obs[sort_index] else: - idx = c2l_adata.obs.index.get_indexer(adata.obs.index.tolist()) - if -1 in idx: - raise SystemError( + data_idx = adata.obs.index + idx = c2l_adata.obs.index.get_indexer(data_idx.tolist()) + if -1 in idx: # Indices do not match + logging.error( "Values do not match between AnnData object's" f" `{sort_index or 'index'}`" " and cell2location output index." ) + + logging.info("Attempting to match indices as substrings") + try: + data_idx = match_substring_indices(c2l_adata.obs.index, data_idx) + if not data_idx.is_unique: + raise Exception( + "Found non-unique matches between indices as substrings." + ) + idx = c2l_adata.obs.index.get_indexer(data_idx.tolist()) + if -1 in idx: + raise Exception("Non-matching indices present.") + except Exception: + raise SystemError( + "Failed to find a match between indices as substrings." 
+                )

         c2l_adata = c2l_adata[idx,]

     c2l_df = pd.DataFrame(
@@ -293,5 +311,11 @@ def write_anndata(
     return


+def match_substring_indices(fullstring_idx, substring_idx):
+    return pd.Series(substring_idx).apply(
+        lambda x: fullstring_idx[fullstring_idx.str.contains(x)].values[0]
+    )
+
+
 if __name__ == "__main__":
     fire.Fire()

From 5de458e6592c1dc2a479373f47a49b03f21b7707 Mon Sep 17 00:00:00 2001
From: dannda
Date: Tue, 12 Dec 2023 15:24:03 +0000
Subject: [PATCH 44/46] fix docs

---
 sphinx/multimodal/configuration.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sphinx/multimodal/configuration.rst b/sphinx/multimodal/configuration.rst
index c345da1..505a87a 100644
--- a/sphinx/multimodal/configuration.rst
+++ b/sphinx/multimodal/configuration.rst
@@ -168,6 +168,6 @@ By default the pipeline will try to ensure the order of observations between the
 so values are correctly concatenated.
 The pipeline will attempt to order the prediction matrix given the index of the AnnData object
 (or the original index if the main pipeline re-indexed it).
-However you can override the observations column of the AnnData object that contains the index that the prediction matrix should match.
+However, you can use ``sort_index`` to specify which observations column of the AnnData object contains the index that the prediction matrix should match.
 ``sort`` can be set to ``False`` to disable any re-ordering. If disabled, the prediction matrix would be concatenated as-is into the AnnData object
 without checking if observations' IDs match.

From cdec45113e5da19603f261b8a35260a79ceeff05 Mon Sep 17 00:00:00 2001
From: dannda
Date: Wed, 13 Dec 2023 14:15:25 +0000
Subject: [PATCH 45/46] patch preprocess_anndata

---
 bin/process_h5ad.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/process_h5ad.py b/bin/process_h5ad.py
index e525d38..a55eecd 100755
--- a/bin/process_h5ad.py
+++ b/bin/process_h5ad.py
@@ -169,6 +169,7 @@ def preprocess_anndata(
     var_index: str = None,
     obs_subset: tuple[str, T.Any] = None,
     var_subset: tuple[str, T.Any] = None,
+    **kwargs,
 ):
     """This function preprocesses an AnnData object, ensuring correct dtypes
     for zarr conversion

From 2d1f98a8483cb34b925c4d80a2f1780c184cfbcf Mon Sep 17 00:00:00 2001
From: Dave Horsfall
Date: Thu, 21 Dec 2023 15:05:44 +0000
Subject: [PATCH 46/46] prepare release 0.5.0

---
 envs/build-docker-imgs.sh  |  2 +-
 main.nf                    |  2 +-
 multimodal.nf              |  2 +-
 nextflow.config            |  4 ++--
 sphinx/citing.rst          |  2 +-
 sphinx/conf.py             |  2 +-
 sphinx/examples/visium.rst |  4 ++--
 sphinx/examples/xenium.rst |  4 ++--
 sphinx/installation.rst    | 24 ++++++++++++------------
 9 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/envs/build-docker-imgs.sh b/envs/build-docker-imgs.sh
index ccfeb7c..123fb5a 100755
--- a/envs/build-docker-imgs.sh
+++ b/envs/build-docker-imgs.sh
@@ -1,5 +1,5 @@
 #! /bin/sh
-VERSION=0.4.1
+VERSION=0.5.0
 docker build --platform=linux/amd64 -t haniffalab/webatlas-pipeline:${VERSION} -f ./Dockerfile .
cd build_config/ diff --git a/main.nf b/main.nf index 54c3f61..c7d949d 100644 --- a/main.nf +++ b/main.nf @@ -5,7 +5,7 @@ import groovy.json.* nextflow.enable.dsl=2 verbose_log = true -version = "0.4.1" +version = "0.5.0" ////////////////////////////////////////////////////// diff --git a/multimodal.nf b/multimodal.nf index a5baab0..5665ecd 100644 --- a/multimodal.nf +++ b/multimodal.nf @@ -11,7 +11,7 @@ params.outdir = "" params.copy_raw = true params.description = "" -version="0.4.1" +version="0.5.0" verbose_log=true outdir_with_version = "${params.outdir.replaceFirst(/\/*$/, "")}\/${version}" diff --git a/nextflow.config b/nextflow.config index 188fd4c..d551df0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,9 +13,9 @@ profiles { docker { docker.enabled = true process { - container = 'haniffalab/webatlas-pipeline:0.4.1' + container = 'haniffalab/webatlas-pipeline:0.5.0' withLabel: build_config { - container = 'haniffalab/webatlas-pipeline-build-config:0.4.1' + container = 'haniffalab/webatlas-pipeline-build-config:0.5.0' } } } diff --git a/sphinx/citing.rst b/sphinx/citing.rst index e0404cd..7d34c07 100644 --- a/sphinx/citing.rst +++ b/sphinx/citing.rst @@ -11,4 +11,4 @@ Citation If you use this software in a scientific publication, please cite using the following Zenodo reference. -**Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.4.1). Zenodo. https://doi.org/10.5281/zenodo.7863308** +**Li, Tong, Horsfall, David, Basurto-Lozada, Daniela, Prete, Martin, Jessica, Cox, & Iolo, Squires. (2023). WebAtlas Pipeline (v0.5.0). Zenodo. https://doi.org/10.5281/zenodo.7863308** diff --git a/sphinx/conf.py b/sphinx/conf.py index 55f8950..73b16f5 100644 --- a/sphinx/conf.py +++ b/sphinx/conf.py @@ -23,7 +23,7 @@ author = "Haniffa Lab" # The full version, including alpha/beta/rc tags -release = "0.4.1" +release = "0.5.0" # -- General configuration --------------------------------------------------- diff --git a/sphinx/examples/visium.rst b/sphinx/examples/visium.rst index 8fb2ddf..163b4c8 100644 --- a/sphinx/examples/visium.rst +++ b/sphinx/examples/visium.rst @@ -100,7 +100,7 @@ verify the expected directories are created. .. code-block:: shell :caption: Input - ls -l output/CytAssist_FFPE_Human_Breast_Cancer/0.4.1 + ls -l output/CytAssist_FFPE_Human_Breast_Cancer/0.5.0 .. code-block:: shell :caption: Output @@ -122,7 +122,7 @@ at http://localhost:3000, and that CORS is enabled via the Access-Control-Allow- .. code-block:: shell :caption: Input - npx http-server output/CytAssist_FFPE_Human_Breast_Cancer/0.4.1 --port 3000 --cors + npx http-server output/CytAssist_FFPE_Human_Breast_Cancer/0.5.0 --port 3000 --cors .. code-block:: shell :caption: Output diff --git a/sphinx/examples/xenium.rst b/sphinx/examples/xenium.rst index b2ae4a6..f0ef3d3 100644 --- a/sphinx/examples/xenium.rst +++ b/sphinx/examples/xenium.rst @@ -103,7 +103,7 @@ verify the expected directories are created. .. code-block:: shell :caption: Input - ls -l output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.1 + ls -l output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.5.0 .. code-block:: shell :caption: Output @@ -124,7 +124,7 @@ at http://localhost:3000, and that CORS is enabled via the Access-Control-Allow- .. code-block:: shell :caption: Input - npx http-server output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.4.1 --port 3000 --cors + npx http-server output/Xenium_FFPE_Human_Breast_Cancer_Rep1_outs/0.5.0 --port 3000 --cors .. 
code-block:: shell :caption: Output diff --git a/sphinx/installation.rst b/sphinx/installation.rst index 2af9301..f22c4f6 100644 --- a/sphinx/installation.rst +++ b/sphinx/installation.rst @@ -14,7 +14,7 @@ Download the WebAtlas Pipeline release. You can look for previous `releases on G .. code-block:: shell :caption: Input - wget https://github.com/haniffalab/webatlas-pipeline/archive/refs/tags/v0.4.1.tar.gz + wget https://github.com/haniffalab/webatlas-pipeline/archive/refs/tags/v0.5.0.tar.gz .. code-block:: shell :caption: Expected Output @@ -22,35 +22,35 @@ Download the WebAtlas Pipeline release. You can look for previous `releases on G Resolving github.com (github.com)... 140.82.121.3 Connecting to github.com (github.com)|140.82.121.3|:443... connected. HTTP request sent, awaiting response... 302 Found - Location: https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.1 [following] - --2023-05-18 09:30:15-- https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.4.1 + Location: https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.5.0 [following] + --2023-05-18 09:30:15-- https://codeload.github.com/haniffalab/webatlas-pipeline/tar.gz/refs/tags/v0.5.0 Resolving codeload.github.com (codeload.github.com)... 140.82.121.9 Connecting to codeload.github.com (codeload.github.com)|140.82.121.9|:443... connected. HTTP request sent, awaiting response... 200 OK Length: unspecified [application/x-gzip] - Saving to: ‘v0.4.1.tar.gz’ + Saving to: ‘v0.5.0.tar.gz’ - v0.4.1.tar.gz [ <=> ] 2.70M 9.12MB/s in 0.3s + v0.5.0.tar.gz [ <=> ] 2.70M 9.12MB/s in 0.3s - 2023-05-18 09:30:16 (9.12 MB/s) - ‘v0.4.1.tar.gz’ saved [2835534] + 2023-05-18 09:30:16 (9.12 MB/s) - ‘v0.5.0.tar.gz’ saved [2835534] Extract the WebAtlas compressed tag and change directory into the new repo. .. code-block:: shell :caption: Input - tar -xzvf ./v0.4.1.tar.gz - cd webatlas-pipeline-0.4.1 + tar -xzvf ./v0.5.0.tar.gz + cd webatlas-pipeline-0.5.0 .. code-block:: shell :caption: Expected Output - webatlas-pipeline-0.4.1/ - webatlas-pipeline-0.4.1/.github/ + webatlas-pipeline-0.5.0/ + webatlas-pipeline-0.5.0/.github/ ... ... - webatlas-pipeline-0.4.1/tests/input/simple_config.json - webatlas-pipeline-0.4.1/tests/test_class.py + webatlas-pipeline-0.5.0/tests/input/simple_config.json + webatlas-pipeline-0.5.0/tests/test_class.py .. _environment:
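For reference, the cell2location index alignment introduced in patches 37 and 43 (an exact index match first, then a substring fallback via ``match_substring_indices``) can be illustrated with the following minimal, self-contained sketch. The barcodes and sample prefix below are invented for demonstration only; the ``label_id`` column and the matching logic mirror the patched code.

.. code-block:: python

    import pandas as pd

    # AnnData obs after the main pipeline reindexed it: numeric string index,
    # with the original barcodes preserved in the "label_id" column
    obs = pd.DataFrame({"label_id": ["AAACGG-1", "AAACTT-1"]}, index=["1", "2"])

    # cell2location output indexed by "<sample>_<barcode>"-style strings
    c2l_index = pd.Index(["sample_1_AAACTT-1", "sample_1_AAACGG-1"])

    # 1. exact matching; get_indexer returns -1 for indices it cannot find
    idx = c2l_index.get_indexer(obs["label_id"].tolist())
    assert -1 in idx  # exact matching fails for these indices

    # 2. substring fallback: find the full c2l index containing each barcode
    matched = obs["label_id"].apply(
        lambda x: c2l_index[c2l_index.str.contains(x)].values[0]
    )
    idx = c2l_index.get_indexer(matched.tolist())
    print(idx)  # [1 0] -> row order that aligns the c2l matrix with obs

In the pipeline itself, non-unique or still-unmatched substring results raise an error rather than silently aligning rows.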