From 7e4442aba066be2db629af29714e04b205447532 Mon Sep 17 00:00:00 2001
From: Andreas Eisenbarth <andreas.eisenbarth@embl.de>
Date: Fri, 8 Mar 2024 16:48:01 +0100
Subject: [PATCH] Make annotations and ion images matching more robust

---
 metaspace_converter/to_anndata.py | 49 +++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/metaspace_converter/to_anndata.py b/metaspace_converter/to_anndata.py
index e2fcb87..92852d2 100644
--- a/metaspace_converter/to_anndata.py
+++ b/metaspace_converter/to_anndata.py
@@ -93,7 +93,6 @@ def metaspace_to_anndata(
 
     # Download annotations
     annotations = dataset.results(database=database, fdr=fdr, **annotation_filter)
-    annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
     annotations = _normalize_annotations_for_serialization(annotations)
 
     # Download ion images
@@ -109,10 +108,24 @@ def metaspace_to_anndata(
             f"No isotope images available for dataset {dataset.id} and database "
             f"{database[0]} – {database[1]}. Was the database selected for processing on METASPACE?"
         )
-    assert len(annotations) == len(isotope_images)
+    # Isotope images are also specific to neutral loss and chemical modification (if any),
+    # whereas annotations only include formula and adduct, so there can be a mismatch. To keep the
+    # isotope images, annotation rows are repeated per image (inner merge drops unmatched ones).
+    isotope_images_index = pd.DataFrame(
+        [(img.formula, img.adduct, img.chem_mod, img.neutral_loss) for img in isotope_images],
+        columns=["formula", "adduct", "chem_mod", "neutral_loss"],
+    )
+    annotations = pd.merge(
+        annotations,
+        isotope_images_index,
+        how="inner",
+        left_on=("formula", "adduct"),
+        right_on=("formula", "adduct"),
+    )
+    annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
 
-    # Sort them matching the annotations.
-    isotope_images = _sort_isotope_images_like(isotope_images, annotations.index)
+    # Sort isotope images to match the annotations.
+    isotope_images = _sort_isotope_images_like(isotope_images, annotations)
 
     # Create X matrix (all ion pixels flattened to primary axis)
     shape = get_ion_image_shape(dataset)
@@ -146,13 +159,20 @@ def metaspace_to_anndata(
     return adata
 
 
-def create_annotation_id(formula: str, adduct: str) -> str:
-    return f"{formula}{adduct}"
+def create_annotation_id(
+    formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = ""
+) -> str:
+    return f"{formula}{adduct}{chem_mod}{neutral_loss}"
 
 
 def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame:
     df = df.reset_index()
-    df[index_name] = df.apply(lambda row: create_annotation_id(row.formula, row.adduct), axis=1)
+    df[index_name] = df.apply(
+        lambda row: create_annotation_id(
+            row.formula, row.adduct, getattr(row, "chem_mod", ""), getattr(row, "neutral_loss", "")
+        ),
+        axis=1,
+    )
     return df.set_index(index_name)
 
 
@@ -201,14 +221,17 @@ def get_ion_image_shape(
 
 
 def _sort_isotope_images_like(
-    isotope_images: list[IsotopeImages], index: pd.Index
+    isotope_images: list[IsotopeImages], df: pd.DataFrame
 ) -> list[IsotopeImages]:
-    images_dict = {}
-    for isotope_image in isotope_images:
-        annotation_id = create_annotation_id(isotope_image.formula, isotope_image.adduct)
-        images_dict[annotation_id] = isotope_image
+    images_dict = {
+        (img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images
+    }
     # Return them in the requested order.
-    return [images_dict[key] for key in index]
+    # Note: pd.DataFrame.itertuples yields namedtuples and is faster than iterrows.
+    return [
+        images_dict[(row.formula, row.adduct, row.chem_mod, row.neutral_loss)]
+        for row in df.itertuples(index=False)
+    ]
 
 
 def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray: