From c64123c7d95ab3958565f0f6b93bfd86e04cbf54 Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Mon, 11 Mar 2024 11:34:06 +0100 Subject: [PATCH] Fix dataframe/images mismatch for neutral loss --- metaspace_converter/to_anndata.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/metaspace_converter/to_anndata.py b/metaspace_converter/to_anndata.py index 82c51d4..db42962 100644 --- a/metaspace_converter/to_anndata.py +++ b/metaspace_converter/to_anndata.py @@ -92,7 +92,14 @@ def metaspace_to_anndata( database = DEFAULT_DATABASE # Download annotations - annotations = dataset.results(database=database, fdr=fdr, **annotation_filter) + annotations = dataset.results( + database=database, + fdr=fdr, + include_chem_mods=True, + include_neutral_losses=True, + **annotation_filter, + ) + annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) annotations = _normalize_annotations_for_serialization(annotations) # Download ion images @@ -108,21 +115,7 @@ def metaspace_to_anndata( f"No isotope images available for dataset {dataset.id} and database " f"{database[0]} – {database[1]}. Was the database selected for processing on METASPACE?" ) - # Isotope images are also specific to neutral loss and chemical modification (if any) - # whereas annotations only include formula and adduct. Thus there can be a mismatch. - # Since we want to keep all isotope images, we add missing rows to annotations - isotope_images_index = pd.DataFrame( - [(img.formula, img.adduct, img.neutral_loss, img.chem_mod) for img in isotope_images], - columns=["formula", "adduct", "neutral_loss", "chem_mod"], - ) - annotations = pd.merge( - annotations, - isotope_images_index, - how="inner", - left_on=("formula", "adduct"), - right_on=("formula", "adduct"), - ) - annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) + assert len(annotations) == len(isotope_images) # Sort isotope images to match the annotations. isotope_images = _sort_isotope_images_like(isotope_images, annotations) @@ -229,7 +222,7 @@ def _sort_isotope_images_like( # Return them in the requested order. # Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows. return [ - images_dict[(row.formula, row.adduct, row.neutral_loss, row.chem_mod)] + images_dict[(row.formula, row.adduct, row.neutralLoss, row.chemMod)] for row in df.itertuples(index=False) ]