From 7e4442aba066be2db629af29714e04b205447532 Mon Sep 17 00:00:00 2001 From: Andreas Eisenbarth Date: Fri, 8 Mar 2024 16:48:01 +0100 Subject: [PATCH] Make annotations and ion images matching more robust --- metaspace_converter/to_anndata.py | 49 +++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/metaspace_converter/to_anndata.py b/metaspace_converter/to_anndata.py index e2fcb87..92852d2 100644 --- a/metaspace_converter/to_anndata.py +++ b/metaspace_converter/to_anndata.py @@ -93,7 +93,6 @@ def metaspace_to_anndata( # Download annotations annotations = dataset.results(database=database, fdr=fdr, **annotation_filter) - annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) annotations = _normalize_annotations_for_serialization(annotations) # Download ion images @@ -109,10 +108,24 @@ def metaspace_to_anndata( f"No isotope images available for dataset {dataset.id} and database " f"{database[0]} – {database[1]}. Was the database selected for processing on METASPACE?" ) - assert len(annotations) == len(isotope_images) + # Isotope images are also specific to neutral loss and chemical modification (if any) + # whereas annotations only include formula and adduct. Thus there can be a mismatch. + # Since we want to keep all isotope images, we add missing rows to annotations + isotope_images_index = pd.DataFrame( + [(img.formula, img.adduct, img.chem_mod, img.neutral_loss) for img in isotope_images], + columns=["formula", "adduct", "chem_mod", "neutral_loss"], + ) + annotations = pd.merge( + annotations, + isotope_images_index, + how="inner", + left_on=("formula", "adduct"), + right_on=("formula", "adduct"), + ) + annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME) - # Sort them matching the annotations. - isotope_images = _sort_isotope_images_like(isotope_images, annotations.index) + # Sort isotope images to match the annotations. + isotope_images = _sort_isotope_images_like(isotope_images, annotations) # Create X matrix (all ion pixels flattened to primary axis) shape = get_ion_image_shape(dataset) @@ -146,13 +159,20 @@ def metaspace_to_anndata( return adata -def create_annotation_id(formula: str, adduct: str) -> str: - return f"{formula}{adduct}" +def create_annotation_id( + formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = "" +) -> str: + return f"{formula}{adduct}{chem_mod}{neutral_loss}" def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame: df = df.reset_index() - df[index_name] = df.apply(lambda row: create_annotation_id(row.formula, row.adduct), axis=1) + df[index_name] = df.apply( + lambda row: create_annotation_id( + row.formula, row.adduct, getattr(row, "chem_mod", ""), getattr(row, "neutral_loss", "") + ), + axis=1, + ) return df.set_index(index_name) @@ -201,14 +221,17 @@ def get_ion_image_shape( def _sort_isotope_images_like( - isotope_images: list[IsotopeImages], index: pd.Index + isotope_images: list[IsotopeImages], df: pd.DataFrame ) -> list[IsotopeImages]: - images_dict = {} - for isotope_image in isotope_images: - annotation_id = create_annotation_id(isotope_image.formula, isotope_image.adduct) - images_dict[annotation_id] = isotope_image + images_dict = { + (img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images + } # Return them in the requested order. - return [images_dict[key] for key in index] + # Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows. + return [ + images_dict[(row.formula, row.adduct, row.chem_mod, row.neutral_loss)] + for row in df.itertuples(index=False) + ] def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray: