Skip to content

Commit

Permalink
Make annotations and ion images matching more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
aeisenbarth committed Mar 11, 2024
1 parent 111bece commit 7e4442a
Showing 1 changed file with 36 additions and 13 deletions.
49 changes: 36 additions & 13 deletions metaspace_converter/to_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def metaspace_to_anndata(

# Download annotations
annotations = dataset.results(database=database, fdr=fdr, **annotation_filter)
annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
annotations = _normalize_annotations_for_serialization(annotations)

# Download ion images
Expand All @@ -109,10 +108,24 @@ def metaspace_to_anndata(
f"No isotope images available for dataset {dataset.id} and database "
f"{database[0]}{database[1]}. Was the database selected for processing on METASPACE?"
)
assert len(annotations) == len(isotope_images)
# Isotope images are also specific to neutral loss and chemical modification (if any)
# whereas annotations only include formula and adduct. Thus there can be a mismatch.
# Since we want to keep all isotope images, we add missing rows to annotations
isotope_images_index = pd.DataFrame(
[(img.formula, img.adduct, img.chem_mod, img.neutral_loss) for img in isotope_images],
columns=["formula", "adduct", "chem_mod", "neutral_loss"],
)
annotations = pd.merge(
annotations,
isotope_images_index,
how="inner",
left_on=("formula", "adduct"),
right_on=("formula", "adduct"),
)
annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)

# Sort them matching the annotations.
isotope_images = _sort_isotope_images_like(isotope_images, annotations.index)
# Sort isotope images to match the annotations.
isotope_images = _sort_isotope_images_like(isotope_images, annotations)

# Create X matrix (all ion pixels flattened to primary axis)
shape = get_ion_image_shape(dataset)
Expand Down Expand Up @@ -146,13 +159,20 @@ def metaspace_to_anndata(
return adata


def create_annotation_id(formula: str, adduct: str) -> str:
return f"{formula}{adduct}"
def create_annotation_id(
formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = ""
) -> str:
return f"{formula}{adduct}{chem_mod}{neutral_loss}"


def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame:
df = df.reset_index()
df[index_name] = df.apply(lambda row: create_annotation_id(row.formula, row.adduct), axis=1)
df[index_name] = df.apply(
lambda row: create_annotation_id(
row.formula, row.adduct, getattr(row, "chem_mod", ""), getattr(row, "neutral_loss", "")
),
axis=1,
)
return df.set_index(index_name)


Expand Down Expand Up @@ -201,14 +221,17 @@ def get_ion_image_shape(


def _sort_isotope_images_like(
isotope_images: list[IsotopeImages], index: pd.Index
isotope_images: list[IsotopeImages], df: pd.DataFrame
) -> list[IsotopeImages]:
images_dict = {}
for isotope_image in isotope_images:
annotation_id = create_annotation_id(isotope_image.formula, isotope_image.adduct)
images_dict[annotation_id] = isotope_image
images_dict = {
(img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images
}
# Return them in the requested order.
return [images_dict[key] for key in index]
# Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows.
return [
images_dict[(row.formula, row.adduct, row.chem_mod, row.neutral_loss)]
for row in df.itertuples(index=False)
]


def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray:
Expand Down

0 comments on commit 7e4442a

Please sign in to comment.