Skip to content

Commit

Permalink
Fix dataframe/images mismatch for neutral loss
Browse files (browse the repository at this point in the history)
  • Loading branch information
aeisenbarth committed Mar 11, 2024
1 parent 9620176 commit c64123c
Showing 1 changed file with 10 additions and 17 deletions.
27 changes: 10 additions & 17 deletions metaspace_converter/to_anndata.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,7 +92,14 @@ def metaspace_to_anndata(
database = DEFAULT_DATABASE

# Download annotations
-annotations = dataset.results(database=database, fdr=fdr, **annotation_filter)
+annotations = dataset.results(
+database=database,
+fdr=fdr,
+include_chem_mods=True,
+include_neutral_losses=True,
+**annotation_filter,
+)
+annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
annotations = _normalize_annotations_for_serialization(annotations)

# Download ion images
Expand All @@ -108,21 +115,7 @@ def metaspace_to_anndata(
f"No isotope images available for dataset {dataset.id} and database "
f"{database[0]}{database[1]}. Was the database selected for processing on METASPACE?"
)
-# Isotope images are also specific to neutral loss and chemical modification (if any)
-# whereas annotations only include formula and adduct. Thus there can be a mismatch.
-# Since we want to keep all isotope images, we add missing rows to annotations
-isotope_images_index = pd.DataFrame(
-[(img.formula, img.adduct, img.neutral_loss, img.chem_mod) for img in isotope_images],
-columns=["formula", "adduct", "neutral_loss", "chem_mod"],
-)
-annotations = pd.merge(
-annotations,
-isotope_images_index,
-how="inner",
-left_on=("formula", "adduct"),
-right_on=("formula", "adduct"),
-)
-annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
+assert len(annotations) == len(isotope_images)

# Sort isotope images to match the annotations.
isotope_images = _sort_isotope_images_like(isotope_images, annotations)
Expand Down Expand Up @@ -229,7 +222,7 @@ def _sort_isotope_images_like(
# Return them in the requested order.
# Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows.
return [
-images_dict[(row.formula, row.adduct, row.neutral_loss, row.chem_mod)]
+images_dict[(row.formula, row.adduct, row.neutralLoss, row.chemMod)]
for row in df.itertuples(index=False)
]

Expand Down

0 comments on commit c64123c

Please sign in to comment.