Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ionization mode generator #142

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions ms2deepscore/data_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,117 @@ def on_epoch_end(self):
np.random.shuffle(self.indexes)


class DataGeneratorIonisationMode(DataGeneratorBase):
    """Generates data for training a siamese Keras model.

    This generator will provide training data by picking each training InchiKey
    listed in *selected_inchikeys* num_turns times in every epoch. It will then
    randomly pick one of the spectra corresponding to this InchiKey (if multiple)
    and pair it with a randomly chosen other spectrum that corresponds to a
    reference score as defined in same_prob_bins.
    """

    def __init__(self, binned_spectrums: List[BinnedSpectrumType],
                 reference_scores_df: pd.DataFrame,
                 spectrum_binner: SpectrumBinner,
                 selected_inchikeys: Optional[list] = None,
                 **settings):
        """Generates data for training a siamese Keras model.

        Parameters
        ----------
        binned_spectrums
            List of BinnedSpectrum objects with the binned peak positions and
            intensities.
        reference_scores_df
            Pandas DataFrame with reference similarity scores (=labels) for
            compounds identified by inchikeys. Columns and index should be
            inchikeys, the value in a row x column depicting the similarity
            score for that pair. Must be symmetric
            (reference_scores_df[i,j] == reference_scores_df[j,i]) and column
            names should be identical to the index.
        spectrum_binner
            SpectrumBinner that is forwarded unchanged to the base class
            (presumably the binner that produced `binned_spectrums` -- confirm
            against DataGeneratorBase).
        selected_inchikeys
            List of inchikeys to use for training. If None, all inchikeys in
            `reference_scores_df` are used.

        As part of **settings, defaults for the following parameters can be set:
        batch_size
            Number of pairs per batch. Default=32.
        num_turns
            Number of pairs for each InChiKey during each epoch. Default=1
        shuffle
            Set to True to shuffle IDs every epoch. Default=True
        ignore_equal_pairs
            Set to True to ignore pairs of two identical spectra. Default=True
        same_prob_bins
            List of tuples that define ranges of the true label to be trained
            with equal frequencies. Default is set to [(0, 0.5), (0.5, 1)],
            which means that pairs with scores <=0.5 will be picked as often as
            pairs with scores > 0.5.
        augment_removal_max
            Maximum fraction of peaks (if intensity < augment_removal_intensity)
            to be removed randomly. Default is set to 0.2, which means that
            between 0 and 20% of all peaks with intensities
            < augment_removal_intensity will be removed.
        augment_removal_intensity
            Specifying that only peaks with intensities < max_intensity will be
            removed.
        augment_intensity
            Change peak intensities by a random number between 0 and
            augment_intensity. Default=0.1, which means that intensities are
            multiplied by 1+- a random number within [0, 0.1].
        additional_inputs
            Array of additional values to be used in training for e.g.
            ["precursor_mz", "parent_mass"]
        """
        super().__init__(binned_spectrums, reference_scores_df, spectrum_binner, **settings)
        self.reference_scores_df = self._data_selection(reference_scores_df, selected_inchikeys)
        # Build (and optionally shuffle) the index list for the first epoch.
        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of batches per epoch.

        NB1: self.reference_scores_df only contains 'selected' inchikeys,
             see `self._data_selection`.
        NB2: We don't see all data every epoch, because the last half-empty
             batch is omitted. This is expected behavior, with the shuffling
             this is OK.
        """
        batches_per_turn = int(np.floor(len(self.reference_scores_df) / self.settings["batch_size"]))
        return int(self.settings["num_turns"]) * batches_per_turn

    def _select_positive_and_negative_mode(self):
        """Split `self.binned_spectrums` by their "ionmode" metadata.

        Stores the positive and negative mode spectra as well as the first 14
        characters of each of their inchikeys as attributes. Spectra whose
        "ionmode" is neither "positive" nor "negative" end up in neither list.
        """
        # Compare by value (==): `is` tests object identity, which is not
        # guaranteed to hold for equal strings.
        self.positive_binned_spectrums = [
            s for s in self.binned_spectrums if s.get("ionmode") == "positive"]
        self.negative_binned_spectrums = [
            s for s in self.binned_spectrums if s.get("ionmode") == "negative"]
        # NOTE: every selected spectrum needs an "inchikey" entry; slicing a
        # missing (None) inchikey would raise a TypeError.
        self.positive_spectrum_inchikeys = np.array(
            [s.get("inchikey")[:14] for s in self.positive_binned_spectrums])
        self.negative_spectrum_inchikeys = np.array(
            [s.get("inchikey")[:14] for s in self.negative_binned_spectrums])

    def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
        """Generate spectrum pairs for batch.

        For each 'source' inchikey pick an inchikey in the desired target score
        range. Then randomly get spectrums for this pair of inchikeys.

        Parameters
        ----------
        batch_index
            Index of the batch; selects the slice of `self.indexes` to use.
        """
        same_prob_bins = self.settings["same_prob_bins"]
        batch_size = self.settings["batch_size"]

        # Go through all indexes of this batch
        indexes = self.indexes[batch_index * batch_size:(batch_index + 1) * batch_size]

        for index in indexes:
            inchikey1 = self.reference_scores_df.index[index]
            # Randomly pick the desired target score range and pick matching inchikey
            target_score_range = same_prob_bins[np.random.choice(np.arange(len(same_prob_bins)))]

            inchikey2 = self._find_match_in_range(inchikey1, target_score_range)
            spectrum1 = self._get_spectrum_with_inchikey(inchikey1)
            spectrum2 = self._get_spectrum_with_inchikey(inchikey2)
            yield SpectrumPair(spectrum1, spectrum2)

    @staticmethod
    def _data_selection(reference_scores_df, selected_inchikeys):
        """Select labeled data to generate from based on `selected_inchikeys`.

        Returns the square sub-frame restricted to the given inchikeys (rows
        and columns). If `selected_inchikeys` is None, the DataFrame is
        returned unchanged instead of passing None to `.loc`.
        """
        if selected_inchikeys is None:
            return reference_scores_df
        return reference_scores_df.loc[selected_inchikeys, selected_inchikeys]

    def on_epoch_end(self):
        """Updates indexes after each epoch."""
        # Every row position of reference_scores_df is repeated num_turns times.
        self.indexes = np.tile(np.arange(len(self.reference_scores_df)), int(self.settings["num_turns"]))
        if self.settings["shuffle"]:
            np.random.shuffle(self.indexes)


class Container:
"""
Helper class for DataGenerator
Expand Down
Loading