matchms · niekdejonge · Aug 11, 2023
diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py
@@ -504,6 +504,117 @@ def on_epoch_end(self):
             np.random.shuffle(self.indexes)
 
 
+class DataGeneratorIonisationMode(DataGeneratorBase):
+    """Generates data for training a siamese Keras model
+    This generator will provide training data by picking each training InchiKey
+    listed in *selected_inchikeys* num_turns times in every epoch. It will then randomly
+    pick one the spectra corresponding to this InchiKey (if multiple) and pair it
+    with a randomly chosen other spectrum that corresponds to a reference score
+    as defined in same_prob_bins.
+    """
+
+    def __init__(self, binned_spectrums: List[BinnedSpectrumType],
+                 reference_scores_df: pd.DataFrame,
+                 spectrum_binner: SpectrumBinner,
+                 selected_inchikeys: Optional[list] = None,
+                 **settings):
+        """Generates data for training a siamese Keras model.
+        Parameters
+        ----------
+        binned_spectrums
+            List of BinnedSpectrum objects with the binned peak positions and intensities.
+        reference_scores_df
+            Pandas DataFrame with reference similarity scores (=labels) for compounds identified
+            by inchikeys. Columns and index should be inchikeys, the value in a row x column
+            depicting the similarity score for that pair. Must be symmetric
+            (reference_scores_df[i,j] == reference_scores_df[j,i]) and column names should be identical to the index.
+        selected_inchikeys
+            List of inchikeys to use for training.
+        dim
+            Input vector dimension.
+        As part of **settings, defaults for the following parameters can be set:
+        batch_size
+            Number of pairs per batch. Default=32.
+        num_turns
+            Number of pairs for each InChiKey during each epoch. Default=1
+        shuffle
+            Set to True to shuffle IDs every epoch. Default=True
+        ignore_equal_pairs
+            Set to True to ignore pairs of two identical spectra. Default=True
+        same_prob_bins
+            List of tuples that define ranges of the true label to be trained with
+            equal frequencies. Default is set to [(0, 0.5), (0.5, 1)], which means
+            that pairs with scores <=0.5 will be picked as often as pairs with scores
+            > 0.5.
+        augment_removal_max
+            Maximum fraction of peaks (if intensity < below augment_removal_intensity)
+            to be removed randomly. Default is set to 0.2, which means that between
+            0 and 20% of all peaks with intensities < augment_removal_intensity
+            will be removed.
+        augment_removal_intensity
+            Specifying that only peaks with intensities < max_intensity will be removed.
+        augment_intensity
+            Change peak intensities by a random number between 0 and augment_intensity.
+            Default=0.1, which means that intensities are multiplied by 1+- a random
+            number within [0, 0.1].
+        additional_inputs
+            Array of additional values to be used in training for e.g. ["precursor_mz", "parent_mass"]
+        """
+        super().__init__(binned_spectrums, reference_scores_df, spectrum_binner, **settings)
+        self.reference_scores_df = self._data_selection(reference_scores_df, selected_inchikeys)
+        self.on_epoch_end()
+
+    def __len__(self):
+        """Denotes the number of batches per epoch
+        NB1: self.reference_scores_df only contains 'selected' inchikeys, see `self._data_selection`.
+        NB2: We don't see all data every epoch, because the last half-empty batch is omitted.
+        This is expected behavior, with the shuffling this is OK.
+        """
+        return int(self.settings["num_turns"]) * int(np.floor(len(self.reference_scores_df) / self.settings["batch_size"]))
+
+    def _select_positive_and_negative_mode(self):
+        self.positive_binned_spectra = [s for s in self.binned_spectrums if s.get("ionmode") is "positive"]
+        self.negative_binned_spectra = [s for s in self.binned_spectrums if s.get("ionmode") is "negative"]
+        self.positive_spectrum_inchikeys = np.array([s.get("inchikey")[:14] for s in self.positive_binned_spectrums])
+        self.negative_spectrum_inchikeys = np.array([s.get("inchikey")[:14] for s in self.negative_binned_spectrums])
+
+
+    def _spectrum_pair_generator(self, batch_index: int) -> Iterator[SpectrumPair]:
+        """
+        Generate spectrum pairs for batch. For each 'source' inchikey pick an inchikey in the
+        desired target score range. Then randomly get spectrums for this pair of inchikeys.
+        """
+        same_prob_bins = self.settings["same_prob_bins"]
+        batch_size = self.settings["batch_size"]
+        ionisation_modes = ["positive", "negative"]
+
+        # Go through all indexes
+        indexes = self.indexes[batch_index * batch_size:(batch_index + 1) * batch_size]
+
+        for index in indexes:
+            inchikey1 = self.reference_scores_df.index[index]
+            # Randomly pick the desired target score range and pick matching inchikey
+            target_score_range = same_prob_bins[np.random.choice(np.arange(len(same_prob_bins)))]
+
+            inchikey2 = self._find_match_in_range(inchikey1, target_score_range)
+            spectrum1 = self._get_spectrum_with_inchikey(inchikey1)
+            spectrum2 = self._get_spectrum_with_inchikey(inchikey2)
+            yield SpectrumPair(spectrum1, spectrum2)
+
+    @ staticmethod
+    def _data_selection(reference_scores_df, selected_inchikeys):
+        """
+        Select labeled data to generate from based on `selected_inchikeys`
+        """
+        return reference_scores_df.loc[selected_inchikeys, selected_inchikeys]
+
+    def on_epoch_end(self):
+        """Updates indexes after each epoch"""
+        self.indexes = np.tile(np.arange(len(self.reference_scores_df)), int(self.settings["num_turns"]))
+        if self.settings["shuffle"]:
+            np.random.shuffle(self.indexes)
+
+
 class Container:
     """
     Helper class for DataGenerator