matchms · niekdejonge · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Fixed
+- A bug in spectrum pair sampling during training was fixed. Due to this bug, for each spectrum only one unique spectrum was sampled, even if multiple spectra were available. The bug was introduced with MS2Deepscore 2.0.
+
+### Changed
+- The inchikey pair selection and data generator have been refactored. The new data generator results in a more balanced inchikey distribution. For details see [#232](https://github.com/matchms/ms2deepscore/pull/232)
+
 ### Changed
 - dense layers are not build with leaky ReLU instead of ReLU [#222](https://github.com/matchms/ms2deepscore/pull/222).
 

diff --git a/ms2deepscore/SettingsMS2Deepscore.py b/ms2deepscore/SettingsMS2Deepscore.py
@@ -1,5 +1,6 @@
 import json
 import warnings
+from collections import Counter
 from datetime import datetime
 from json import JSONEncoder
 from typing import Optional
@@ -90,8 +91,11 @@ class SettingsMS2Deepscore:
             Array of metadata entries (and their transformation) to be used in training.
             See `MetadatFeatureGenerator` for more information.
             Default is set to empty list.
+        max_pair_resampling
+            The maximum number of times an inchikey pair can be resampled. Resampling is done to balance inchikey
+            pairs over the tanimoto scores. The minimum is 1, meaning that no resampling is performed.
         """
-    def __init__(self, **settings):
+    def __init__(self, validate_settings=True, **settings):
         # model structure
         self.base_dims = (2000, 2000, 2000)
         self.embedding_dim = 400
@@ -133,10 +137,12 @@ def __init__(self, **settings):
         # Compound pairs selection settings
         self.average_pairs_per_bin = 20
         self.max_pairs_per_bin = 100
-        self.same_prob_bins = np.array([(x / 10, x / 10 + 0.1) for x in range(0, 10)])
+        self.same_prob_bins = np.array([(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6),
+                                        (0.4, 0.5), (0.3, 0.4), (0.2, 0.3), (0.1, 0.2), (-0.01, 0.1)])
         self.include_diagonal = True
         self.val_spectra_per_inchikey = 1
         self.random_seed: Optional[int] = None
+        self.max_pair_resampling = 1
 
         # Tanimioto score setings
         self.fingerprint_type: str = "daylight"
@@ -158,9 +164,15 @@ def __init__(self, **settings):
                                         f"the type given is {type(value)}, the value given is {value}")
                     setattr(self, key, value)
                 else:
-                    raise ValueError(f"Unknown setting: {key}")
+                    if validate_settings:
+                        raise ValueError(f"Unknown setting: {key}")
+                    # When loading an older model, there can be incompatibilities between training settings.
+                    #  If these settings were just used during training it should not break the loading of a model,
+                    #  since it does not affect how the model runs.
+                    setattr(self, key, value)
 
-        self.validate_settings()
+        if validate_settings:
+            self.validate_settings()
         if self.random_seed is not None:
             np.random.seed(self.random_seed)
 
@@ -174,6 +186,7 @@ def validate_settings(self):
             assert isinstance(self.random_seed, int), "Random seed must be integer number."
         if self.loss_function.lower() not in LOSS_FUNCTIONS:
             raise ValueError(f"Unknown loss function. Must be one of: {LOSS_FUNCTIONS.keys()}")
+        validate_bin_order(self.same_prob_bins)
 
     def number_of_bins(self):
         return int((self.max_mz - self.min_mz) / self.mz_bin_width)
@@ -192,6 +205,38 @@ def default(self, o):
             json.dump(self.__dict__, file, indent=4, cls=NumpyArrayEncoder)
 
 
+def validate_bin_order(score_bins):
+    """Check that the given score bins are of the correct format.
+
+    The bins should cover everything between 0 and 1.0. Exactly one border must be below 0 (since pairs
+    are selected with > and a score of zero should be included), exactly one border must be equal to 1,
+    and every other border must occur exactly twice. A ValueError is raised if a check fails."""
+    bin_borders_below_zero = 0
+    bin_borders_1 = 0
+    not_starting_or_ending_borders = []
+    for score_bin in score_bins:
+        if score_bin[0] > score_bin[1]:
+            raise ValueError("The first number in the bin should be smaller than the second")
+        for bin_border in score_bin:
+            if bin_border < 0:
+                bin_borders_below_zero += 1
+            elif bin_border == 1:
+                bin_borders_1 += 1
+            else:
+                not_starting_or_ending_borders.append(bin_border)
+    border_counts = Counter(not_starting_or_ending_borders)
+    if bin_borders_below_zero != 1:
+        raise ValueError(f"There should be one bin border with a value below 0. "
+                         f"But {bin_borders_below_zero} bin borders with value below 0 are found")
+    if bin_borders_1 != 1:
+        raise ValueError(
+            f"There should be one bin border with value 1. "
+            f"But {bin_borders_1} bin borders with value 1 are found")
+    for count in border_counts.values():
+        if count != 2:
+            raise ValueError("There is a gap in the bins, the bins should cover everything between 0 and 1.")
+
+
 class SettingsEmbeddingEvaluator:
     """Contains all the settings used for training a EmbeddingEvaluator model.
 

diff --git a/ms2deepscore/benchmarking/calculate_scores_for_validation.py b/ms2deepscore/benchmarking/calculate_scores_for_validation.py
@@ -10,7 +10,7 @@
 from tqdm import tqdm
 from ms2deepscore import MS2DeepScore
 from ms2deepscore.models import load_model
-from ms2deepscore.train_new_model.spectrum_pair_selection import \
+from ms2deepscore.train_new_model.inchikey_pair_selection import \
     select_inchi_for_unique_inchikeys
 from ms2deepscore.utils import save_pickled_file
 

diff --git a/ms2deepscore/models/load_model.py b/ms2deepscore/models/load_model.py
@@ -39,7 +39,7 @@ def load_model(filename: Union[str, Path]) -> SiameseSpectralModel:
     model_params = model_settings["model_params"]
 
     # Instantiate the SiameseSpectralModel with the loaded parameters
-    model = SiameseSpectralModel(settings=SettingsMS2Deepscore(**model_params))
+    model = SiameseSpectralModel(settings=SettingsMS2Deepscore(**model_params, validate_settings=False))
     model.load_state_dict(model_settings["model_state_dict"])
     model.eval()
     return model

diff --git a/ms2deepscore/train_new_model/ValidationLossCalculator.py b/ms2deepscore/train_new_model/ValidationLossCalculator.py
@@ -55,24 +55,21 @@ def select_spectra_per_inchikey(spectra,
                                 spectra_per_inchikey: int = 1):
     """Pick spectra_per_inchikey spectra for every unique inchikey14 (when possible).
     """
+    if spectra_per_inchikey < 1:
+        raise ValueError
     inchikeys14_array = np.array([s.get("inchikey")[:14] for s in spectra])
     unique_inchikeys = np.unique(inchikeys14_array)
     rng = np.random.default_rng(seed=random_seed)
     selected_spectra = []
     for inchikey in unique_inchikeys:
         matching_spectra_idx = np.where(inchikeys14_array == inchikey)[0]
-        if (spectra_per_inchikey > 1) & (spectra_per_inchikey <= len(matching_spectra_idx)):              
-            spectrum_id = rng.choice(matching_spectra_idx, spectra_per_inchikey, replace=False)
-            selected_spectra.extend([spectra[i] for i in spectrum_id])
-        else:
-            spectrum_id = rng.choice(matching_spectra_idx)
-            selected_spectra.append(spectra[spectrum_id])
+        if len(matching_spectra_idx) == 0:
+            raise ValueError("Expected at least one spectrum per inchikey")
+        selected_spectrum_ids = []
+        for i in range(int(spectra_per_inchikey//len(matching_spectra_idx))):
+            selected_spectrum_ids.extend(list(matching_spectra_idx))
+        additional_spectrum_ids = rng.choice(matching_spectra_idx, spectra_per_inchikey%len(matching_spectra_idx),
+                                             replace=False)
+        selected_spectrum_ids.extend(additional_spectrum_ids)
+        selected_spectra.extend([spectra[i] for i in selected_spectrum_ids])
     return selected_spectra
-
-
-def select_one_spectrum_per_inchikey(spectra,
-                                     random_seed):
-    return select_spectra_per_inchikey(
-        spectra,
-        random_seed,
-        spectra_per_inchikey = 1)
diff --git a/ms2deepscore/train_new_model/__init__.py b/ms2deepscore/train_new_model/__init__.py
@@ -1,10 +1,9 @@
-from .data_generators import DataGeneratorPytorch
-from .spectrum_pair_selection import (SelectedCompoundPairs,
-                                      select_compound_pairs_wrapper)
+from .data_generators import SpectrumPairGenerator, InchikeyPairGenerator
+from .inchikey_pair_selection import (select_compound_pairs_wrapper)
 
 
 __all__ = [
-    "DataGeneratorPytorch",
+    "SpectrumPairGenerator",
     "select_compound_pairs_wrapper",
-    "SelectedCompoundPairs",
+    "InchikeyPairGenerator"
 ]