From b3a5d0d25a70b796c4a10e1a7671769529c2913c Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Wed, 13 Dec 2023 21:54:47 +0100 Subject: [PATCH 01/40] add first pytorch model version --- ms2deepscore/models/SiameseSpectralModel.py | 111 ++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 ms2deepscore/models/SiameseSpectralModel.py diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py new file mode 100644 index 00000000..c30496e9 --- /dev/null +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -0,0 +1,111 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SiameseSpectralModel(nn.Module): + """ + Class for training and evaluating a siamese neural network, implemented in PyTorch. + It consists of a dense 'base' network that produces an embedding for each of the 2 inputs. + This head model computes the cosine similarity between the embeddings. + """ + def __init__(self, + min_mz=0, + max_mz=1000, + step=0.01, + train_binning_layer: bool = True, + group_size: int = 30, + output_per_group: int = 3, + ): + super(SiameseSpectralModel, self).__init__() + self.model_parameters = { + "min_mz": min_mz, + "max_mz": max_mz, + "step": step, + "train_binning_layer": train_binning_layer, + "group_size": group_size, + "output_per_group": output_per_group, + #TODO: add ms2deepscore version + } + self.encoder = SpectralEncoder(min_mz, max_mz, step, train_binning_layer, group_size, output_per_group) + + def forward(self, x1, x2): + # Pass both inputs through the same encoder + encoded_x1 = self.encoder(x1) + encoded_x2 = self.encoder(x2) + + # Calculate cosine similarity + cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)(encoded_x1, encoded_x2) + return cos_sim + +class BinnedSpectraLayer(nn.Module): + def __init__(self, min_mz, max_mz, step): + super(BinnedSpectraLayer, self).__init__() + self.min_mz = min_mz + self.max_mz = max_mz + self.step = step + self.num_bins = int((max_mz - min_mz) / step) + + def forward(self, spectra): + # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) + binned_spectra = torch.zeros((len(spectra), self.num_bins)) + + for i, spectrum in enumerate(spectra): + for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): + if self.min_mz <= mz < self.max_mz: + bin_index = int((mz - self.min_mz) / self.step) + binned_spectra[i, bin_index] += intensity + + return binned_spectra + +class PeakBinner(nn.Module): + def __init__(self, input_size, group_size, output_per_group): + super(PeakBinner, self).__init__() + self.group_size = group_size + self.output_per_group = output_per_group + self.groups = input_size // group_size + + # Create a ModuleList of linear layers, each mapping group_size inputs to output_per_group outputs + self.linear_layers = nn.ModuleList([nn.Linear(group_size, output_per_group) for _ in range(self.groups)]) + + def forward(self, x): + # Split the input into groups and apply each linear layer to each group + outputs = [linear(x[:, i*self.group_size:(i+1)*self.group_size]) for i, linear in enumerate(self.linear_layers)] + + # Make sure all inputs get a connection to the next layer + i = self.groups - 1 + outputs[-1] = self.linear_layers[-1](x[:, i*self.group_size:(i+1)*self.group_size]) + + # Concatenate all outputs + return torch.cat(outputs, dim=1) + + def output_size(self): + return self.groups * self.output_per_group + +class SpectralEncoder(nn.Module): + def __init__(self, min_mz: float, 
max_mz: float, step: float, + train_binning_layer: bool, group_size: int, output_per_group: int): + super(SpectralEncoder, self).__init__() + self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, step) + self.train_binning_layer = train_binning_layer + + if self.train_binning_layer: + self.peak_binner = PeakBinner(self.binning_layer.num_bins, 30, 3) + self.fc1 = nn.Linear(self.peak_binner.output_size(), 1000) + else: + self.fc1 = nn.Linear(self.binning_layer.num_bins, 1000) + self.fc2 = nn.Linear(1000, 1000) + self.fc3 = nn.Linear(1000, 1000) + self.output_layer = nn.Linear(1000, 500) + + def forward(self, spectra): + binned_spectra = self.binning_layer(spectra) + if self.train_binning_layer: + x = self.peak_binner(binned_spectra) + x = F.relu(self.fc1(x)) + else: + x = F.relu(self.fc1(binned_spectra)) + x = F.relu(self.fc2(x)) + x = F.relu(self.fc3(x)) + x = self.output_layer(x) + return x From c58c2ff23fc2e713c01eb00099ee1049c4814521 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Wed, 13 Dec 2023 22:23:39 +0100 Subject: [PATCH 02/40] expand model --- ms2deepscore/models/SiameseSpectralModel.py | 92 ++++++++++++++++----- 1 file changed, 72 insertions(+), 20 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index c30496e9..6cb7bac4 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -12,22 +12,62 @@ class SiameseSpectralModel(nn.Module): def __init__(self, min_mz=0, max_mz=1000, - step=0.01, + mz_bin_width=0.01, + base_dims: tuple[int, ...] = (1000, 800, 800), + embedding_dim: int = 400, train_binning_layer: bool = True, group_size: int = 30, output_per_group: int = 3, + dropout_rate: float = 0.2, + ): + """ + Construct SiameseSpectralModel + + Parameters + ---------- + min_mz + Lower bound for m/z values to consider. + max_mz + Upper bound for m/z values to consider. + mz_bin_width + Bin width for m/z sampling. + base_dims + Tuple of integers depicting the dimensions of the desired hidden + layers of the base model + embedding_dim + Dimension of the embedding (i.e. the output of the base model) + train_binning_layer + Default is True in which case the model contains a first dense multi-group peak binning layer. + group_size + When binning layer is used the group_size determins how many input bins are taken into + one dense micro-network. + output_per_group + This sets the number of next layer bins each group_size sized group of inputs shares. + dropout_rate + Dropout rate to be used in the base model. + l1_reg + L1 regularization rate. Default is 1e-6. + l2_reg + L2 regularization rate. Default is 1e-6. + keras_model + When provided, this keras model will be used to construct the SiameseModel instance. + Default is None. 
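+
+        For example, with the defaults ``mz_bin_width=0.01`` (i.e. 100,000 bins between
+        ``min_mz=0`` and ``max_mz=1000``), ``group_size=30`` and ``output_per_group=3``,
+        the trainable binning layer maps 3,333 groups of 30 bins to 3 values each,
+        giving roughly 10,000 inputs for the first dense layer.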
+ """ super(SiameseSpectralModel, self).__init__() self.model_parameters = { "min_mz": min_mz, "max_mz": max_mz, - "step": step, + "mz_bin_width": mz_bin_width, + "base_dims": base_dims, + "embedding_dim": embedding_dim, "train_binning_layer": train_binning_layer, "group_size": group_size, "output_per_group": output_per_group, + "dropout_rate": dropout_rate, #TODO: add ms2deepscore version } - self.encoder = SpectralEncoder(min_mz, max_mz, step, train_binning_layer, group_size, output_per_group) + self.encoder = SpectralEncoder(**self.model_parameters) def forward(self, x1, x2): # Pass both inputs through the same encoder @@ -39,12 +79,12 @@ def forward(self, x1, x2): return cos_sim class BinnedSpectraLayer(nn.Module): - def __init__(self, min_mz, max_mz, step): + def __init__(self, min_mz, max_mz, mz_bin_width): super(BinnedSpectraLayer, self).__init__() self.min_mz = min_mz self.max_mz = max_mz - self.step = step - self.num_bins = int((max_mz - min_mz) / step) + self.mz_bin_width = mz_bin_width + self.num_bins = int((max_mz - min_mz) / mz_bin_width) def forward(self, spectra): # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) @@ -53,7 +93,7 @@ def forward(self, spectra): for i, spectrum in enumerate(spectra): for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): if self.min_mz <= mz < self.max_mz: - bin_index = int((mz - self.min_mz) / self.step) + bin_index = int((mz - self.min_mz) / self.mz_bin_width) binned_spectra[i, bin_index] += intensity return binned_spectra @@ -83,29 +123,41 @@ def output_size(self): return self.groups * self.output_per_group class SpectralEncoder(nn.Module): - def __init__(self, min_mz: float, max_mz: float, step: float, + def __init__(self, min_mz: float, max_mz: float, mz_bin_width: float, + base_dims, embedding_dim, dropout_rate, train_binning_layer: bool, group_size: int, output_per_group: int): super(SpectralEncoder, self).__init__() - self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, step) + self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width) self.train_binning_layer = train_binning_layer + # First dense layer (no dropout!) 
+ self.dense_layers = [] if self.train_binning_layer: - self.peak_binner = PeakBinner(self.binning_layer.num_bins, 30, 3) - self.fc1 = nn.Linear(self.peak_binner.output_size(), 1000) + self.peak_binner = PeakBinner(self.binning_layer.num_bins, group_size, output_per_group) + self.dense_layers.append(nn.Linear(self.peak_binner.output_size(), base_dims[0])) else: - self.fc1 = nn.Linear(self.binning_layer.num_bins, 1000) - self.fc2 = nn.Linear(1000, 1000) - self.fc3 = nn.Linear(1000, 1000) - self.output_layer = nn.Linear(1000, 500) + self.dense_layers.append(nn.Linear(self.binning_layer.num_bins, base_dims[0])) + input_dim = base_dims[0] + + # Create additional dense layers + for output_dim in base_dims[1:]: + self.dense_layers.append(nn.Linear(input_dim, output_dim)) + input_dim = output_dim + + self.embedding_layer = nn.Linear(base_dims[-1], embedding_dim) + self.dropout = nn.Dropout(dropout_rate) def forward(self, spectra): binned_spectra = self.binning_layer(spectra) if self.train_binning_layer: x = self.peak_binner(binned_spectra) - x = F.relu(self.fc1(x)) + x = F.relu(self.dense_layers[0](x)) else: - x = F.relu(self.fc1(binned_spectra)) - x = F.relu(self.fc2(x)) - x = F.relu(self.fc3(x)) - x = self.output_layer(x) + x = F.relu(self.dense_layers[0](binned_spectra)) + + for layer in self.dense_layers[1:]: + x = F.relu(layer(x)) + x = self.dropout(x) + + x = self.embedding_layer(x) return x From 434766572cfe69a4a803c59f1f0fca994dbc1156 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Wed, 13 Dec 2023 22:23:54 +0100 Subject: [PATCH 03/40] add torch dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 0edd15d7..af89a777 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ "numba", "numpy>= 1.20.3", "pandas", + "torch", "tensorflow-macos;platform_machine=='arm64'", "tensorflow-metal;platform_machine=='arm64'", "tensorflow;platform_machine!='arm64'", From dfc1baf9184680de9ce56e417897048388e4efb2 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 14 Dec 2023 21:43:37 +0100 Subject: [PATCH 04/40] expand pytorch model part --- ms2deepscore/models/SiameseSpectralModel.py | 55 +++++++++++++++++---- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 6cb7bac4..24285a42 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +import torch.optim as optim class SiameseSpectralModel(nn.Module): @@ -46,12 +47,8 @@ def __init__(self, This sets the number of next layer bins each group_size sized group of inputs shares. dropout_rate Dropout rate to be used in the base model. - l1_reg - L1 regularization rate. Default is 1e-6. - l2_reg - L2 regularization rate. Default is 1e-6. - keras_model - When provided, this keras model will be used to construct the SiameseModel instance. + pytorch_model + When provided, this pytorch model will be used to construct the SiameseModel instance. Default is None. 
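+
+        Examples
+        --------
+        A minimal usage sketch (``spectrum_a`` and ``spectrum_b`` stand for any two
+        matchms Spectrum objects):
+
+        .. code-block:: python
+
+            model = SiameseSpectralModel(mz_bin_width=0.1, train_binning_layer=False)
+            scores = model([(spectrum_a, spectrum_b)])  # tensor with one cosine similarity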
""" super(SiameseSpectralModel, self).__init__() @@ -69,10 +66,10 @@ def __init__(self, } self.encoder = SpectralEncoder(**self.model_parameters) - def forward(self, x1, x2): + def forward(self, spectra_pairs): # Pass both inputs through the same encoder - encoded_x1 = self.encoder(x1) - encoded_x2 = self.encoder(x2) + encoded_x1 = self.encoder([s[0] for s in spectra_pairs]) + encoded_x2 = self.encoder([s[1] for s in spectra_pairs]) # Calculate cosine similarity cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)(encoded_x1, encoded_x2) @@ -161,3 +158,43 @@ def forward(self, spectra): x = self.embedding_layer(x) return x + + +### Model training + +def train(model, train_loader, num_epochs, learning_rate, + lambda_l1=1e-6, + lambda_l2=1e-6): + criterion = nn.MSELoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + + model.train(True) + for epoch in range(num_epochs): + for spectra, targets in train_loader: + optimizer.zero_grad() + + # Forward pass + outputs = model(spectra) + # Calculate loss + loss = criterion(outputs, targets) + loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) + + # Backward pass and optimize + loss.backward() + optimizer.step() + + # Print statistics + print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}') + + +### Helper functions + +def l1_regularization(model, lambda_l1): + """L1 regulatization for first dense layer of model.""" + l1_loss = torch.linalg.vector_norm(next(model.encoder.dense_layers[0].parameters()), ord=1) + return lambda_l1 * l1_loss + +def l2_regularization(model, lambda_l2): + """L2 regulatization for first dense layer of model.""" + l2_loss = torch.linalg.vector_norm(next(model.encoder.dense_layers[0].parameters()), ord=2) + return lambda_l2 * l2_loss From bdeee6f7c8b0c7aac5727de8ae21a4bb4bc61c7e Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 14 Dec 2023 21:52:25 +0100 Subject: [PATCH 05/40] add first pytorch tests --- ms2deepscore/models/__init__.py | 2 ++ tests/test_siamese_spectra_model.py | 46 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 tests/test_siamese_spectra_model.py diff --git a/ms2deepscore/models/__init__.py b/ms2deepscore/models/__init__.py index 06d563e2..3ba653b4 100644 --- a/ms2deepscore/models/__init__.py +++ b/ms2deepscore/models/__init__.py @@ -1,8 +1,10 @@ from .load_model import load_model from .SiameseModel import SiameseModel +from .SiameseSpectralModel import SiameseSpectralModel __all__ = [ "load_model", "SiameseModel", + "SiameseSpectralModel", ] diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py new file mode 100644 index 00000000..053ea6dd --- /dev/null +++ b/tests/test_siamese_spectra_model.py @@ -0,0 +1,46 @@ +import pytest +from matchms import Spectrum +from ms2deepscore.models import SiameseSpectralModel + + +@pytest.fixture +def dummy_spectra(): + # Example inputs + spectrum1 = Spectrum(mz=np.array([101, 202, 303.0]), intensities=np.array([0.1, 0.2, 1.0])) + spectrum2 = Spectrum(mz=np.array([101.5, 202.5, 303.0]), intensities=np.array([0.1, 0.2, 1.0])) + return [spectrum1, spectrum2] + + +def test_siamese_model_defaults(): + # Create the model instance + model = SiameseSpectralModel() + + assert model.model_parameters == { + 'min_mz': 0, + 'max_mz': 1000, + 'mz_bin_width': 0.01, + 'base_dims': (1000, 800, 800), + 'embedding_dim': 400, + 'train_binning_layer': True, + 'group_size': 30, + 'output_per_group': 3, + 'dropout_rate': 0.2 + } + + +def 
test_siamese_model_forward_pass(dummy_spectra): + model = SiameseSpectralModel() + similarity_score = model([dummy_spectra]) + assert similarity_score.shape[0] == 1 + + similarity_score = model([dummy_spectra, dummy_spectra]) + assert similarity_score.shape[0] == 2 + + +def test_siamese_model_no_binning_layer(dummy_spectra): + model = SiameseSpectralModel(train_binning_layer=False) + assert not model.model_parameters["train_binning_layer"] + + # Test forward pass + similarity_score = model([dummy_spectra]) + assert similarity_score.shape[0] == 1 From 0757ca2353a11131c4494888cb67ecf82368947c Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 14 Dec 2023 21:54:00 +0100 Subject: [PATCH 06/40] remove tensorflow version checks --- .github/workflows/CI_build.yml | 35 ---------------------------------- 1 file changed, 35 deletions(-) diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml index 02c1589e..0865f765 100644 --- a/.github/workflows/CI_build.yml +++ b/.github/workflows/CI_build.yml @@ -69,38 +69,3 @@ jobs: - name: Run tests run: | pytest - - tensorflow_check: - name: Tensorflow version check / python-3.8 / ubuntu-latest - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Python info - run: | - which python - python --version - - name: Install Tensorflow version 2.6 - run: | - python -m pip install --upgrade pip - pip install "tensorflow>=2.6,<2.7" - - name: Install other dependencies - run: | - pip install -e .[dev,train] - - name: Show pip list - run: | - pip list - - name: Run test with tensorflow version 2.6 - run: pytest - - name: Install Tensorflow version 2.8 - run: | - pip install --upgrade "numpy<1.24.0" - pip install --upgrade "tensorflow>=2.8,<2.9" - - name: Show pip list - run: | - pip list - - name: Run test with tensorflow version 2.8 - run: pytest From 9dba96862826a274391c5945cd51cc0201e67507 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 14 Dec 2023 22:01:34 +0100 Subject: [PATCH 07/40] python 3.8-3.10 to 3.9-3.11 --- .github/workflows/CI_build.yml | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI_build.yml b/.github/workflows/CI_build.yml index 0865f765..df37bfff 100644 --- a/.github/workflows/CI_build.yml +++ b/.github/workflows/CI_build.yml @@ -44,7 +44,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] exclude: # already tested in first_check job - python-version: 3.9 diff --git a/setup.py b/setup.py index af89a777..d61db4ee 100644 --- a/setup.py +++ b/setup.py @@ -26,9 +26,9 @@ license="Apache Software License 2.0", zip_safe=False, test_suite="tests", - python_requires='>=3.8', + python_requires='>=3.9', install_requires=[ - "matchms>=0.14.0", + "matchms>=0.18.0", "numba", "numpy>= 1.20.3", "pandas", From 2662d8851b97d3084d9afd8febffd4eddf72a03d Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 14 Dec 2023 22:01:46 +0100 Subject: [PATCH 08/40] add missing import --- tests/test_siamese_spectra_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 053ea6dd..02e56ece 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -1,4 +1,5 @@ import pytest +import numpy as np from matchms import Spectrum from ms2deepscore.models 
import SiameseSpectralModel From 4b16ee2d2b51777fa15664f630f5accdef44af49 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Wed, 20 Dec 2023 22:02:54 +0100 Subject: [PATCH 09/40] add peak scaling --- ms2deepscore/models/SiameseSpectralModel.py | 17 ++++++++++++----- tests/test_siamese_spectra_model.py | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 24285a42..eefea93f 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -16,6 +16,7 @@ def __init__(self, mz_bin_width=0.01, base_dims: tuple[int, ...] = (1000, 800, 800), embedding_dim: int = 400, + intensity_scaling: float = 0.5, train_binning_layer: bool = True, group_size: int = 30, output_per_group: int = 3, @@ -38,6 +39,9 @@ def __init__(self, layers of the base model embedding_dim Dimension of the embedding (i.e. the output of the base model) + intensity_scaling + To put more attention on small and medium intensity peaks, peak intensities are + scaled by intensity to the power of intensity_scaling. train_binning_layer Default is True in which case the model contains a first dense multi-group peak binning layer. group_size @@ -58,6 +62,7 @@ def __init__(self, "mz_bin_width": mz_bin_width, "base_dims": base_dims, "embedding_dim": embedding_dim, + "intensity_scaling": intensity_scaling, "train_binning_layer": train_binning_layer, "group_size": group_size, "output_per_group": output_per_group, @@ -76,12 +81,13 @@ def forward(self, spectra_pairs): return cos_sim class BinnedSpectraLayer(nn.Module): - def __init__(self, min_mz, max_mz, mz_bin_width): + def __init__(self, min_mz, max_mz, mz_bin_width, intensity_scaling): super(BinnedSpectraLayer, self).__init__() self.min_mz = min_mz self.max_mz = max_mz self.mz_bin_width = mz_bin_width self.num_bins = int((max_mz - min_mz) / mz_bin_width) + self.intensity_scaling = intensity_scaling def forward(self, spectra): # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) @@ -91,7 +97,7 @@ def forward(self, spectra): for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): if self.min_mz <= mz < self.max_mz: bin_index = int((mz - self.min_mz) / self.mz_bin_width) - binned_spectra[i, bin_index] += intensity + binned_spectra[i, bin_index] += intensity ** self.intensity_scaling return binned_spectra @@ -121,16 +127,17 @@ def output_size(self): class SpectralEncoder(nn.Module): def __init__(self, min_mz: float, max_mz: float, mz_bin_width: float, - base_dims, embedding_dim, dropout_rate, + base_dims, embedding_dim, intensity_scaling, dropout_rate, train_binning_layer: bool, group_size: int, output_per_group: int): super(SpectralEncoder, self).__init__() - self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width) + self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) self.train_binning_layer = train_binning_layer # First dense layer (no dropout!) 
self.dense_layers = [] if self.train_binning_layer: - self.peak_binner = PeakBinner(self.binning_layer.num_bins, group_size, output_per_group) + self.peak_binner = PeakBinner(self.binning_layer.num_bins, + group_size, output_per_group) self.dense_layers.append(nn.Linear(self.peak_binner.output_size(), base_dims[0])) else: self.dense_layers.append(nn.Linear(self.binning_layer.num_bins, base_dims[0])) diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 02e56ece..70eccf98 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -22,6 +22,7 @@ def test_siamese_model_defaults(): 'mz_bin_width': 0.01, 'base_dims': (1000, 800, 800), 'embedding_dim': 400, + 'intensity_scaling': 0.5, 'train_binning_layer': True, 'group_size': 30, 'output_per_group': 3, From a04a1b2ef0b357cfbcaf26c3781d3155aed22b4a Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Wed, 20 Dec 2023 23:14:09 +0100 Subject: [PATCH 10/40] add first metadata vectorizer --- ms2deepscore/MetadataFeatureGenerator.py | 47 +++++++++++++++++++++++- tests/test_MetadataFeatureGenerator.py | 13 ++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/ms2deepscore/MetadataFeatureGenerator.py b/ms2deepscore/MetadataFeatureGenerator.py index 8cde7742..92c044de 100644 --- a/ms2deepscore/MetadataFeatureGenerator.py +++ b/ms2deepscore/MetadataFeatureGenerator.py @@ -1,7 +1,52 @@ import json from importlib import import_module -from typing import List, Union +from typing import List, Tuple, Union +import numpy as np from matchms import Metadata +from matchms.typing import SpectrumType +from tqdm import tqdm +from .typing import BinnedSpectrumType + + +class MetadataVectorizer: + """Create a numerical vector of selected metadata field including transformations.. + """ + + def __init__(self, + additional_metadata = ()): + """ + + Parameters + ---------- + additional_metadata: + List of all metadata used/wanted in a metadata vector. Default is (). + """ + self.additional_metadata = additional_metadata + + def transform(self, input_spectrums: List[SpectrumType], + progress_bar=False) -> List[BinnedSpectrumType]: + """Transforms the input *spectrums* into metadata vectors as needed for + MS2DeepScore. + + Parameters + ---------- + input_spectrums + List of spectrums. + progress_bar + Show progress bar if set to True. Default is False. + + Returns: + List of metadata vectors. 
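+
+        A minimal sketch of the intended use (mirroring the accompanying test;
+        ``spectrum`` stands for a matchms Spectrum whose metadata contains a
+        "mass" entry):
+
+        .. code-block:: python
+
+            vectorizer = MetadataVectorizer([StandardScaler("mass", 200.0, 250.0)])
+            vectors = vectorizer.transform([spectrum])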
+ """ + metadata_vectors = [] + for spec in tqdm(input_spectrums, + desc="Create metadata vectors", + disable=(not progress_bar)): + additional_metadata = \ + np.array([feature_generator.generate_features(spec.metadata) + for feature_generator in self.additional_metadata]) + metadata_vectors.append(additional_metadata) + return metadata_vectors class MetadataFeatureGenerator: diff --git a/tests/test_MetadataFeatureGenerator.py b/tests/test_MetadataFeatureGenerator.py index 03623cf0..fd6c9cb1 100644 --- a/tests/test_MetadataFeatureGenerator.py +++ b/tests/test_MetadataFeatureGenerator.py @@ -1,7 +1,9 @@ import pytest -from matchms import Metadata +import numpy as np +from matchms import Metadata, Spectrum from ms2deepscore.MetadataFeatureGenerator import (CategoricalToBinary, MetadataFeatureGenerator, + MetadataVectorizer, OneHotEncoder, StandardScaler) @@ -21,6 +23,15 @@ def test_metadatafeaturegenerator_not_implemented(metadata): MetadataFeatureGenerator.load_from_dict({}) +def test_metadata_vectorizer(metadata): + scaler = StandardScaler("mass", 200.0, 250.0) + metadata = {"mass": 220.0} + s1 = Spectrum(mz=np.array([100.]), intensities=np.array([1.0]), metadata=metadata) + vectorizer = MetadataVectorizer([scaler]) + expected_value = (220 - 200) / 250 + assert vectorizer.transform([s1]) == np.array([expected_value]) + + def test_standard_scaler_generate_features_with_std(): scaler = StandardScaler("mass", 5.0, 1.0) metadata = Metadata({"mass": 7.0}) From f0d435fd40ebf72b2c0522b62fb1344c164a3716 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 14:07:52 +0100 Subject: [PATCH 11/40] add metadata fields and tests --- ms2deepscore/MetadataFeatureGenerator.py | 24 +++++++------ ms2deepscore/models/SiameseSpectralModel.py | 37 +++++++++++++++++---- tests/test_MetadataFeatureGenerator.py | 5 ++- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/ms2deepscore/MetadataFeatureGenerator.py b/ms2deepscore/MetadataFeatureGenerator.py index 92c044de..2eaba40e 100644 --- a/ms2deepscore/MetadataFeatureGenerator.py +++ b/ms2deepscore/MetadataFeatureGenerator.py @@ -1,7 +1,8 @@ import json from importlib import import_module -from typing import List, Tuple, Union +from typing import List, Union import numpy as np +import torch from matchms import Metadata from matchms.typing import SpectrumType from tqdm import tqdm @@ -23,31 +24,34 @@ def __init__(self, """ self.additional_metadata = additional_metadata - def transform(self, input_spectrums: List[SpectrumType], + def transform(self, spectra: List[SpectrumType], progress_bar=False) -> List[BinnedSpectrumType]: """Transforms the input *spectrums* into metadata vectors as needed for MS2DeepScore. Parameters ---------- - input_spectrums - List of spectrums. + spectra + List of spectra. progress_bar Show progress bar if set to True. Default is False. Returns: List of metadata vectors. 
""" - metadata_vectors = [] - for spec in tqdm(input_spectrums, + metadata_vectors = torch.zeros((len(spectra), self.size)) + for i, spec in tqdm(enumerate(spectra), desc="Create metadata vectors", disable=(not progress_bar)): - additional_metadata = \ - np.array([feature_generator.generate_features(spec.metadata) - for feature_generator in self.additional_metadata]) - metadata_vectors.append(additional_metadata) + metadata_vectors[i, :] = \ + torch.tensor([feature_generator.generate_features(spec.metadata) + for feature_generator in self.additional_metadata]) return metadata_vectors + @property + def size(self): + return len(self.additional_metadata) + class MetadataFeatureGenerator: """Base class to define metadata-to-feature conversion rules. diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index eefea93f..c3282d7e 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -3,6 +3,8 @@ import torch.nn.functional as F import torch.optim as optim +from ms2deepscore.MetadataFeatureGenerator import MetadataVectorizer + class SiameseSpectralModel(nn.Module): """ @@ -21,7 +23,7 @@ def __init__(self, group_size: int = 30, output_per_group: int = 3, dropout_rate: float = 0.2, - + metadata_vectorizer: MetadataVectorizer = None, ): """ Construct SiameseSpectralModel @@ -54,6 +56,9 @@ def __init__(self, pytorch_model When provided, this pytorch model will be used to construct the SiameseModel instance. Default is None. + metadata_vectorizer + Add the specific MetadataVectorizer object for your data if the model should contain specific + metadata entries as input. Default is set to None which means this will be ignored. """ super(SiameseSpectralModel, self).__init__() self.model_parameters = { @@ -69,7 +74,8 @@ def __init__(self, "dropout_rate": dropout_rate, #TODO: add ms2deepscore version } - self.encoder = SpectralEncoder(**self.model_parameters) + self.metadata_vectorizer = metadata_vectorizer + self.encoder = SpectralEncoder(metadata_vectorizer=metadata_vectorizer, **self.model_parameters) def forward(self, spectra_pairs): # Pass both inputs through the same encoder @@ -80,6 +86,7 @@ def forward(self, spectra_pairs): cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)(encoded_x1, encoded_x2) return cos_sim + class BinnedSpectraLayer(nn.Module): def __init__(self, min_mz, max_mz, mz_bin_width, intensity_scaling): super(BinnedSpectraLayer, self).__init__() @@ -101,6 +108,7 @@ def forward(self, spectra): return binned_spectra + class PeakBinner(nn.Module): def __init__(self, input_size, group_size, output_per_group): super(PeakBinner, self).__init__() @@ -126,21 +134,35 @@ def output_size(self): return self.groups * self.output_per_group class SpectralEncoder(nn.Module): - def __init__(self, min_mz: float, max_mz: float, mz_bin_width: float, - base_dims, embedding_dim, intensity_scaling, dropout_rate, - train_binning_layer: bool, group_size: int, output_per_group: int): + def __init__(self, min_mz: float, + max_mz: float, + mz_bin_width: float, + base_dims, + embedding_dim, + intensity_scaling, + dropout_rate, + train_binning_layer: bool, group_size: int, output_per_group: int, + metadata_vectorizer, + ): super(SpectralEncoder, self).__init__() self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) + self.metadata_vectorizer = metadata_vectorizer self.train_binning_layer = train_binning_layer + # Consider additing metadata vector + if metadata_vectorizer is None: + 
additional_inputs = 0 + else: + additional_inputs = metadata_vectorizer.size # First dense layer (no dropout!) self.dense_layers = [] if self.train_binning_layer: self.peak_binner = PeakBinner(self.binning_layer.num_bins, group_size, output_per_group) - self.dense_layers.append(nn.Linear(self.peak_binner.output_size(), base_dims[0])) + input_size = self.peak_binner.output_size() + additional_inputs else: - self.dense_layers.append(nn.Linear(self.binning_layer.num_bins, base_dims[0])) + input_size = self.binning_layer.num_bins + additional_inputs + self.dense_layers.append(nn.Linear(input_size, base_dims[0])) input_dim = base_dims[0] # Create additional dense layers @@ -153,6 +175,7 @@ def __init__(self, min_mz: float, max_mz: float, mz_bin_width: float, def forward(self, spectra): binned_spectra = self.binning_layer(spectra) + metadata_vector = self.metadata_vectorizer(spectra) if self.train_binning_layer: x = self.peak_binner(binned_spectra) x = F.relu(self.dense_layers[0](x)) diff --git a/tests/test_MetadataFeatureGenerator.py b/tests/test_MetadataFeatureGenerator.py index fd6c9cb1..986819fb 100644 --- a/tests/test_MetadataFeatureGenerator.py +++ b/tests/test_MetadataFeatureGenerator.py @@ -1,5 +1,6 @@ import pytest import numpy as np +import torch from matchms import Metadata, Spectrum from ms2deepscore.MetadataFeatureGenerator import (CategoricalToBinary, MetadataFeatureGenerator, @@ -29,7 +30,9 @@ def test_metadata_vectorizer(metadata): s1 = Spectrum(mz=np.array([100.]), intensities=np.array([1.0]), metadata=metadata) vectorizer = MetadataVectorizer([scaler]) expected_value = (220 - 200) / 250 - assert vectorizer.transform([s1]) == np.array([expected_value]) + assert vectorizer.transform([s1]) == torch.tensor([expected_value]) + assert (vectorizer.transform([s1, s1]) == torch.tensor([expected_value, expected_value])).all() + assert vectorizer.size == 1 def test_standard_scaler_generate_features_with_std(): From c7a3457457f76154eeebaf05fe8111b635ff0dae Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 14:46:00 +0100 Subject: [PATCH 12/40] fix additional inputs and add to architecture --- ms2deepscore/models/SiameseSpectralModel.py | 10 ++++-- tests/test_siamese_spectra_model.py | 40 ++++++++++++++++++--- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index c3282d7e..d9e4923f 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -175,12 +175,16 @@ def __init__(self, min_mz: float, def forward(self, spectra): binned_spectra = self.binning_layer(spectra) - metadata_vector = self.metadata_vectorizer(spectra) + if self.metadata_vectorizer is not None: + metadata_vector = self.metadata_vectorizer.transform(spectra) + else: + metadata_vector = torch.tensor([]) if self.train_binning_layer: x = self.peak_binner(binned_spectra) - x = F.relu(self.dense_layers[0](x)) + x = torch.cat([metadata_vector, x], dim=1) else: - x = F.relu(self.dense_layers[0](binned_spectra)) + x = torch.cat([metadata_vector, binned_spectra], dim=1) + x = F.relu(self.dense_layers[0](x)) for layer in self.dense_layers[1:]: x = F.relu(layer(x)) diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 70eccf98..0cd94a54 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -2,13 +2,18 @@ import numpy as np from matchms import Spectrum from ms2deepscore.models import 
SiameseSpectralModel - +from ms2deepscore.MetadataFeatureGenerator import (MetadataVectorizer, + StandardScaler) @pytest.fixture def dummy_spectra(): - # Example inputs - spectrum1 = Spectrum(mz=np.array([101, 202, 303.0]), intensities=np.array([0.1, 0.2, 1.0])) - spectrum2 = Spectrum(mz=np.array([101.5, 202.5, 303.0]), intensities=np.array([0.1, 0.2, 1.0])) + spectrum1 = Spectrum(mz=np.array([101, 202, 303.0]), + intensities=np.array([0.1, 0.2, 1.0]), + metadata={"precursor_mz": 222.2} + ) + spectrum2 = Spectrum(mz=np.array([101.5, 202.5, 303.0]), + intensities=np.array([0.1, 0.2, 1.0]), + metadata={"precursor_mz": 333.3}) return [spectrum1, spectrum2] @@ -40,9 +45,34 @@ def test_siamese_model_forward_pass(dummy_spectra): def test_siamese_model_no_binning_layer(dummy_spectra): - model = SiameseSpectralModel(train_binning_layer=False) + model = SiameseSpectralModel(mz_bin_width=0.1, train_binning_layer=False) assert not model.model_parameters["train_binning_layer"] # Test forward pass similarity_score = model([dummy_spectra]) assert similarity_score.shape[0] == 1 + + +def test_siamese_model_additional_metadata(dummy_spectra): + scaler = StandardScaler("precursor_mz", 200.0, 250.0) + vectorizer = MetadataVectorizer([scaler]) + model = SiameseSpectralModel(mz_bin_width=0.1, train_binning_layer=False, metadata_vectorizer=vectorizer) + + # Test forward pass + similarity_score = model([dummy_spectra]) + assert similarity_score.shape[0] == 1 + assert model.encoder.dense_layers[0].weight.shape[1] == 10001 + + # Include dense binning layer + model = SiameseSpectralModel(mz_bin_width=0.1, metadata_vectorizer=vectorizer) + + # Test forward pass + similarity_score = model([dummy_spectra]) + assert model.encoder.dense_layers[0].weight.shape[1] == 1000 + + # Compare to no metadata_vectorizer + model = SiameseSpectralModel(mz_bin_width=0.1, metadata_vectorizer=None) + + # Test forward pass + similarity_score = model([dummy_spectra]) + assert model.encoder.dense_layers[0].weight.shape[1] == 999 \ No newline at end of file From 41be0b3831f3b095154ef0602d40f90957a064ad Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 15:05:09 +0100 Subject: [PATCH 13/40] linting --- ms2deepscore/MetadataFeatureGenerator.py | 1 - ms2deepscore/models/SiameseSpectralModel.py | 44 ++++++++++++++++++--- tests/test_siamese_spectra_model.py | 2 +- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/ms2deepscore/MetadataFeatureGenerator.py b/ms2deepscore/MetadataFeatureGenerator.py index 2eaba40e..00777651 100644 --- a/ms2deepscore/MetadataFeatureGenerator.py +++ b/ms2deepscore/MetadataFeatureGenerator.py @@ -1,7 +1,6 @@ import json from importlib import import_module from typing import List, Union -import numpy as np import torch from matchms import Metadata from matchms.typing import SpectrumType diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index d9e4923f..cd4480f7 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -1,7 +1,7 @@ import torch -import torch.nn as nn +from torch import nn import torch.nn.functional as F -import torch.optim as optim +from torch import optim from ms2deepscore.MetadataFeatureGenerator import MetadataVectorizer @@ -60,7 +60,7 @@ def __init__(self, Add the specific MetadataVectorizer object for your data if the model should contain specific metadata entries as input. Default is set to None which means this will be ignored. 
""" - super(SiameseSpectralModel, self).__init__() + super().__init__() self.model_parameters = { "min_mz": min_mz, "max_mz": max_mz, @@ -89,7 +89,7 @@ def forward(self, spectra_pairs): class BinnedSpectraLayer(nn.Module): def __init__(self, min_mz, max_mz, mz_bin_width, intensity_scaling): - super(BinnedSpectraLayer, self).__init__() + super().__init__() self.min_mz = min_mz self.max_mz = max_mz self.mz_bin_width = mz_bin_width @@ -111,7 +111,7 @@ def forward(self, spectra): class PeakBinner(nn.Module): def __init__(self, input_size, group_size, output_per_group): - super(PeakBinner, self).__init__() + super().__init__() self.group_size = group_size self.output_per_group = output_per_group self.groups = input_size // group_size @@ -144,7 +144,38 @@ def __init__(self, min_mz: float, train_binning_layer: bool, group_size: int, output_per_group: int, metadata_vectorizer, ): - super(SpectralEncoder, self).__init__() + """ + Parameters + ---------- + min_mz + Lower bound for m/z values to consider. + max_mz + Upper bound for m/z values to consider. + mz_bin_width + Bin width for m/z sampling. + base_dims + Tuple of integers depicting the dimensions of the desired hidden + layers of the base model + embedding_dim + Dimension of the embedding (i.e. the output of the base model) + intensity_scaling + To put more attention on small and medium intensity peaks, peak intensities are + scaled by intensity to the power of intensity_scaling. + train_binning_layer + Default is True in which case the model contains a first dense multi-group peak binning layer. + group_size + When binning layer is used the group_size determins how many input bins are taken into + one dense micro-network. + output_per_group + This sets the number of next layer bins each group_size sized group of inputs shares. + dropout_rate + Dropout rate to be used in the base model. + metadata_vectorizer + Add the specific MetadataVectorizer object for your data if the model should contain specific + metadata entries as input. Default is set to None which means this will be ignored. 
+ """ + # pylint: disable=too-many-arguments, too-many-locals + super().__init__() self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) self.metadata_vectorizer = metadata_vectorizer self.train_binning_layer = train_binning_layer @@ -199,6 +230,7 @@ def forward(self, spectra): def train(model, train_loader, num_epochs, learning_rate, lambda_l1=1e-6, lambda_l2=1e-6): + # pylint: disable=too-many-arguments criterion = nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 0cd94a54..9638493c 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -75,4 +75,4 @@ def test_siamese_model_additional_metadata(dummy_spectra): # Test forward pass similarity_score = model([dummy_spectra]) - assert model.encoder.dense_layers[0].weight.shape[1] == 999 \ No newline at end of file + assert model.encoder.dense_layers[0].weight.shape[1] == 999 From 429886096f6925fe59dd641e7593d8a416e847f3 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 15:14:54 +0100 Subject: [PATCH 14/40] linting --- ms2deepscore/models/SiameseSpectralModel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index cd4480f7..a34d107f 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -60,6 +60,7 @@ def __init__(self, Add the specific MetadataVectorizer object for your data if the model should contain specific metadata entries as input. Default is set to None which means this will be ignored. """ + # pylint: disable=too-many-arguments super().__init__() self.model_parameters = { "min_mz": min_mz, From 0518856324f754727614275fe8b112d329f669ce Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 16:35:02 +0100 Subject: [PATCH 15/40] code cleaning and bit of numba --- .../spectrum_pair_selection.py | 63 ++++++++++++------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/ms2deepscore/train_new_model/spectrum_pair_selection.py b/ms2deepscore/train_new_model/spectrum_pair_selection.py index 20fd6a3e..0a0788a8 100644 --- a/ms2deepscore/train_new_model/spectrum_pair_selection.py +++ b/ms2deepscore/train_new_model/spectrum_pair_selection.py @@ -1,5 +1,6 @@ from collections import Counter from typing import List, Optional, Tuple +from numba import jit import numpy as np from matchms import Spectrum from matchms.filtering import add_fingerprint @@ -172,36 +173,43 @@ def compute_jaccard_similarity_per_bin( """ # pylint: disable=too-many-locals size = fingerprints.shape[0] - # initialize storing scores - selected_pairs_per_bin = [[] for _ in range(len(selection_bins))] + num_bins = len(selection_bins) + + # Preallocate arrays instead of using dynamic lists + selected_pairs_per_bin = [[[] for _ in range(size)] for _ in range(num_bins)] - # loop over the fingerprints for idx_fingerprint_i in range(size): fingerprint_i = fingerprints[idx_fingerprint_i, :] + tanimoto_scores = tanimoto_scores_row(fingerprints, idx_fingerprint_i, include_diagonal) - # Calculate all tanimoto scores for 1 fingerprint - tanimoto_scores = np.zeros(size) - for idx_fingerprint_j in range(size): - if idx_fingerprint_i == idx_fingerprint_j and not include_diagonal: - # skip matching fingerprint score against itself. 
- continue - fingerprint_j = fingerprints[idx_fingerprint_j, :] - tanimoto_score = jaccard_index(fingerprint_i, fingerprint_j) - tanimoto_scores[idx_fingerprint_j] = tanimoto_score - - # Select pairs per bin with a maximum of max_pairs_per_bin - for bin_number, selection_bin in enumerate(selection_bins): - selected_pairs_per_bin[bin_number].append([]) - # Indices of scores within the current bin + for bin_number in range(num_bins): + selection_bin = selection_bins[bin_number] + idx = np.where((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - # Randomly select up to max_pairs_per_bin scores within the bin np.random.shuffle(idx) - idx_selected = idx[:max_pairs_per_bin] - for index in idx_selected: + if max_pairs_per_bin is not None: + idx = idx[:max_pairs_per_bin] + for index in idx: selected_pairs_per_bin[bin_number][idx_fingerprint_i].append((index, tanimoto_scores[index])) + return selected_pairs_per_bin +@jit(nopython=True) +def tanimoto_scores_row(fingerprints, idx, include_diagonal): + size = fingerprints.shape[0] + tanimoto_scores = np.zeros(size) + + fingerprint_i = fingerprints[idx, :] + for idx_fingerprint_j in range(size): + if idx == idx_fingerprint_j and not include_diagonal: + continue + fingerprint_j = fingerprints[idx_fingerprint_j, :] + tanimoto_score = jaccard_index(fingerprint_i, fingerprint_j) + tanimoto_scores[idx_fingerprint_j] = tanimoto_score + return tanimoto_scores + + def fix_bias(selected_pairs_per_bin, expected_average_pairs_per_bin): """ Adjusts the selected pairs for each bin to align with the expected average pairs per bin. @@ -245,11 +253,23 @@ def get_nr_of_pairs_needed_to_fix_bias(nr_of_pairs_in_bin_per_compound: List[int def compute_fingerprints_for_training(spectrums, fingerprint_type: str = "daylight", nbits: int = 2048): - """Calculates fingerprints for each unique inchikey and removes spectra for which no fingerprint could be created""" + """Calculates fingerprints for each unique inchikey. + + Function also removes spectra for which no fingerprint could be created. + + Parameters + ---------- + fingerprint_type: + The fingerprint type that should be used for tanimoto score calculations. + fingerprint_nbits: + The number of bits to use for the fingerprint. 
+ """ if len(spectrums) == 0: raise ValueError("No spectra were selected to calculate fingerprints") + spectra_selected, inchikeys14_unique = select_inchi_for_unique_inchikeys(spectrums) print(f"Selected {len(spectra_selected)} spectra with unique inchikeys (out of {len(spectrums)} spectra)") + # Compute fingerprints using matchms spectra_selected = [add_fingerprint(s, fingerprint_type, nbits)\ if s.get("fingerprint") is None else s for s in spectra_selected] @@ -262,6 +282,7 @@ def compute_fingerprints_for_training(spectrums, raise ValueError("No fingerprints could be computed") if len(idx) < len(fingerprints): print(f"Successfully generated fingerprints for {len(idx)} of {len(fingerprints)} spectra") + fingerprints = np.array([fingerprints[i] for i in idx]) inchikeys14_unique = [inchikeys14_unique[i] for i in idx] spectra_selected = [spectra_selected[i] for i in idx] From 793943b99f0a34038b2c726d62345d87f57a5c32 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 21 Dec 2023 16:42:10 +0100 Subject: [PATCH 16/40] linting --- ms2deepscore/train_new_model/spectrum_pair_selection.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2deepscore/train_new_model/spectrum_pair_selection.py b/ms2deepscore/train_new_model/spectrum_pair_selection.py index 0a0788a8..efd320cc 100644 --- a/ms2deepscore/train_new_model/spectrum_pair_selection.py +++ b/ms2deepscore/train_new_model/spectrum_pair_selection.py @@ -171,7 +171,6 @@ def compute_jaccard_similarity_per_bin( A list were the indexes are the bin numbers. This contains Lists were the index is the spectrum_i index. This list contains a Tuple, with first the spectrum_j index and second the score. """ - # pylint: disable=too-many-locals size = fingerprints.shape[0] num_bins = len(selection_bins) @@ -179,7 +178,6 @@ def compute_jaccard_similarity_per_bin( selected_pairs_per_bin = [[[] for _ in range(size)] for _ in range(num_bins)] for idx_fingerprint_i in range(size): - fingerprint_i = fingerprints[idx_fingerprint_i, :] tanimoto_scores = tanimoto_scores_row(fingerprints, idx_fingerprint_i, include_diagonal) for bin_number in range(num_bins): From 1a9f8d392c27eef08e51fa50b71583a88ff70d73 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 4 Jan 2024 11:32:16 +0100 Subject: [PATCH 17/40] add pytorch compatible generator --- ms2deepscore/data_generators.py | 119 ++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 4f36bb3a..774fe9f4 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -1,6 +1,7 @@ """ Data generators for training/inference with siamese Keras model. """ from typing import Iterator, List, NamedTuple, Optional +from matchms import Spectrum import numpy as np import pandas as pd from tensorflow.keras.utils import Sequence # pylint: disable=import-error @@ -11,6 +12,124 @@ from .typing import BinnedSpectrumType +from typing import Iterator +from torch.utils.data import Dataset, DataLoader +from ms2deepscore.train_new_model.spectrum_pair_selection import \ + SelectedCompoundPairs +from ms2deepscore.train_new_model.SettingMS2Deepscore import GeneratorSettings + + +class DataGeneratorPytorch: + """Generates data for training a siamese Keras model. + + This class provides a data generator specifically + designed for training a siamese Keras model with a curated set of compound pairs. 
+ It uses pre-selected compound pairs, allowing more control over the training process, + particularly in scenarios where certain compound pairs are of specific interest or + have higher significance in the training dataset. + """ + def __init__(self, spectrums: list[Spectrum], + selected_compound_pairs: SelectedCompoundPairs, + **settings): + """Generates data for training a siamese Keras model. + + Parameters + ---------- + spectrums + List of matchms Spectrum objects. + selected_compound_pairs + SelectedCompoundPairs object which contains selected compounds pairs and the + respective similarity scores. + settings + The available settings can be found in GeneratorSettings + """ + self.current_index = 0 + self.spectrums = spectrums + + # Collect all inchikeys + self.spectrum_inchikeys = np.array([s.get("inchikey")[:14] for s in self.spectrums]) + + # Set all other settings to input (or otherwise to defaults): + self.settings = GeneratorSettings(settings) + unique_inchikeys = np.unique(self.spectrum_inchikeys) + if len(unique_inchikeys) < self.settings.batch_size: + raise ValueError("The number of unique inchikeys must be larger than the batch size.") + self.fixed_set = {} + self.selected_compound_pairs = selected_compound_pairs + self.on_epoch_end() + + def __len__(self): + return int(self.settings.num_turns)\ + * int(np.ceil(len(self.selected_compound_pairs.scores) / self.settings.batch_size)) + + def __iter__(self): + return self + + def __next__(self): + if self.current_index < self.__len__(): + batch = self.__getitem__(self.current_index) + self.current_index += 1 + return batch + else: + self.current_index = 0 # make generator executable again + raise StopIteration + + def _spectrum_pair_generator(self, batch_index: int): + """Use the provided SelectedCompoundPairs object to pick pairs.""" + batch_size = self.settings.batch_size + indexes = self.indexes[batch_index * batch_size:(batch_index + 1) * batch_size] + for index in indexes: + inchikey1 = self.selected_compound_pairs.idx_to_inchikey[index] + score, inchikey2 = self.selected_compound_pairs.next_pair_for_inchikey(inchikey1) + spectrum1 = self._get_spectrum_with_inchikey(inchikey1) + spectrum2 = self._get_spectrum_with_inchikey(inchikey2) + yield (spectrum1, spectrum2, score) + + def on_epoch_end(self): + """Updates indexes after each epoch""" + self.indexes = np.tile(np.arange(len(self.selected_compound_pairs.scores)), int(self.settings.num_turns)) + if self.settings.shuffle: + np.random.shuffle(self.indexes) + + def __getitem__(self, batch_index: int): + """Generate one batch of data. + + If use_fixed_set=True we try retrieving the batch from self.fixed_set (or store it if + this is the first epoch). This ensures a fixed set of data is generated each epoch. 
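+
+        A rough sketch of what one batch looks like (``generator`` stands for a
+        DataGeneratorPytorch instance):
+
+        .. code-block:: python
+
+            pairs, targets = generator[0]       # first batch
+            spectrum_1, spectrum_2 = pairs[0]   # a pair of matchms Spectrum objects
+            score = targets[0]                  # corresponding tanimoto score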
+ """ + if self.settings.use_fixed_set and batch_index in self.fixed_set: + return self.fixed_set[batch_index] + if self.settings.random_seed is not None and batch_index == 0: + np.random.seed(self.settings.random_seed) + spectrum_pairs = self._spectrum_pair_generator(batch_index) + pairs, targets = self._split_pairs_and_targets(spectrum_pairs) + # X, y = self._data_generation(spectrum_pairs) + if self.settings.use_fixed_set: + # Store batches for later epochs + self.fixed_set[batch_index] = (pairs, targets) + return pairs, targets + + def _split_pairs_and_targets(self, spectrum_pairs): + pairs = [] + targets = [] + for pair in spectrum_pairs: + pairs.append((pair[0], pair[1])) + targets.append(pair[2]) + return pairs, targets + + def _get_spectrum_with_inchikey(self, inchikey: str) -> Spectrum: + """ + Get a random spectrum matching the `inchikey` argument. + + NB: A compound (identified by an + inchikey) can have multiple measured spectrums in a binned spectrum dataset. + """ + matching_spectrum_id = np.where(self.spectrum_inchikeys == inchikey)[0] + if len(matching_spectrum_id) <= 0: + raise ValueError("No matching inchikey found (note: expected first 14 characters)") + return self.spectrums[np.random.choice(matching_spectrum_id)] + + class SpectrumPair(NamedTuple): """ Represents a pair of binned spectrums From 60dd1f1629ef4b0ac7b96f4a1f7fd8e15dcef036 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 4 Jan 2024 11:52:32 +0100 Subject: [PATCH 18/40] adjust label tensor format --- ms2deepscore/data_generators.py | 3 ++- ms2deepscore/models/SiameseSpectralModel.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 774fe9f4..19b97d35 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -4,6 +4,7 @@ from matchms import Spectrum import numpy as np import pandas as pd +import torch from tensorflow.keras.utils import Sequence # pylint: disable=import-error from ms2deepscore.SpectrumBinner import SpectrumBinner from ms2deepscore.train_new_model.spectrum_pair_selection import \ @@ -107,7 +108,7 @@ def __getitem__(self, batch_index: int): if self.settings.use_fixed_set: # Store batches for later epochs self.fixed_set[batch_index] = (pairs, targets) - return pairs, targets + return pairs, torch.tensor(targets, dtype=torch.float32) def _split_pairs_and_targets(self, spectrum_pairs): pairs = [] diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index a34d107f..63df79a1 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -228,7 +228,7 @@ def forward(self, spectra): ### Model training -def train(model, train_loader, num_epochs, learning_rate, +def train(model, data_generator, num_epochs, learning_rate, lambda_l1=1e-6, lambda_l2=1e-6): # pylint: disable=too-many-arguments @@ -237,7 +237,7 @@ def train(model, train_loader, num_epochs, learning_rate, model.train(True) for epoch in range(num_epochs): - for spectra, targets in train_loader: + for spectra, targets in data_generator: optimizer.zero_grad() # Forward pass From f248e1599459a764a50d6bb68e0008d7f4d3fee8 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Thu, 4 Jan 2024 14:10:47 +0100 Subject: [PATCH 19/40] fix training and progress bar --- ms2deepscore/data_generators.py | 1 + ms2deepscore/models/SiameseSpectralModel.py | 47 +++++++++++++-------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git 
a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 19b97d35..9cc99ee5 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -73,6 +73,7 @@ def __next__(self): return batch else: self.current_index = 0 # make generator executable again + self.on_epoch_end() raise StopIteration def _spectrum_pair_generator(self, batch_index: int): diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 63df79a1..f19f79d9 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -2,7 +2,7 @@ from torch import nn import torch.nn.functional as F from torch import optim - +from tqdm import tqdm from ms2deepscore.MetadataFeatureGenerator import MetadataVectorizer @@ -129,11 +129,12 @@ def forward(self, x): outputs[-1] = self.linear_layers[-1](x[:, i*self.group_size:(i+1)*self.group_size]) # Concatenate all outputs - return torch.cat(outputs, dim=1) + return F.relu(torch.cat(outputs, dim=1)) def output_size(self): return self.groups * self.output_per_group - + + class SpectralEncoder(nn.Module): def __init__(self, min_mz: float, max_mz: float, @@ -236,22 +237,32 @@ def train(model, data_generator, num_epochs, learning_rate, optimizer = optim.Adam(model.parameters(), lr=learning_rate) model.train(True) - for epoch in range(num_epochs): - for spectra, targets in data_generator: - optimizer.zero_grad() - # Forward pass - outputs = model(spectra) - # Calculate loss - loss = criterion(outputs, targets) - loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) - - # Backward pass and optimize - loss.backward() - optimizer.step() - - # Print statistics - print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}') + #starts = [i*data_generator.settings.batch_size for i in range(len(data_generator))] + for epoch in range(num_epochs): + #print(f"Epoch [{epoch+1}/{num_epochs}]:") + with tqdm(data_generator, unit="batch", mininterval=0) as bar: + bar.set_description(f"Epoch {epoch}") + for spectra, targets in bar: #tqdm(data_generator): + optimizer.zero_grad() + + # Forward pass + outputs = model(spectra) + # Calculate loss + loss = criterion(outputs, targets) + loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) + + # Backward pass and optimize + loss.backward() + optimizer.step() + + # Print progress + bar.set_postfix( + loss=float(loss), + ) + + # Print statistics + print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") ### Helper functions From 274bb12199bc8c16aeeef7984d9eef6b279f737c Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 16:20:36 +0100 Subject: [PATCH 20/40] linting --- ms2deepscore/data_generators.py | 15 ++++----------- ms2deepscore/models/SiameseSpectralModel.py | 7 +++---- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 9cc99ee5..a2d1209e 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -13,13 +13,6 @@ from .typing import BinnedSpectrumType -from typing import Iterator -from torch.utils.data import Dataset, DataLoader -from ms2deepscore.train_new_model.spectrum_pair_selection import \ - SelectedCompoundPairs -from ms2deepscore.train_new_model.SettingMS2Deepscore import GeneratorSettings - - class DataGeneratorPytorch: """Generates data for training a siamese Keras model. 
@@ -71,10 +64,10 @@ def __next__(self): batch = self.__getitem__(self.current_index) self.current_index += 1 return batch - else: - self.current_index = 0 # make generator executable again - self.on_epoch_end() - raise StopIteration + + self.current_index = 0 # make generator executable again + self.on_epoch_end() + raise StopIteration def _spectrum_pair_generator(self, batch_index: int): """Use the provided SelectedCompoundPairs object to pick pairs.""" diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index f19f79d9..dae579fb 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -240,10 +240,9 @@ def train(model, data_generator, num_epochs, learning_rate, #starts = [i*data_generator.settings.batch_size for i in range(len(data_generator))] for epoch in range(num_epochs): - #print(f"Epoch [{epoch+1}/{num_epochs}]:") - with tqdm(data_generator, unit="batch", mininterval=0) as bar: - bar.set_description(f"Epoch {epoch}") - for spectra, targets in bar: #tqdm(data_generator): + with tqdm(data_generator, unit="batch", mininterval=0) as training: + training.set_description(f"Epoch {epoch}") + for spectra, targets in training: optimizer.zero_grad() # Forward pass From e6d3b4ac9511a176044772edd0baa51d34f57eeb Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 16:26:38 +0100 Subject: [PATCH 21/40] add test for new generator --- tests/test_data_generators.py | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_data_generators.py b/tests/test_data_generators.py index 685097b9..3718d699 100644 --- a/tests/test_data_generators.py +++ b/tests/test_data_generators.py @@ -7,6 +7,7 @@ from ms2deepscore.data_generators import (DataGeneratorAllInchikeys, DataGeneratorAllSpectrums, DataGeneratorCherrypicked, + DataGeneratorPytorch, _exclude_nans_from_labels, _validate_labels) from ms2deepscore.MetadataFeatureGenerator import (CategoricalToBinary, @@ -110,6 +111,52 @@ def generate_binary_vector(i): return spectrums +def test_DataGeneratorPytorch(): + """Test DataGeneratorPytorch using generated data. 
+ """ + num_of_unique_inchikeys = 15 + spectrums = create_test_spectra(num_of_unique_inchikeys) + batch_size = 8 + + settings = SettingsMS2Deepscore({"tanimoto_bins": np.array([(x / 4, x / 4 + 0.25) for x in range(0, 4)]), + "average_pairs_per_bin": 1}) + scp, spectrums = select_compound_pairs_wrapper(spectrums, settings) + + # Create generator + test_generator = DataGeneratorPytorch( + spectrums=spectrums, + selected_compound_pairs=scp, + batch_size=batch_size, + augment_removal_max=0.0, + augment_removal_intensity=0.0, + augment_intensity=0.0, + augment_noise_max=0 + ) + + x, y = test_generator.__getitem__(0) + assert len(x) == batch_size + assert len(y) == batch_size + assert len(test_generator.indexes) == 15 + assert isinstance(x[0][0], Spectrum) and isinstance(x[0][1], Spectrum) + assert len(test_generator) == 2 + + counts = [] + repetitions = 100 + total = num_of_unique_inchikeys * repetitions + for _ in range(repetitions): + for i, batch in enumerate(test_generator): + counts.extend(batch[1]) + assert len(counts) == total + assert (np.array(counts) > 0.5).sum() > 0.4 * total + assert (np.array(counts) <= 0.5).sum() > 0.4 * total + + # Check mostly equal distribution across all four bins: + assert (np.array(counts) <= 0.25).sum() > 0.22 * total + assert ((np.array(counts) > 0.25) & (np.array(counts) <= 0.5)).sum() > 0.22 * total + assert ((np.array(counts) > 0.5) & (np.array(counts) <= 0.75)).sum() > 0.22 * total + assert (np.array(counts) > 0.75).sum() > 0.22 * total + + def test_DataGeneratorCherrypicked(): """Test DataGeneratorCherrypicked using generated data. """ From 3509b5629dd5f87f5931728ae34c037abf70ba8b Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 16:27:25 +0100 Subject: [PATCH 22/40] fix --- ms2deepscore/models/SiameseSpectralModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index dae579fb..0c17a2d7 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -256,7 +256,7 @@ def train(model, data_generator, num_epochs, learning_rate, optimizer.step() # Print progress - bar.set_postfix( + training.set_postfix( loss=float(loss), ) From fe9ddc28dcdabae687a2bd681c4f7c3d8216b978 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 16:30:49 +0100 Subject: [PATCH 23/40] update train function --- ms2deepscore/models/SiameseSpectralModel.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 0c17a2d7..734c86c7 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -238,18 +238,26 @@ def train(model, data_generator, num_epochs, learning_rate, model.train(True) - #starts = [i*data_generator.settings.batch_size for i in range(len(data_generator))] + losses = [] + collection_targets = [] + #collection_predictions = [] for epoch in range(num_epochs): with tqdm(data_generator, unit="batch", mininterval=0) as training: training.set_description(f"Epoch {epoch}") + batch_losses = [] for spectra, targets in training: + # For debugging: keep track of biases + collection_targets.extend(targets) + optimizer.zero_grad() # Forward pass outputs = model(spectra) + # Calculate loss loss = criterion(outputs, targets) loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) + 
batch_losses.append(float(loss)) # Backward pass and optimize loss.backward() @@ -261,7 +269,9 @@ def train(model, data_generator, num_epochs, learning_rate, ) # Print statistics - print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}") + print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(batch_losses):.4f}") + losses.append(np.mean(batch_losses)) + return losses, collection_targets ### Helper functions From 9db4512ba0f3a68bb7c14167f8abce62c714a529 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 18:08:28 +0100 Subject: [PATCH 24/40] more tensorization to generator and include GPU support --- ms2deepscore/data_generators.py | 71 ++++++++--- ms2deepscore/models/SiameseSpectralModel.py | 129 +++++++------------- tests/test_data_generators.py | 19 +-- 3 files changed, 112 insertions(+), 107 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index a2d1209e..8233022e 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -13,7 +13,7 @@ from .typing import BinnedSpectrumType -class DataGeneratorPytorch: +class DataGeneratorPytorch(Dataset): """Generates data for training a siamese Keras model. This class provides a data generator specifically @@ -24,6 +24,8 @@ class DataGeneratorPytorch: """ def __init__(self, spectrums: list[Spectrum], selected_compound_pairs: SelectedCompoundPairs, + min_mz, max_mz, mz_bin_width, intensity_scaling, + metadata_vectorizer, **settings): """Generates data for training a siamese Keras model. @@ -34,6 +36,18 @@ def __init__(self, spectrums: list[Spectrum], selected_compound_pairs SelectedCompoundPairs object which contains selected compounds pairs and the respective similarity scores. + min_mz + Lower bound for m/z values to consider. + max_mz + Upper bound for m/z values to consider. + mz_bin_width + Bin width for m/z sampling. + intensity_scaling + To put more attention on small and medium intensity peaks, peak intensities are + scaled by intensity to the power of intensity_scaling. + metadata_vectorizer + Add the specific MetadataVectorizer object for your data if the model should contain specific + metadata entries as input. Default is set to None which means this will be ignored. 
settings The available settings can be found in GeneratorSettings """ @@ -45,6 +59,13 @@ def __init__(self, spectrums: list[Spectrum], # Set all other settings to input (or otherwise to defaults): self.settings = GeneratorSettings(settings) + self.min_mz = min_mz + self.max_mz = max_mz + self.mz_bin_width = mz_bin_width + self.intensity_scaling = intensity_scaling + self.num_bins = int((max_mz - min_mz) / mz_bin_width) + self.metadata_vectorizer = metadata_vectorizer + unique_inchikeys = np.unique(self.spectrum_inchikeys) if len(unique_inchikeys) < self.settings.batch_size: raise ValueError("The number of unique inchikeys must be larger than the batch size.") @@ -64,10 +85,10 @@ def __next__(self): batch = self.__getitem__(self.current_index) self.current_index += 1 return batch - - self.current_index = 0 # make generator executable again - self.on_epoch_end() - raise StopIteration + else: + self.current_index = 0 # make generator executable again + self.on_epoch_end() + raise StopIteration def _spectrum_pair_generator(self, batch_index: int): """Use the provided SelectedCompoundPairs object to pick pairs.""" @@ -81,7 +102,7 @@ def _spectrum_pair_generator(self, batch_index: int): yield (spectrum1, spectrum2, score) def on_epoch_end(self): - """Updates indexes after each epoch""" + """Updates indexes after each epoch.""" self.indexes = np.tile(np.arange(len(self.selected_compound_pairs.scores)), int(self.settings.num_turns)) if self.settings.shuffle: np.random.shuffle(self.indexes) @@ -97,21 +118,43 @@ def __getitem__(self, batch_index: int): if self.settings.random_seed is not None and batch_index == 0: np.random.seed(self.settings.random_seed) spectrum_pairs = self._spectrum_pair_generator(batch_index) - pairs, targets = self._split_pairs_and_targets(spectrum_pairs) - # X, y = self._data_generation(spectrum_pairs) + spectra_1, spectra_2, meta_1, meta_2, targets = self._tensorize_all(spectrum_pairs) + if self.settings.use_fixed_set: # Store batches for later epochs - self.fixed_set[batch_index] = (pairs, targets) - return pairs, torch.tensor(targets, dtype=torch.float32) + self.fixed_set[batch_index] = (spectra_1, spectra_2, targets) + return spectra_1, spectra_2, meta_1, meta_2, targets - def _split_pairs_and_targets(self, spectrum_pairs): - pairs = [] + def _tensorize_all(self, spectrum_pairs): + spectra_1 = [] + spectra_2 = [] targets = [] for pair in spectrum_pairs: - pairs.append((pair[0], pair[1])) + spectra_1.append(pair[0]) + spectra_2.append(pair[1]) targets.append(pair[2]) - return pairs, targets + binned_spectra_1 = self._tensorize_spectra(spectra_1) + binned_spectra_2 = self._tensorize_spectra(spectra_2) + if self.metadata_vectorizer is not None: + metadata_1 = self.metadata_vectorizer.transform(spectra1) + metadata_2 = self.metadata_vectorizer.transform(spectra2) + else: + metadata_1 = torch.tensor([]) + metadata_2 = torch.tensor([]) + return binned_spectra_1, binned_spectra_2, metadata_1, metadata_2, torch.tensor(targets, dtype=torch.float32) + + def _tensorize_spectra(self, spectra): + # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) + binned_spectra = torch.zeros((len(spectra), self.num_bins)) + + for i, spectrum in enumerate(spectra): + for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): + if self.min_mz <= mz < self.max_mz: + bin_index = int((mz - self.min_mz) / self.mz_bin_width) + binned_spectra[i, bin_index] += intensity ** self.intensity_scaling + return binned_spectra + def 
_get_spectrum_with_inchikey(self, inchikey: str) -> Spectrum: """ Get a random spectrum matching the `inchikey` argument. diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 734c86c7..9e974b68 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -13,37 +13,26 @@ class SiameseSpectralModel(nn.Module): This head model computes the cosine similarity between the embeddings. """ def __init__(self, - min_mz=0, - max_mz=1000, - mz_bin_width=0.01, + peak_inputs: int, + additional_inputs: int = 0, base_dims: tuple[int, ...] = (1000, 800, 800), embedding_dim: int = 400, - intensity_scaling: float = 0.5, train_binning_layer: bool = True, group_size: int = 30, output_per_group: int = 3, dropout_rate: float = 0.2, - metadata_vectorizer: MetadataVectorizer = None, + ): """ Construct SiameseSpectralModel Parameters ---------- - min_mz - Lower bound for m/z values to consider. - max_mz - Upper bound for m/z values to consider. - mz_bin_width - Bin width for m/z sampling. base_dims Tuple of integers depicting the dimensions of the desired hidden layers of the base model embedding_dim Dimension of the embedding (i.e. the output of the base model) - intensity_scaling - To put more attention on small and medium intensity peaks, peak intensities are - scaled by intensity to the power of intensity_scaling. train_binning_layer Default is True in which case the model contains a first dense multi-group peak binning layer. group_size @@ -53,63 +42,39 @@ def __init__(self, This sets the number of next layer bins each group_size sized group of inputs shares. dropout_rate Dropout rate to be used in the base model. + peak_inputs + Integer to specify the number of binned peaks in the input spectra. + additional_inputs + Integer to specify the number of additional (metadata) input fields. pytorch_model When provided, this pytorch model will be used to construct the SiameseModel instance. Default is None. - metadata_vectorizer - Add the specific MetadataVectorizer object for your data if the model should contain specific - metadata entries as input. Default is set to None which means this will be ignored. 
""" # pylint: disable=too-many-arguments super().__init__() self.model_parameters = { - "min_mz": min_mz, - "max_mz": max_mz, - "mz_bin_width": mz_bin_width, "base_dims": base_dims, "embedding_dim": embedding_dim, - "intensity_scaling": intensity_scaling, "train_binning_layer": train_binning_layer, "group_size": group_size, "output_per_group": output_per_group, "dropout_rate": dropout_rate, + "peak_inputs": peak_inputs, + "additional_inputs": additional_inputs, #TODO: add ms2deepscore version } - self.metadata_vectorizer = metadata_vectorizer - self.encoder = SpectralEncoder(metadata_vectorizer=metadata_vectorizer, **self.model_parameters) + self.encoder = SpectralEncoder(**self.model_parameters) - def forward(self, spectra_pairs): + def forward(self, spectra_tensors_1, spectra_tensors_2, metadata_1, metadata_2): # Pass both inputs through the same encoder - encoded_x1 = self.encoder([s[0] for s in spectra_pairs]) - encoded_x2 = self.encoder([s[1] for s in spectra_pairs]) + encoded_x1 = self.encoder(spectra_tensors_1, metadata_1) + encoded_x2 = self.encoder(spectra_tensors_2, metadata_2) # Calculate cosine similarity cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)(encoded_x1, encoded_x2) return cos_sim -class BinnedSpectraLayer(nn.Module): - def __init__(self, min_mz, max_mz, mz_bin_width, intensity_scaling): - super().__init__() - self.min_mz = min_mz - self.max_mz = max_mz - self.mz_bin_width = mz_bin_width - self.num_bins = int((max_mz - min_mz) / mz_bin_width) - self.intensity_scaling = intensity_scaling - - def forward(self, spectra): - # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) - binned_spectra = torch.zeros((len(spectra), self.num_bins)) - - for i, spectrum in enumerate(spectra): - for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): - if self.min_mz <= mz < self.max_mz: - bin_index = int((mz - self.min_mz) / self.mz_bin_width) - binned_spectra[i, bin_index] += intensity ** self.intensity_scaling - - return binned_spectra - - class PeakBinner(nn.Module): def __init__(self, input_size, group_size, output_per_group): super().__init__() @@ -136,33 +101,22 @@ def output_size(self): class SpectralEncoder(nn.Module): - def __init__(self, min_mz: float, - max_mz: float, - mz_bin_width: float, + def __init__(self, base_dims, embedding_dim, - intensity_scaling, dropout_rate, train_binning_layer: bool, group_size: int, output_per_group: int, - metadata_vectorizer, + peak_inputs: int, + additional_inputs: int, ): """ Parameters ---------- - min_mz - Lower bound for m/z values to consider. - max_mz - Upper bound for m/z values to consider. - mz_bin_width - Bin width for m/z sampling. base_dims Tuple of integers depicting the dimensions of the desired hidden layers of the base model embedding_dim Dimension of the embedding (i.e. the output of the base model) - intensity_scaling - To put more attention on small and medium intensity peaks, peak intensities are - scaled by intensity to the power of intensity_scaling. train_binning_layer Default is True in which case the model contains a first dense multi-group peak binning layer. group_size @@ -172,29 +126,24 @@ def __init__(self, min_mz: float, This sets the number of next layer bins each group_size sized group of inputs shares. dropout_rate Dropout rate to be used in the base model. - metadata_vectorizer - Add the specific MetadataVectorizer object for your data if the model should contain specific - metadata entries as input. 
Default is set to None which means this will be ignored. + peak_inputs + Integer to specify the number of binned peaks in the input spectra. + additional_inputs + Integer to specify the number of additional (metadata) input fields. """ # pylint: disable=too-many-arguments, too-many-locals super().__init__() - self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) - self.metadata_vectorizer = metadata_vectorizer + #self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) self.train_binning_layer = train_binning_layer - # Consider additing metadata vector - if metadata_vectorizer is None: - additional_inputs = 0 - else: - additional_inputs = metadata_vectorizer.size # First dense layer (no dropout!) self.dense_layers = [] if self.train_binning_layer: - self.peak_binner = PeakBinner(self.binning_layer.num_bins, + self.peak_binner = PeakBinner(peak_inputs, group_size, output_per_group) input_size = self.peak_binner.output_size() + additional_inputs else: - input_size = self.binning_layer.num_bins + additional_inputs + input_size = peak_inputs + additional_inputs self.dense_layers.append(nn.Linear(input_size, base_dims[0])) input_dim = base_dims[0] @@ -206,17 +155,12 @@ def __init__(self, min_mz: float, self.embedding_layer = nn.Linear(base_dims[-1], embedding_dim) self.dropout = nn.Dropout(dropout_rate) - def forward(self, spectra): - binned_spectra = self.binning_layer(spectra) - if self.metadata_vectorizer is not None: - metadata_vector = self.metadata_vectorizer.transform(spectra) - else: - metadata_vector = torch.tensor([]) + def forward(self, spectra_tensors, metadata_tensors): if self.train_binning_layer: - x = self.peak_binner(binned_spectra) - x = torch.cat([metadata_vector, x], dim=1) + x = self.peak_binner(spectra_tensors) + x = torch.cat([metadata_tensors, x], dim=1) else: - x = torch.cat([metadata_vector, binned_spectra], dim=1) + x = torch.cat([metadata_tensors, spectra_tensors], dim=1) x = F.relu(self.dense_layers[0](x)) for layer in self.dense_layers[1:]: @@ -232,7 +176,13 @@ def forward(self, spectra): def train(model, data_generator, num_epochs, learning_rate, lambda_l1=1e-6, lambda_l2=1e-6): - # pylint: disable=too-many-arguments + # pylint: disable=)too-many-arguments + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Training will happen on {device}.") + + # Move model to device + model.to(device) + criterion = nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) @@ -245,14 +195,21 @@ def train(model, data_generator, num_epochs, learning_rate, with tqdm(data_generator, unit="batch", mininterval=0) as training: training.set_description(f"Epoch {epoch}") batch_losses = [] - for spectra, targets in training: + for spectra_1, spectra_2, meta_1, meta_2, targets in training: + # Move data to device + spectra_1.to(device) + spectra_2.to(device) + meta_1.to(device) + meta_2.to(device) + targets.to(device) + # For debugging: keep track of biases collection_targets.extend(targets) optimizer.zero_grad() # Forward pass - outputs = model(spectra) + outputs = model(spectra_1, spectra_2, meta_1, meta_2) # Calculate loss loss = criterion(outputs, targets) diff --git a/tests/test_data_generators.py b/tests/test_data_generators.py index 3718d699..8321c63f 100644 --- a/tests/test_data_generators.py +++ b/tests/test_data_generators.py @@ -121,23 +121,28 @@ def test_DataGeneratorPytorch(): settings = SettingsMS2Deepscore({"tanimoto_bins": np.array([(x / 4, x / 4 + 0.25) for x in 
range(0, 4)]), "average_pairs_per_bin": 1}) scp, spectrums = select_compound_pairs_wrapper(spectrums, settings) - # Create generator test_generator = DataGeneratorPytorch( spectrums=spectrums, + min_mz=10, + max_mz=1000, + mz_bin_width=0.1, + intensity_scaling=0.5, + metadata_vectorizer=None, selected_compound_pairs=scp, batch_size=batch_size, augment_removal_max=0.0, augment_removal_intensity=0.0, augment_intensity=0.0, - augment_noise_max=0 + augment_noise_max=0, ) - x, y = test_generator.__getitem__(0) - assert len(x) == batch_size - assert len(y) == batch_size + spec1, spec2, meta1, meta2, targets = test_generator.__getitem__(0) + assert meta1.shape[0] == meta2.shape[0] == 0 + assert spec1.shape[0] == spec2.shape[0] == batch_size + assert spec1.shape[1] == spec2.shape[1] == 9900 + assert targets.shape[0] == batch_size assert len(test_generator.indexes) == 15 - assert isinstance(x[0][0], Spectrum) and isinstance(x[0][1], Spectrum) assert len(test_generator) == 2 counts = [] @@ -145,7 +150,7 @@ def test_DataGeneratorPytorch(): total = num_of_unique_inchikeys * repetitions for _ in range(repetitions): for i, batch in enumerate(test_generator): - counts.extend(batch[1]) + counts.extend(batch[4]) assert len(counts) == total assert (np.array(counts) > 0.5).sum() > 0.4 * total assert (np.array(counts) <= 0.5).sum() > 0.4 * total From f753f33b710a8fba6b17bf4b3693684ec46c53db Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 5 Jan 2024 18:14:05 +0100 Subject: [PATCH 25/40] fix and linting --- ms2deepscore/data_generators.py | 7 +++++-- ms2deepscore/models/SiameseSpectralModel.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 8233022e..869757e1 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -13,7 +13,7 @@ from .typing import BinnedSpectrumType -class DataGeneratorPytorch(Dataset): +class DataGeneratorPytorch: """Generates data for training a siamese Keras model. This class provides a data generator specifically @@ -145,7 +145,10 @@ def _tensorize_all(self, spectrum_pairs): return binned_spectra_1, binned_spectra_2, metadata_1, metadata_2, torch.tensor(targets, dtype=torch.float32) def _tensorize_spectra(self, spectra): - # Assuming spectra is a list of matchms Spectrum objects (with 'peaks.mz' and 'peaks.intensities' attributes) + """ + Assuming spectra is a list of matchms Spectrum objects + (with 'peaks.mz' and 'peaks.intensities' attributes). 
+ """ binned_spectra = torch.zeros((len(spectra), self.num_bins)) for i, spectrum in enumerate(spectra): diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 9e974b68..1bc2151d 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -3,7 +3,6 @@ import torch.nn.functional as F from torch import optim from tqdm import tqdm -from ms2deepscore.MetadataFeatureGenerator import MetadataVectorizer class SiameseSpectralModel(nn.Module): @@ -190,7 +189,6 @@ def train(model, data_generator, num_epochs, learning_rate, losses = [] collection_targets = [] - #collection_predictions = [] for epoch in range(num_epochs): with tqdm(data_generator, unit="batch", mininterval=0) as training: training.set_description(f"Epoch {epoch}") From ec9140004bb7dcc4818488e06b3c72015dfd8aea Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 09:29:11 +0100 Subject: [PATCH 26/40] minor edits --- ms2deepscore/models/SiameseSpectralModel.py | 3 ++- tests/test_siamese_spectra_model.py | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 1bc2151d..6da30efe 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -211,7 +211,8 @@ def train(model, data_generator, num_epochs, learning_rate, # Calculate loss loss = criterion(outputs, targets) - loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) + if lambda_l1 > 0 or lambda_l2 > 0: + loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) batch_losses.append(float(loss)) # Backward pass and optimize diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 9638493c..09a50bad 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -19,15 +19,11 @@ def dummy_spectra(): def test_siamese_model_defaults(): # Create the model instance - model = SiameseSpectralModel() + model = SiameseSpectralModel(peak_inputs=9900, additional_inputs=0) assert model.model_parameters == { - 'min_mz': 0, - 'max_mz': 1000, - 'mz_bin_width': 0.01, 'base_dims': (1000, 800, 800), 'embedding_dim': 400, - 'intensity_scaling': 0.5, 'train_binning_layer': True, 'group_size': 30, 'output_per_group': 3, From 5b66690c41baf22662b402fe97fffddcdab33a2d Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 10:25:28 +0100 Subject: [PATCH 27/40] update and fix tensorization --- ms2deepscore/data_generators.py | 59 ++++++++++++++++++----------- tests/test_data_generators.py | 15 +++++++- tests/test_siamese_spectra_model.py | 44 +++++++++++---------- 3 files changed, 76 insertions(+), 42 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 869757e1..397f7479 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -134,29 +134,19 @@ def _tensorize_all(self, spectrum_pairs): spectra_2.append(pair[1]) targets.append(pair[2]) - binned_spectra_1 = self._tensorize_spectra(spectra_1) - binned_spectra_2 = self._tensorize_spectra(spectra_2) - if self.metadata_vectorizer is not None: - metadata_1 = self.metadata_vectorizer.transform(spectra1) - metadata_2 = self.metadata_vectorizer.transform(spectra2) - else: - metadata_1 = torch.tensor([]) - metadata_2 = torch.tensor([]) + binned_spectra_1, metadata_1 = tensorize_spectra( + 
spectra_1, + self.metadata_vectorizer, + self.num_bins, self.min_mz, self.max_mz, + self.mz_bin_width, self.intensity_scaling + ) + binned_spectra_2, metadata_2 = tensorize_spectra( + spectra_2, + self.metadata_vectorizer, + self.num_bins, self.min_mz, self.max_mz, + self.mz_bin_width, self.intensity_scaling + ) return binned_spectra_1, binned_spectra_2, metadata_1, metadata_2, torch.tensor(targets, dtype=torch.float32) - - def _tensorize_spectra(self, spectra): - """ - Assuming spectra is a list of matchms Spectrum objects - (with 'peaks.mz' and 'peaks.intensities' attributes). - """ - binned_spectra = torch.zeros((len(spectra), self.num_bins)) - - for i, spectrum in enumerate(spectra): - for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): - if self.min_mz <= mz < self.max_mz: - bin_index = int((mz - self.min_mz) / self.mz_bin_width) - binned_spectra[i, bin_index] += intensity ** self.intensity_scaling - return binned_spectra def _get_spectrum_with_inchikey(self, inchikey: str) -> Spectrum: """ @@ -171,6 +161,31 @@ def _get_spectrum_with_inchikey(self, inchikey: str) -> Spectrum: return self.spectrums[np.random.choice(matching_spectrum_id)] +def tensorize_spectra( + spectra, + metadata_vectorizer, + min_mz, + max_mz, + mz_bin_width, + intensity_scaling + ): + """Convert list of matchms Spectrum objects to pytorch peak and metadata tensors. + """ + num_bins = int((max_mz - min_mz) / mz_bin_width) + if metadata_vectorizer is None: + metadata_tensors = torch.zeros((len(spectra), 0)) + else: + metadata_tensors = metadata_vectorizer.transform(spectra) + + binned_spectra = torch.zeros((len(spectra), num_bins)) + for i, spectrum in enumerate(spectra): + for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): + if min_mz <= mz < max_mz: + bin_index = int((mz - min_mz) / mz_bin_width) + binned_spectra[i, bin_index] += intensity ** intensity_scaling + return binned_spectra, metadata_tensors + + class SpectrumPair(NamedTuple): """ Represents a pair of binned spectrums diff --git a/tests/test_data_generators.py b/tests/test_data_generators.py index 8321c63f..3da76b9b 100644 --- a/tests/test_data_generators.py +++ b/tests/test_data_generators.py @@ -3,11 +3,13 @@ import pandas as pd import pytest from matchms import Spectrum +import torch from ms2deepscore import SpectrumBinner from ms2deepscore.data_generators import (DataGeneratorAllInchikeys, DataGeneratorAllSpectrums, DataGeneratorCherrypicked, DataGeneratorPytorch, + tensorize_spectra, _exclude_nans_from_labels, _validate_labels) from ms2deepscore.MetadataFeatureGenerator import (CategoricalToBinary, @@ -111,6 +113,16 @@ def generate_binary_vector(i): return spectrums +def test_tensorize_spectra(): + spectrum = Spectrum(mz=np.array([10, 500, 999.9]), intensities=np.array([0.5, 0.5, 1])) + spec_tensors, meta_tensors = tensorize_spectra([spectrum, spectrum], None, 10, 1000, 1, 0.5) + + assert meta_tensors.shape == torch.Size([2, 0]) + assert spec_tensors.shape == torch.Size([2, 990]) + assert spec_tensors[0, 0] == spec_tensors[0, 490] == 0.5 ** 0.5 + assert spec_tensors[0, -1] == 1 + + def test_DataGeneratorPytorch(): """Test DataGeneratorPytorch using generated data. 
""" @@ -138,7 +150,8 @@ def test_DataGeneratorPytorch(): ) spec1, spec2, meta1, meta2, targets = test_generator.__getitem__(0) - assert meta1.shape[0] == meta2.shape[0] == 0 + assert meta1.shape[0] == meta2.shape[0] == batch_size + assert meta1.shape[1] == meta2.shape[1] == 0 assert spec1.shape[0] == spec2.shape[0] == batch_size assert spec1.shape[1] == spec2.shape[1] == 9900 assert targets.shape[0] == batch_size diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 09a50bad..b77560fa 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -4,6 +4,8 @@ from ms2deepscore.models import SiameseSpectralModel from ms2deepscore.MetadataFeatureGenerator import (MetadataVectorizer, StandardScaler) +from ms2deepscore.data_generators import tensorize_spectra + @pytest.fixture def dummy_spectra(): @@ -27,48 +29,52 @@ def test_siamese_model_defaults(): 'train_binning_layer': True, 'group_size': 30, 'output_per_group': 3, - 'dropout_rate': 0.2 + 'dropout_rate': 0.2, + 'peak_inputs': 9900, + 'additional_inputs': 0 } def test_siamese_model_forward_pass(dummy_spectra): - model = SiameseSpectralModel() - similarity_score = model([dummy_spectra]) - assert similarity_score.shape[0] == 1 - - similarity_score = model([dummy_spectra, dummy_spectra]) + model = SiameseSpectralModel(peak_inputs=990, additional_inputs=0) + spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, None, 10, 1000, 1, 0.5) + similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) assert similarity_score.shape[0] == 2 def test_siamese_model_no_binning_layer(dummy_spectra): - model = SiameseSpectralModel(mz_bin_width=0.1, train_binning_layer=False) + model = SiameseSpectralModel(peak_inputs=990, additional_inputs=0, train_binning_layer=False) assert not model.model_parameters["train_binning_layer"] # Test forward pass - similarity_score = model([dummy_spectra]) - assert similarity_score.shape[0] == 1 + spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, None, 10, 1000, 1, 0.5) + similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) + assert similarity_score.shape[0] == 2 def test_siamese_model_additional_metadata(dummy_spectra): scaler = StandardScaler("precursor_mz", 200.0, 250.0) vectorizer = MetadataVectorizer([scaler]) - model = SiameseSpectralModel(mz_bin_width=0.1, train_binning_layer=False, metadata_vectorizer=vectorizer) + model = SiameseSpectralModel(peak_inputs=9900, additional_inputs=1, train_binning_layer=False) # Test forward pass - similarity_score = model([dummy_spectra]) - assert similarity_score.shape[0] == 1 - assert model.encoder.dense_layers[0].weight.shape[1] == 10001 + spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, vectorizer, 10, 1000, 0.1, 0.5) + similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) + assert similarity_score.shape[0] == 2 + assert model.encoder.dense_layers[0].weight.shape[1] == 9901 # Include dense binning layer - model = SiameseSpectralModel(mz_bin_width=0.1, metadata_vectorizer=vectorizer) + model = SiameseSpectralModel(peak_inputs=9900, additional_inputs=1) # Test forward pass - similarity_score = model([dummy_spectra]) - assert model.encoder.dense_layers[0].weight.shape[1] == 1000 + spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, vectorizer, 10, 1000, 0.1, 0.5) + similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) + assert model.encoder.dense_layers[0].weight.shape[1] 
== 991 # Compare to no metadata_vectorizer - model = SiameseSpectralModel(mz_bin_width=0.1, metadata_vectorizer=None) + model = SiameseSpectralModel(peak_inputs=9900, additional_inputs=0) # Test forward pass - similarity_score = model([dummy_spectra]) - assert model.encoder.dense_layers[0].weight.shape[1] == 999 + spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, None, 10, 1000, 0.1, 0.5) + similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) + assert model.encoder.dense_layers[0].weight.shape[1] == 990 From 9ad94a238906a45b9d7dac47a5e0307564b2e19f Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 10:30:25 +0100 Subject: [PATCH 28/40] fix --- ms2deepscore/data_generators.py | 4 ++-- ms2deepscore/models/SiameseSpectralModel.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 397f7479..b47f9b70 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -137,13 +137,13 @@ def _tensorize_all(self, spectrum_pairs): binned_spectra_1, metadata_1 = tensorize_spectra( spectra_1, self.metadata_vectorizer, - self.num_bins, self.min_mz, self.max_mz, + self.min_mz, self.max_mz, self.mz_bin_width, self.intensity_scaling ) binned_spectra_2, metadata_2 = tensorize_spectra( spectra_2, self.metadata_vectorizer, - self.num_bins, self.min_mz, self.max_mz, + self.min_mz, self.max_mz, self.mz_bin_width, self.intensity_scaling ) return binned_spectra_1, binned_spectra_2, metadata_1, metadata_2, torch.tensor(targets, dtype=torch.float32) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 6da30efe..5f950569 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -1,3 +1,4 @@ +import numpy as np import torch from torch import nn import torch.nn.functional as F From 0cbccd16b1fb77ce3621fcb6ef038ffed3ed830e Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 10:38:28 +0100 Subject: [PATCH 29/40] linting --- ms2deepscore/data_generators.py | 9 +++++---- ms2deepscore/models/SiameseSpectralModel.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index b47f9b70..6faa982c 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -51,6 +51,7 @@ def __init__(self, spectrums: list[Spectrum], settings The available settings can be found in GeneratorSettings """ + # pylint: disable=too-many-arguments self.current_index = 0 self.spectrums = spectrums @@ -85,10 +86,9 @@ def __next__(self): batch = self.__getitem__(self.current_index) self.current_index += 1 return batch - else: - self.current_index = 0 # make generator executable again - self.on_epoch_end() - raise StopIteration + self.current_index = 0 # make generator executable again + self.on_epoch_end() + raise StopIteration def _spectrum_pair_generator(self, batch_index: int): """Use the provided SelectedCompoundPairs object to pick pairs.""" @@ -171,6 +171,7 @@ def tensorize_spectra( ): """Convert list of matchms Spectrum objects to pytorch peak and metadata tensors. 
""" + # pylint: disable=too-many-arguments num_bins = int((max_mz - min_mz) / mz_bin_width) if metadata_vectorizer is None: metadata_tensors = torch.zeros((len(spectra), 0)) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 5f950569..8ec08e35 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -131,7 +131,7 @@ def __init__(self, additional_inputs Integer to specify the number of additional (metadata) input fields. """ - # pylint: disable=too-many-arguments, too-many-locals + # pylint: disable=too-many-arguments super().__init__() #self.binning_layer = BinnedSpectraLayer(min_mz, max_mz, mz_bin_width, intensity_scaling) self.train_binning_layer = train_binning_layer @@ -176,7 +176,7 @@ def forward(self, spectra_tensors, metadata_tensors): def train(model, data_generator, num_epochs, learning_rate, lambda_l1=1e-6, lambda_l2=1e-6): - # pylint: disable=)too-many-arguments + # pylint: disable=too-many-arguments, too-many-locals device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Training will happen on {device}.") From 4ebee6da400d2c144d90ddeca2e9d67e0a8e6e67 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 13:22:42 +0100 Subject: [PATCH 30/40] fix layers with ModuleList and add training test --- ms2deepscore/models/SiameseSpectralModel.py | 2 +- tests/test_siamese_spectra_model.py | 69 ++++++++++++++++++++- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 8ec08e35..dab881c7 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -137,7 +137,7 @@ def __init__(self, self.train_binning_layer = train_binning_layer # First dense layer (no dropout!) - self.dense_layers = [] + self.dense_layers = nn.ModuleList() if self.train_binning_layer: self.peak_binner = PeakBinner(peak_inputs, group_size, output_per_group) diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index b77560fa..6712c102 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -1,10 +1,14 @@ import pytest import numpy as np from matchms import Spectrum -from ms2deepscore.models import SiameseSpectralModel +from ms2deepscore.models.SiameseSpectralModel import SiameseSpectralModel, train from ms2deepscore.MetadataFeatureGenerator import (MetadataVectorizer, StandardScaler) -from ms2deepscore.data_generators import tensorize_spectra +from ms2deepscore.data_generators import DataGeneratorPytorch, tensorize_spectra +from ms2deepscore.train_new_model.SettingMS2Deepscore import \ + SettingsMS2Deepscore +from ms2deepscore.train_new_model.spectrum_pair_selection import \ + select_compound_pairs_wrapper @pytest.fixture @@ -19,6 +23,38 @@ def dummy_spectra(): return [spectrum1, spectrum2] +@pytest.fixture +def simple_training_spectra(): + """Creates many random versions of two very differntly looking types of spectra. + They come with very different compound annotations so that a model should easily be able to learn those. 
+ """ + spectra = [] + for _ in range(1000): + spectra.append( + Spectrum(mz=np.sort(np.random.uniform(0, 100, 10)), + intensities=np.random.uniform(0.2, 1, 10), + metadata={ + "precursor_mz": 222.2, + "smiles": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", + "inchi": "InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3", + "inchikey": "RYYVLZVUVIJVGH-UHFFFAOYSA-N", + }, + ) + ) + spectra.append( + Spectrum(mz=np.sort(np.random.uniform(100, 200, 10)), + intensities=np.random.uniform(0.2, 1, 10), + metadata={ + "precursor_mz": 444.4, + "smiles": "CCCCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)(O)OC[C@@H](C(=O)O)N)OC(=O)CCCCCCCCCCCCCCCCC", + "inchi": "InChI=1S/C42H82NO10P/c1-3-5-7-9-11-13-15-17-19-21-23-25-27-29-31-33-40(44)50-35-38(36-51-54(48,49)52-37-39(43)42(46)47)53-41(45)34-32-30-28-26-24-22-20-18-16-14-12-10-8-6-4-2/h38-39H,3-37,43H2,1-2H3,(H,46,47)(H,48,49)/t38-,39+/m1/s1", + "inchikey": "TZCPCKNHXULUIY-RGULYWFUSA-N", + }, + ) + ) + return spectra + + def test_siamese_model_defaults(): # Create the model instance model = SiameseSpectralModel(peak_inputs=9900, additional_inputs=0) @@ -78,3 +114,32 @@ def test_siamese_model_additional_metadata(dummy_spectra): spec_tensors, meta_tensors = tensorize_spectra(dummy_spectra, None, 10, 1000, 0.1, 0.5) similarity_score = model(spec_tensors, spec_tensors, meta_tensors, meta_tensors) assert model.encoder.dense_layers[0].weight.shape[1] == 990 + + +def test_model_training(simple_training_spectra): + # Select pairs + settings = SettingsMS2Deepscore({ + "tanimoto_bins": np.array([(x / 4, x / 4 + 0.25) for x in range(0, 4)]), + "average_pairs_per_bin": 20 + }) + scp_simple, _ = select_compound_pairs_wrapper(simple_training_spectra, settings) + + # Create generator + train_generator_simple = DataGeneratorPytorch( + spectrums=simple_training_spectra, + min_mz=0, max_mz=200, mz_bin_width=0.1, intensity_scaling=0.5, + metadata_vectorizer=None, + selected_compound_pairs=scp_simple, + batch_size=2, + num_turns=20, + ) + + # Create and train model + model_simple = SiameseSpectralModel(peak_inputs=2000, additional_inputs=0, train_binning_layer=False) + losses, collection_targets = train(model_simple, train_generator_simple, 50, learning_rate=0.001, lambda_l1=0, lambda_l2=0) + + # Check if model trained to at least an OK result + assert np.mean(losses[-10:]) < 0.02, "Training was not succesfull!" 
+ # Check if bias in data is handled correctly + assert (np.array(collection_targets) == 1).sum() == 1000 + assert (np.array(collection_targets) < .2).sum() == 1000 From d41e7d87fa4d5f4764a953d8d888014b0cec3742 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 17:21:40 +0100 Subject: [PATCH 31/40] large-scale changes to pair sampling --- .../spectrum_pair_selection.py | 164 ++++++++++-------- tests/test_siamese_spectra_model.py | 2 +- tests/test_spectrum_pair_selection.py | 45 +++-- tests/test_train_ms2deepscore.py | 10 +- tests/test_training_wrapper_function.py | 11 +- 5 files changed, 135 insertions(+), 97 deletions(-) diff --git a/ms2deepscore/train_new_model/spectrum_pair_selection.py b/ms2deepscore/train_new_model/spectrum_pair_selection.py index efd320cc..b678e7fa 100644 --- a/ms2deepscore/train_new_model/spectrum_pair_selection.py +++ b/ms2deepscore/train_new_model/spectrum_pair_selection.py @@ -1,6 +1,6 @@ from collections import Counter -from typing import List, Optional, Tuple -from numba import jit +from typing import List, Tuple +from numba import jit, prange import numpy as np from matchms import Spectrum from matchms.filtering import add_fingerprint @@ -118,79 +118,76 @@ def select_compound_pairs_wrapper( if settings.random_seed is not None: np.random.seed(settings.random_seed) - fingerprints, inchikeys14_unique, spectra_selected = compute_fingerprints_for_training(spectrums, - settings.fingerprint_type, - settings.fingerprint_nbits) - - selected_pairs_per_bin = compute_jaccard_similarity_per_bin(fingerprints, - settings.tanimoto_bins, - settings.max_pairs_per_bin, - settings.include_diagonal) - selected_pairs_per_bin = fix_bias(selected_pairs_per_bin, settings.average_pairs_per_bin) - scores_sparse = convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin, fingerprints.shape[0]) + fingerprints, inchikeys14_unique, spectra_selected = compute_fingerprints_for_training( + spectrums, + settings.fingerprint_type, + settings.fingerprint_nbits) + + selected_pairs_per_bin, selected_scores_per_bin = compute_jaccard_similarity_per_bin( + fingerprints, + settings.max_pairs_per_bin, + settings.tanimoto_bins, + settings.include_diagonal) + selected_pairs_per_bin = fix_bias( + selected_pairs_per_bin, + selected_scores_per_bin, + fingerprints.shape[0] * settings.average_pairs_per_bin) + scores_sparse = convert_pair_list_to_coo_array(selected_pairs_per_bin, fingerprints.shape[0]) return SelectedCompoundPairs(scores_sparse, inchikeys14_unique), spectra_selected -def convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin: List[List[Tuple[int, float]]], size): +def convert_pair_array_to_coo_array( + selected_pairs_per_bin, selected_scores_per_bin, size): data = [] inchikey_indexes_i = [] inchikey_indexes_j = [] - for scores_per_inchikey in selected_pairs_per_bin: - assert len(scores_per_inchikey) == size - for inchikey_idx_i, scores_list in enumerate(scores_per_inchikey): - for scores in scores_list: - inchikey_idx_j, score = scores - data.append(score) - inchikey_indexes_i.append(inchikey_idx_i) - inchikey_indexes_j.append(inchikey_idx_j) + for row_id in range(selected_pairs_per_bin.shape[1]): + idx = np.where(selected_pairs_per_bin[:, row_id, :] != -1) + data.extend(selected_scores_per_bin[idx[0], row_id, idx[1]]) + inchikey_indexes_i.extend(row_id * np.ones(len(idx[0]))) + inchikey_indexes_j.extend(selected_pairs_per_bin[idx[0], row_id, idx[1]]) return coo_array((np.array(data), (np.array(inchikey_indexes_i), np.array(inchikey_indexes_j))), 
shape=(size, size)) -# todo refactor so numba.njit can be used again -# @numba.njit + +def convert_pair_list_to_coo_array(selected_pairs: List[List[Tuple[int, float]]], size): + data = [] + inchikey_indexes_i = [] + inchikey_indexes_j = [] + for inchikey_idx_i, inchikey_idx_j, score in selected_pairs: + data.append(score) + inchikey_indexes_i.append(inchikey_idx_i) + inchikey_indexes_j.append(inchikey_idx_j) + return coo_array((np.array(data), (np.array(inchikey_indexes_i), np.array(inchikey_indexes_j))), + shape=(size, size)) + + +@jit(nopython=True, parallel=True) def compute_jaccard_similarity_per_bin( - fingerprints: np.ndarray, - selection_bins: np.ndarray = np.array([(x/10, x/10 + 0.1) for x in range(0, 10)]), - max_pairs_per_bin: Optional[int] = None, - include_diagonal: bool = True) -> List[List[Tuple[int, float]]]: - """For each inchikey for each bin matches are stored within this bin. - The max pairs per bin specifies how many pairs are selected per bin. This helps reduce the memory load. - - fingerprints - Fingerprint vectors as 2D numpy array. - selection_bins - List of tuples with upper and lower bound for score bins. - The goal is to pick equal numbers of pairs for each score bin. - Sidenote: bins do not have to be of equal size, nor do they have to cover the entire - range of the used scores. - max_pairs_per_bin - Specifies the desired maximum number of pairs to be added for each score bin. - Set to None to select everything (more memory intensive) - - returns: - A list were the indexes are the bin numbers. This contains Lists were the index is the spectrum_i index. - This list contains a Tuple, with first the spectrum_j index and second the score. - """ + fingerprints, + max_pairs_per_bin, + selection_bins = np.array([(x / 10, x / 10 + 0.1) for x in range(10)]), + include_diagonal = True): + size = fingerprints.shape[0] num_bins = len(selection_bins) - # Preallocate arrays instead of using dynamic lists - selected_pairs_per_bin = [[[] for _ in range(size)] for _ in range(num_bins)] + selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32) + selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32) - for idx_fingerprint_i in range(size): + for idx_fingerprint_i in prange(size): tanimoto_scores = tanimoto_scores_row(fingerprints, idx_fingerprint_i, include_diagonal) for bin_number in range(num_bins): selection_bin = selection_bins[bin_number] - - idx = np.where((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - np.random.shuffle(idx) - if max_pairs_per_bin is not None: - idx = idx[:max_pairs_per_bin] - for index in idx: - selected_pairs_per_bin[bin_number][idx_fingerprint_i].append((index, tanimoto_scores[index])) - - return selected_pairs_per_bin + indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] + np.random.shuffle(indices) + indices = indices[:max_pairs_per_bin] + + selected_pairs_per_bin[bin_number, idx_fingerprint_i, :len(indices)] = indices + selected_scores_per_bin[bin_number, idx_fingerprint_i, :len(indices)] = tanimoto_scores[indices] + + return selected_pairs_per_bin, selected_scores_per_bin @jit(nopython=True) @@ -208,7 +205,11 @@ def tanimoto_scores_row(fingerprints, idx, include_diagonal): return tanimoto_scores -def fix_bias(selected_pairs_per_bin, expected_average_pairs_per_bin): +desired_average_pairs_per_bin = 5000 + +def fix_bias(selected_pairs_per_bin, selected_scores_per_bin, + desired_pairs_per_bin, + 
max_oversampling_rate: float = 1): """ Adjusts the selected pairs for each bin to align with the expected average pairs per bin. @@ -219,22 +220,41 @@ def fix_bias(selected_pairs_per_bin, expected_average_pairs_per_bin): ---------- selected_pairs_per_bin: list of list The list containing bins and for each bin, the list of pairs for each spectrum. - expected_average_pairs_per_bin: int - The expected average number of pairs per bin. + desired_pairs_per_bin: int + The desired number of pairs per bin. Will be used if sufficient scores in each bin are found. + max_oversampling_rate: float + Maximum factor for oversampling. Default is 2. """ + if max_oversampling_rate != 1: + raise NotImplementedError("oversampling is not yet supported") + available_pairs = (selected_pairs_per_bin[:, :, :] != -1).sum(axis=2).sum(axis=1) + minimum_bin_occupation = available_pairs.min() + print(f"Found minimum bin occupation of {minimum_bin_occupation} pairs.") + print(f"Bin occupations are: {available_pairs}.") + + pairs_per_bin = min(minimum_bin_occupation * max_oversampling_rate, desired_pairs_per_bin) + if desired_pairs_per_bin > minimum_bin_occupation * max_oversampling_rate: + print(f"The desired number of {desired_pairs_per_bin} pairs per bin cannot be reached with the current setting.") + print(f"The number of pairs per bin will be set to {minimum_bin_occupation * max_oversampling_rate}.") + new_selected_pairs_per_bin = [] - for bin_nr, scores_per_compound in enumerate(selected_pairs_per_bin): - new_selected_pairs_per_bin.append([]) - # Calculate the nr_of_pairs_in_bin_per_compound - nr_of_pairs_in_bin_per_compound = [len(score_and_idx) for score_and_idx in scores_per_compound] - - cut_offs_to_use = get_nr_of_pairs_needed_to_fix_bias(nr_of_pairs_in_bin_per_compound, - expected_average_pairs_per_bin) - if sum(cut_offs_to_use)/len(cut_offs_to_use) != expected_average_pairs_per_bin: - print(f"For bin {bin_nr} the expected average number of pairs: {expected_average_pairs_per_bin} " - f"does not match the actual average number of pairs: {sum(cut_offs_to_use)/len(cut_offs_to_use)}") - for i, cut_off in enumerate(cut_offs_to_use): - new_selected_pairs_per_bin[bin_nr].append(scores_per_compound[i][:cut_off]) + + for bin_id in range(selected_pairs_per_bin.shape[0]): + goal = pairs_per_bin + for _ in range(int(np.ceil(max_oversampling_rate))): + for col in range(selected_pairs_per_bin.shape[2]): + idx = np.where(selected_pairs_per_bin[bin_id, :, col] != -1)[0] + if len(idx) > goal: + idx = np.random.choice(idx, goal) + if len(idx) == 0 and goal > 0: + print(f"Apply oversampling for bin {bin_id}.") + break + goal -= len(idx) + pairs = [(idx[i], selected_pairs_per_bin[bin_id, idx[i], col], + selected_scores_per_bin[bin_id, idx[i], col]) for i in range(len(idx))] + new_selected_pairs_per_bin.extend(pairs) + if goal <= 0: + break return new_selected_pairs_per_bin diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 6712c102..2d9a8af0 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -119,7 +119,7 @@ def test_siamese_model_additional_metadata(dummy_spectra): def test_model_training(simple_training_spectra): # Select pairs settings = SettingsMS2Deepscore({ - "tanimoto_bins": np.array([(x / 4, x / 4 + 0.25) for x in range(0, 4)]), + "tanimoto_bins": np.array([(0, 0.5), (0.5, 1)]), "average_pairs_per_bin": 20 }) scp_simple, _ = select_compound_pairs_wrapper(simple_training_spectra, settings) diff --git a/tests/test_spectrum_pair_selection.py 
b/tests/test_spectrum_pair_selection.py index 834b7e26..a2ebd11c 100644 --- a/tests/test_spectrum_pair_selection.py +++ b/tests/test_spectrum_pair_selection.py @@ -4,7 +4,8 @@ from scipy.sparse import coo_array from ms2deepscore.train_new_model.spectrum_pair_selection import ( SelectedCompoundPairs, compute_jaccard_similarity_per_bin, - convert_selected_pairs_per_bin_to_coo_array, fix_bias, + convert_pair_array_to_coo_array, + convert_pair_list_to_coo_array, fix_bias, get_nr_of_pairs_needed_to_fix_bias, select_inchi_for_unique_inchikeys) @@ -63,29 +64,35 @@ def dummy_data(): def test_compute_jaccard_similarity_per_bin(simple_fingerprints): - selected_pairs_per_bin = compute_jaccard_similarity_per_bin(simple_fingerprints) - matrix = convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin, simple_fingerprints.shape[0]) + selected_pairs_per_bin, selected_scores_per_bin = compute_jaccard_similarity_per_bin( + simple_fingerprints, max_pairs_per_bin=4) + matrix = convert_pair_array_to_coo_array( + selected_pairs_per_bin, selected_scores_per_bin, simple_fingerprints.shape[0]) assert matrix.shape == (4, 4) assert np.allclose(matrix.diagonal(), 1.0) assert matrix.nnz > 0 # Make sure there are some non-zero entries def test_compute_jaccard_similarity_per_bin_exclude_diagonal(simple_fingerprints): - selected_pairs_per_bin = compute_jaccard_similarity_per_bin(simple_fingerprints, include_diagonal=False) - matrix = convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin, simple_fingerprints.shape[0]) + selected_pairs_per_bin, selected_scores_per_bin = compute_jaccard_similarity_per_bin( + simple_fingerprints, max_pairs_per_bin=4, include_diagonal=False) + matrix = convert_pair_array_to_coo_array( + selected_pairs_per_bin, selected_scores_per_bin, simple_fingerprints.shape[0]) diagonal = matrix.diagonal() assert np.all(diagonal == 0) # Ensure no non-zero diagonal elements def test_compute_jaccard_similarity_per_bin_correct_counts(fingerprints): - selected_pairs_per_bin = compute_jaccard_similarity_per_bin(fingerprints) - matrix = convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin, fingerprints.shape[0]) + selected_pairs_per_bin, selected_scores_per_bin = compute_jaccard_similarity_per_bin( + fingerprints, max_pairs_per_bin=8) + matrix = convert_pair_array_to_coo_array( + selected_pairs_per_bin, selected_scores_per_bin, fingerprints.shape[0]) dense_matrix = matrix.todense() matrix_histogram = np.histogram(dense_matrix, 10) - expected_histogram = np.array([6, 8, 2, 10, 8, 14, 0, 8, 0, 8]) + expected_histogram = np.array([6, 8, 2, 10, 8, 6, 8, 8, 0, 8]) assert np.all(matrix_histogram[0] == expected_histogram) - +""" def test_fix_bias(): expected_average = 2 results = fix_bias([[ @@ -100,21 +107,25 @@ def test_fix_bias(): [(1, 0.1), (2, 0.1), (3, 0.1)], [], ]] +""" -@pytest.mark.parametrize("average_pairs_per_bin", [1, 2]) -def test_global_bias(fingerprints, average_pairs_per_bin): +@pytest.mark.parametrize("desired_average_pairs_per_bin", [1, 2]) +def test_global_bias(fingerprints, desired_average_pairs_per_bin): bins = np.array([(0, 0.35), (0.35, 0.65), (0.65, 1.0)]) - selected_pairs_per_bin = compute_jaccard_similarity_per_bin(fingerprints, - selection_bins=bins, - max_pairs_per_bin=10) + selected_pairs_per_bin, selected_scores_per_bin = compute_jaccard_similarity_per_bin( + fingerprints, + selection_bins=bins, + max_pairs_per_bin=10) - selected_pairs_per_bin_fixed_bias = fix_bias(selected_pairs_per_bin, average_pairs_per_bin) - matrix = 
convert_selected_pairs_per_bin_to_coo_array(selected_pairs_per_bin_fixed_bias, fingerprints.shape[0]) + selected_pairs_per_bin_fixed_bias = fix_bias( + selected_pairs_per_bin, selected_scores_per_bin, fingerprints.shape[0] * desired_average_pairs_per_bin) + matrix = convert_pair_list_to_coo_array( + selected_pairs_per_bin_fixed_bias, fingerprints.shape[0]) dense_matrix = matrix.todense() # Check if in each bin the nr of pairs is equal to the nr_of_fingerprints - expected_nr_of_pairs = fingerprints.shape[0] * average_pairs_per_bin + expected_nr_of_pairs = fingerprints.shape[0] * desired_average_pairs_per_bin expected_histogram = np.array([expected_nr_of_pairs, expected_nr_of_pairs, expected_nr_of_pairs]) matrix_histogram = np.histogram(dense_matrix, [0.000001, 0.35000001, 0.6000001, 1.0]) assert np.all(matrix_histogram[0] == expected_histogram) diff --git a/tests/test_train_ms2deepscore.py b/tests/test_train_ms2deepscore.py index 94e1ee41..dddc5c0b 100644 --- a/tests/test_train_ms2deepscore.py +++ b/tests/test_train_ms2deepscore.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest from matchms.importing import load_from_mgf +import numpy as np from ms2deepscore import BinnedSpectrum, SpectrumBinner from ms2deepscore.models import SiameseModel from ms2deepscore.models.load_model import \ @@ -44,9 +45,12 @@ def test_bin_spectra(tmp_path): def test_train_ms2ds_model(tmp_path): spectra = create_test_spectra(8) - settings = SettingsMS2Deepscore({"epochs": 2, - "average_pairs_per_bin": 2, - "batch_size": 8}) + settings = SettingsMS2Deepscore({ + "tanimoto_bins": np.array([(0, 0.5), (0.5, 1)]), + "epochs": 2, + "average_pairs_per_bin": 2, + "batch_size": 8 + }) train_ms2ds_model(spectra, spectra, tmp_path, settings) # check if model is saved diff --git a/tests/test_training_wrapper_function.py b/tests/test_training_wrapper_function.py index 6eab7af5..bba208bc 100644 --- a/tests/test_training_wrapper_function.py +++ b/tests/test_training_wrapper_function.py @@ -1,5 +1,6 @@ import os from matchms.exporting import save_as_mgf +import numpy as np from ms2deepscore.train_new_model.SettingMS2Deepscore import \ SettingsMS2Deepscore from ms2deepscore.wrapper_functions.StoreTrainingData import StoreTrainingData @@ -16,10 +17,12 @@ def test_train_wrapper_ms2ds_model(tmp_path): spectra_file_name = os.path.join(tmp_path, "clean_spectra.mgf") save_as_mgf(positive_mode_spectra+negative_mode_spectra, filename=spectra_file_name) - settings = SettingsMS2Deepscore({"epochs": 2, - "average_pairs_per_bin": 2, - "ionisation_mode": "negative", - "batch_size": 2}) + settings = SettingsMS2Deepscore({ + "tanimoto_bins": np.array([(0, 0.5), (0.5, 1)]), + "epochs": 2, + "average_pairs_per_bin": 2, + "ionisation_mode": "negative", + "batch_size": 2}) train_ms2deepscore_wrapper(spectra_file_name, settings, 5) expected_file_names = StoreTrainingData(spectra_file_name) assert os.path.isfile(os.path.join(tmp_path, expected_file_names.trained_models_folder, From 2f668276ef89c3a4b75f078b1fdb8515546d16aa Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 17:33:34 +0100 Subject: [PATCH 32/40] pylint comment --- ms2deepscore/train_new_model/spectrum_pair_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2deepscore/train_new_model/spectrum_pair_selection.py b/ms2deepscore/train_new_model/spectrum_pair_selection.py index b678e7fa..6881de2b 100644 --- a/ms2deepscore/train_new_model/spectrum_pair_selection.py +++ b/ms2deepscore/train_new_model/spectrum_pair_selection.py @@ -174,7 +174,7 
@@ def compute_jaccard_similarity_per_bin( selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32) selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32) - + # pylint: disable=not-an-iterable for idx_fingerprint_i in prange(size): tanimoto_scores = tanimoto_scores_row(fingerprints, idx_fingerprint_i, include_diagonal) From 8ad6ae3f49c431ae2cb089bc1a5ce3f10386e397 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 20:49:39 +0100 Subject: [PATCH 33/40] small edits --- .../spectrum_pair_selection.py | 19 +++++++++++++------ tests/test_spectrum_pair_selection.py | 12 ++++++------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/ms2deepscore/train_new_model/spectrum_pair_selection.py b/ms2deepscore/train_new_model/spectrum_pair_selection.py index 6881de2b..ac85a5b0 100644 --- a/ms2deepscore/train_new_model/spectrum_pair_selection.py +++ b/ms2deepscore/train_new_model/spectrum_pair_selection.py @@ -128,7 +128,7 @@ def select_compound_pairs_wrapper( settings.max_pairs_per_bin, settings.tanimoto_bins, settings.include_diagonal) - selected_pairs_per_bin = fix_bias( + selected_pairs_per_bin = balanced_selection( selected_pairs_per_bin, selected_scores_per_bin, fingerprints.shape[0] * settings.average_pairs_per_bin) @@ -136,8 +136,8 @@ def select_compound_pairs_wrapper( return SelectedCompoundPairs(scores_sparse, inchikeys14_unique), spectra_selected -def convert_pair_array_to_coo_array( - selected_pairs_per_bin, selected_scores_per_bin, size): +def convert_pair_array_to_coo_data( + selected_pairs_per_bin, selected_scores_per_bin): data = [] inchikey_indexes_i = [] inchikey_indexes_j = [] @@ -146,7 +146,14 @@ def convert_pair_array_to_coo_array( data.extend(selected_scores_per_bin[idx[0], row_id, idx[1]]) inchikey_indexes_i.extend(row_id * np.ones(len(idx[0]))) inchikey_indexes_j.extend(selected_pairs_per_bin[idx[0], row_id, idx[1]]) - return coo_array((np.array(data), (np.array(inchikey_indexes_i), np.array(inchikey_indexes_j))), + return np.array(data), np.array(inchikey_indexes_i), np.array(inchikey_indexes_j) + + +def convert_pair_array_to_coo_array( + selected_pairs_per_bin, selected_scores_per_bin, size): + data, inchikey_indexes_i, inchikey_indexes_j = convert_pair_array_to_coo_data( + selected_pairs_per_bin, selected_scores_per_bin) + return coo_array((data, (inchikey_indexes_i, np.array(inchikey_indexes_j))), shape=(size, size)) @@ -207,7 +214,7 @@ def tanimoto_scores_row(fingerprints, idx, include_diagonal): desired_average_pairs_per_bin = 5000 -def fix_bias(selected_pairs_per_bin, selected_scores_per_bin, +def balanced_selection(selected_pairs_per_bin, selected_scores_per_bin, desired_pairs_per_bin, max_oversampling_rate: float = 1): """ @@ -258,7 +265,7 @@ def fix_bias(selected_pairs_per_bin, selected_scores_per_bin, return new_selected_pairs_per_bin -def get_nr_of_pairs_needed_to_fix_bias(nr_of_pairs_in_bin_per_compound: List[int], +def get_nr_of_pairs_needed_to_balanced_selection(nr_of_pairs_in_bin_per_compound: List[int], expected_average_pairs_per_bin: int ): """Calculates how many pairs should be selected to get the exact number o """ diff --git a/tests/test_spectrum_pair_selection.py b/tests/test_spectrum_pair_selection.py index a2ebd11c..a7312c19 100644 --- a/tests/test_spectrum_pair_selection.py +++ b/tests/test_spectrum_pair_selection.py @@ -5,8 +5,8 @@ from ms2deepscore.train_new_model.spectrum_pair_selection import ( SelectedCompoundPairs, compute_jaccard_similarity_per_bin, 
convert_pair_array_to_coo_array, - convert_pair_list_to_coo_array, fix_bias, - get_nr_of_pairs_needed_to_fix_bias, select_inchi_for_unique_inchikeys) + convert_pair_list_to_coo_array, balanced_selection, + get_nr_of_pairs_needed_to_balanced_selection, select_inchi_for_unique_inchikeys) @pytest.fixture @@ -93,9 +93,9 @@ def test_compute_jaccard_similarity_per_bin_correct_counts(fingerprints): assert np.all(matrix_histogram[0] == expected_histogram) """ -def test_fix_bias(): +def test_balanced_selection(): expected_average = 2 - results = fix_bias([[ + results = balanced_selection([[ [(1, 0.1), (2, 0.1), (3, 0.1)], [(1, 0.1), (2, 0.1), (2, 0.1), (2, 0.1)], [(1, 0.1), (2, 0.1), (3, 0.1)], @@ -119,7 +119,7 @@ def test_global_bias(fingerprints, desired_average_pairs_per_bin): selection_bins=bins, max_pairs_per_bin=10) - selected_pairs_per_bin_fixed_bias = fix_bias( + selected_pairs_per_bin_fixed_bias = balanced_selection( selected_pairs_per_bin, selected_scores_per_bin, fingerprints.shape[0] * desired_average_pairs_per_bin) matrix = convert_pair_list_to_coo_array( selected_pairs_per_bin_fixed_bias, fingerprints.shape[0]) @@ -138,7 +138,7 @@ def test_global_bias(fingerprints, desired_average_pairs_per_bin): [[1, 9, 9, 6, 8], 6], ]) def test_get_nr_of_compounds_that_should_be_selected(nr_of_pairs_in_bin_per_compound, expected_average): - cut_offs_to_use = get_nr_of_pairs_needed_to_fix_bias(nr_of_pairs_in_bin_per_compound, expected_average) + cut_offs_to_use = get_nr_of_pairs_needed_to_balanced_selection(nr_of_pairs_in_bin_per_compound, expected_average) assert sum(cut_offs_to_use)/len(cut_offs_to_use) == expected_average assert len(nr_of_pairs_in_bin_per_compound) == len(cut_offs_to_use) From 221118afcc00e10de1b7c6709501a96ae5486b74 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 8 Jan 2024 21:34:40 +0100 Subject: [PATCH 34/40] add custom loss function --- ms2deepscore/models/SiameseSpectralModel.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index dab881c7..06c8a7ce 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -183,7 +183,7 @@ def train(model, data_generator, num_epochs, learning_rate, # Move model to device model.to(device) - criterion = nn.MSELoss() + criterion = mse_away_from_mean # alternative for nn.MSELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) model.train(True) @@ -242,3 +242,15 @@ def l2_regularization(model, lambda_l2): """L2 regulatization for first dense layer of model.""" l2_loss = torch.linalg.vector_norm(next(model.encoder.dense_layers[0].parameters()), ord=2) return lambda_l2 * l2_loss + +def mse_away_from_mean(output, target): + """MSE weighted to get higher loss for predictions towards the mean of 0.5. + + In addition, we are usually more intereted in the precision for higher scores. + And, we have often fewer pairs in that regime. This is included by an additional + linear factor to shift attention to higher scores. 
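+
+    Illustrative example (a rough sanity check of the weighting defined below):
+    for a prediction of 0.9 against a target of 1.0 the Gaussian weighting is
+    exp(-10 * (0.9 - 0.5)**2) + 1, which is about 1.20, and the high-score factor
+    is 1 + 0.5 * 1.0 = 1.5, so the squared error of 0.01 contributes roughly
+    1.20 * 1.5 * 0.01 = 0.018 to the mean loss. A prediction of 0.5 for the same
+    target would instead be weighted by exp(0) + 1 = 2.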
+ """ + weighting = torch.exp(-10 * (output - 0.5)**2) + 1 + focus_high_scores = 1 + 0.5 * target + loss = torch.mean(weighting * focus_high_scores * (output - target)**2) + return loss From 0702d15f6bfab115b37f09b50033d1f5b019952c Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Tue, 9 Jan 2024 09:53:12 +0100 Subject: [PATCH 35/40] fix to(cuda) transfers during training --- ms2deepscore/models/SiameseSpectralModel.py | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 06c8a7ce..8c09200f 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -21,7 +21,6 @@ def __init__(self, group_size: int = 30, output_per_group: int = 3, dropout_rate: float = 0.2, - ): """ Construct SiameseSpectralModel @@ -195,18 +194,20 @@ def train(model, data_generator, num_epochs, learning_rate, training.set_description(f"Epoch {epoch}") batch_losses = [] for spectra_1, spectra_2, meta_1, meta_2, targets in training: - # Move data to device - spectra_1.to(device) - spectra_2.to(device) - meta_1.to(device) - meta_2.to(device) - targets.to(device) - # For debugging: keep track of biases collection_targets.extend(targets) - + optimizer.zero_grad() - + + # Forward pass + outputs = model(spectra_1.to(device), spectra_2.to(device), + meta_1.to(device), meta_2.to(device)) + + # Calculate loss + loss = criterion(outputs, targets.to(device)) + + optimizer.zero_grad() + # Forward pass outputs = model(spectra_1, spectra_2, meta_1, meta_2) @@ -215,7 +216,7 @@ def train(model, data_generator, num_epochs, learning_rate, if lambda_l1 > 0 or lambda_l2 > 0: loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) batch_losses.append(float(loss)) - + # Backward pass and optimize loss.backward() optimizer.step() From cc6df44e805b810cd0ef7ee9ba229a86e79d7214 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 12 Jan 2024 18:02:22 +0100 Subject: [PATCH 36/40] expansion of training functions --- ms2deepscore/data_generators.py | 40 ++++++- ms2deepscore/models/SiameseSpectralModel.py | 117 ++++++++++++++++---- tests/test_siamese_spectra_model.py | 28 +++-- 3 files changed, 154 insertions(+), 31 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 6faa982c..312f6f29 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -122,7 +122,10 @@ def __getitem__(self, batch_index: int): if self.settings.use_fixed_set: # Store batches for later epochs - self.fixed_set[batch_index] = (spectra_1, spectra_2, targets) + self.fixed_set[batch_index] = (spectra_1, spectra_2, meta_1, meta_2, targets) + else: + spectra_1 = self._data_augmentation(spectra_1) + spectra_2 = self._data_augmentation(spectra_2) return spectra_1, spectra_2, meta_1, meta_2, targets def _tensorize_all(self, spectrum_pairs): @@ -160,6 +163,40 @@ def _get_spectrum_with_inchikey(self, inchikey: str) -> Spectrum: raise ValueError("No matching inchikey found (note: expected first 14 characters)") return self.spectrums[np.random.choice(matching_spectrum_id)] + def _data_augmentation(self, spectra_tensors): + for i in range(spectra_tensors.shape[0]): + spectra_tensors[i, :] = self._data_augmentation_spectrum(spectra_tensors[i, :]) + return spectra_tensors + + def _data_augmentation_spectrum(self, spectrum_tensor): + """Data augmentation. 
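+
+        In short, up to three augmentations are applied to a single binned
+        spectrum: random removal of a fraction of the peaks with intensities
+        below augment_removal_max, multiplicative jitter of the intensities
+        controlled by augment_intensity, and (sketched below but still
+        disabled) addition of low-intensity noise peaks into empty bins.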
+ + Parameters + ---------- + spectrum_tensor + Spectrum in Pytorch tensor form. + """ + # Augmentation 1: peak removal (peaks < augment_removal_max) + if self.settings.augment_removal_max or self.settings.augment_removal_intensity: + # TODO: Factor out function with documentation + example? + + indices_select = torch.where((spectrum_tensor > 0) + & (spectrum_tensor < self.settings.augment_removal_max))[0] + removal_part = np.random.random(1) * self.settings.augment_removal_max + indices = np.random.choice(indices_select, int(np.ceil((1 - removal_part)*len(indices_select)))) + if len(indices) > 0: + spectrum_tensor[indices] = 0 + + # Augmentation 2: Change peak intensities + if self.settings.augment_intensity: + # TODO: Factor out function with documentation + example? + spectrum_tensor = spectrum_tensor * (1 - self.settings.augment_intensity * 2 * (torch.rand(spectrum_tensor.shape) - 0.5)) + + # Augmentation 3: Peak addition + #if self.settings.augment_noise_max and self.settings.augment_noise_max > 0: + # idx, values = self._peak_addition(idx, values) + return spectrum_tensor + def tensorize_spectra( spectra, @@ -184,6 +221,7 @@ def tensorize_spectra( if min_mz <= mz < max_mz: bin_index = int((mz - min_mz) / mz_bin_width) binned_spectra[i, bin_index] += intensity ** intensity_scaling + # TODO: Consider taking the max instead return binned_spectra, metadata_tensors diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 8c09200f..52487926 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -172,31 +172,59 @@ def forward(self, spectra_tensors, metadata_tensors): ### Model training -def train(model, data_generator, num_epochs, learning_rate, - lambda_l1=1e-6, - lambda_l2=1e-6): +def train(model: torch.nn.Module, + data_generator, + num_epochs: int, + learning_rate: float, + val_generator=None, + early_stopping=True, + patience: int = 10, + checkpoint_filename: str = None, + lambda_l1: float = 1e-6, + lambda_l2: float = 1e-6): + """Train a model with given parameters. + + Parameters + ---------- + model + The neural network model to train. + data_generator + An iterator for training data batches. + num_epochs + Number of epochs for training. + learning_rate + Learning rate for the optimizer. + val_generator (iterator, optional) + An iterator for validation data batches. + early_stopping + Whether to use early stopping. + patience + Number of epochs to wait for improvement before stopping. + checkpoint_filename + File path to save the model checkpoint. + lambda_l1 + L1 regularization strength. + lambda_l2 + L2 regularization strength. 
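+
+    A minimal usage sketch (the data generators and the model construction are
+    assumed to exist elsewhere; the names here are only illustrative):
+
+        losses, val_losses, targets = train(model, train_generator,
+                                             num_epochs=100, learning_rate=1e-4,
+                                             val_generator=val_generator,
+                                             checkpoint_filename="checkpoint.pt")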
+ """ # pylint: disable=too-many-arguments, too-many-locals - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Training will happen on {device}.") - - # Move model to device + device = initialize_device() + criterion, optimizer = setup_model(model, learning_rate, device) model.to(device) - criterion = mse_away_from_mean # alternative for nn.MSELoss() - optimizer = optim.Adam(model.parameters(), lr=learning_rate) - - model.train(True) + losses, val_losses, collection_targets = [], [], [] + min_val_loss = np.inf + epochs_no_improve = 0 - losses = [] - collection_targets = [] for epoch in range(num_epochs): + model.train(True) with tqdm(data_generator, unit="batch", mininterval=0) as training: training.set_description(f"Epoch {epoch}") batch_losses = [] for spectra_1, spectra_2, meta_1, meta_2, targets in training: # For debugging: keep track of biases collection_targets.extend(targets) - + optimizer.zero_grad() # Forward pass @@ -205,14 +233,6 @@ def train(model, data_generator, num_epochs, learning_rate, # Calculate loss loss = criterion(outputs, targets.to(device)) - - optimizer.zero_grad() - - # Forward pass - outputs = model(spectra_1, spectra_2, meta_1, meta_2) - - # Calculate loss - loss = criterion(outputs, targets) if lambda_l1 > 0 or lambda_l2 > 0: loss += l1_regularization(model, lambda_l1) + l2_regularization(model, lambda_l2) batch_losses.append(float(loss)) @@ -226,10 +246,61 @@ def train(model, data_generator, num_epochs, learning_rate, loss=float(loss), ) + if val_generator is not None: + model.eval() + val_batch_losses = [] + for spectra_1, spectra_2, meta_1, meta_2, targets in val_generator: + predictions = model(spectra_1.to(device), spectra_2.to(device), + meta_1.to(device), meta_2.to(device)) + loss = criterion(predictions, targets.to(device)) + val_batch_losses.append(float(loss)) + val_loss = np.mean(val_batch_losses) + val_losses.append(val_loss) + if val_loss < min_val_loss: + if checkpoint_filename: + print("Saving checkpoint model.") + torch.save(model, checkpoint_filename) + epochs_no_improve = 0 + min_val_loss = val_loss + else: + epochs_no_improve += 1 + if early_stopping and epochs_no_improve >= patience: + print("Early stopping!") + break + # Print statistics print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(batch_losses):.4f}") losses.append(np.mean(batch_losses)) - return losses, collection_targets + if val_generator is not None: + print(f"Validation Loss: {val_loss:.4f}") + + return losses, val_losses, collection_targets + + +def initialize_device(): + """Initialize and return the device for training.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Training will happen on {device}.") + return device + + +def setup_model(model, learning_rate, device): + """ + Set up the model for training. + + Parameters + ---------- + model + The model to be set up. + learning_rate + Learning rate for the optimizer. + device + The device to be used for training. 
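+
+    Returns
+    -------
+    criterion
+        The loss function used during training (mse_away_from_mean).
+    optimizer
+        An Adam optimizer over the model parameters with the given learning rate.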
+ """ + model.to(device) + criterion = mse_away_from_mean # Alternative for nn.MSELoss() + optimizer = optim.Adam(model.parameters(), lr=learning_rate) + return criterion, optimizer ### Helper functions diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 2d9a8af0..39db75ca 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -124,22 +124,36 @@ def test_model_training(simple_training_spectra): }) scp_simple, _ = select_compound_pairs_wrapper(simple_training_spectra, settings) - # Create generator + # Create generators train_generator_simple = DataGeneratorPytorch( spectrums=simple_training_spectra, - min_mz=0, max_mz=200, mz_bin_width=0.1, intensity_scaling=0.5, + min_mz=0, max_mz=200, mz_bin_width=0.2, intensity_scaling=0.5, metadata_vectorizer=None, selected_compound_pairs=scp_simple, batch_size=2, num_turns=20, ) + val_generator_simple = DataGeneratorPytorch( + spectrums=simple_training_spectra, + min_mz=0, max_mz=200, mz_bin_width=0.2, intensity_scaling=0.5, + metadata_vectorizer=None, + selected_compound_pairs=scp_simple, + batch_size=2, + num_turns=2, + use_fixed_set=True, + ) + # Create and train model - model_simple = SiameseSpectralModel(peak_inputs=2000, additional_inputs=0, train_binning_layer=False) - losses, collection_targets = train(model_simple, train_generator_simple, 50, learning_rate=0.001, lambda_l1=0, lambda_l2=0) + model_simple = SiameseSpectralModel(peak_inputs=1000, additional_inputs=0, train_binning_layer=False) + losses, val_losses, collection_targets = train(model_simple, train_generator_simple, + val_generator=val_generator_simple, + num_epochs=25, + learning_rate=0.001, lambda_l1=0, lambda_l2=0) + assert len(losses) == len(val_losses) == 25 # Check if model trained to at least an OK result - assert np.mean(losses[-10:]) < 0.02, "Training was not succesfull!" + assert np.mean(losses[-5:]) < 0.02, "Training was not succesfull!" # Check if bias in data is handled correctly - assert (np.array(collection_targets) == 1).sum() == 1000 - assert (np.array(collection_targets) < .2).sum() == 1000 + assert (np.array(collection_targets) == 1).sum() == 500 + assert (np.array(collection_targets) < .2).sum() == 500 From dbbdbe41902c228c34b74ea5421bb70873d20245 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Sat, 13 Jan 2024 12:37:45 +0100 Subject: [PATCH 37/40] fix model training and test --- ms2deepscore/models/SiameseSpectralModel.py | 7 ++++--- tests/test_siamese_spectra_model.py | 6 ++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ms2deepscore/models/SiameseSpectralModel.py b/ms2deepscore/models/SiameseSpectralModel.py index 52487926..70b6494d 100644 --- a/ms2deepscore/models/SiameseSpectralModel.py +++ b/ms2deepscore/models/SiameseSpectralModel.py @@ -181,7 +181,8 @@ def train(model: torch.nn.Module, patience: int = 10, checkpoint_filename: str = None, lambda_l1: float = 1e-6, - lambda_l2: float = 1e-6): + lambda_l2: float = 1e-6, + progress_bar: bool = True): """Train a model with given parameters. 
Parameters @@ -218,7 +219,7 @@ def train(model: torch.nn.Module, for epoch in range(num_epochs): model.train(True) - with tqdm(data_generator, unit="batch", mininterval=0) as training: + with tqdm(data_generator, unit="batch", mininterval=0, disable=(not progress_bar)) as training: training.set_description(f"Epoch {epoch}") batch_losses = [] for spectra_1, spectra_2, meta_1, meta_2, targets in training: @@ -245,6 +246,7 @@ def train(model: torch.nn.Module, training.set_postfix( loss=float(loss), ) + losses.append(np.mean(batch_losses)) if val_generator is not None: model.eval() @@ -270,7 +272,6 @@ def train(model: torch.nn.Module, # Print statistics print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {np.mean(batch_losses):.4f}") - losses.append(np.mean(batch_losses)) if val_generator is not None: print(f"Validation Loss: {val_loss:.4f}") diff --git a/tests/test_siamese_spectra_model.py b/tests/test_siamese_spectra_model.py index 39db75ca..f58203f5 100644 --- a/tests/test_siamese_spectra_model.py +++ b/tests/test_siamese_spectra_model.py @@ -149,11 +149,13 @@ def test_model_training(simple_training_spectra): losses, val_losses, collection_targets = train(model_simple, train_generator_simple, val_generator=val_generator_simple, num_epochs=25, - learning_rate=0.001, lambda_l1=0, lambda_l2=0) + learning_rate=0.001, lambda_l1=0, lambda_l2=0, + progress_bar=False, early_stopping=False, + ) assert len(losses) == len(val_losses) == 25 # Check if model trained to at least an OK result - assert np.mean(losses[-5:]) < 0.02, "Training was not succesfull!" + assert np.mean(losses[-5:]) < 0.03, "Training was not succesfull!" # Check if bias in data is handled correctly assert (np.array(collection_targets) == 1).sum() == 500 assert (np.array(collection_targets) < .2).sum() == 500 From fba6920a199add569d94a0f1bc0dc1f918bd2316 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 15 Jan 2024 21:24:44 +0100 Subject: [PATCH 38/40] add more data augmentation and speed up using numba --- ms2deepscore/data_generators.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 312f6f29..50b0469e 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -193,8 +193,14 @@ def _data_augmentation_spectrum(self, spectrum_tensor): spectrum_tensor = spectrum_tensor * (1 - self.settings.augment_intensity * 2 * (torch.rand(spectrum_tensor.shape) - 0.5)) # Augmentation 3: Peak addition - #if self.settings.augment_noise_max and self.settings.augment_noise_max > 0: - # idx, values = self._peak_addition(idx, values) + if self.settings.augment_noise_max and self.settings.augment_noise_max > 0: + indices_select = torch.where(spectrum_tensor == 0)[0] + if len(indices_select) > self.settings.augment_noise_max: + indices_noise = np.random.choice(indices_select, + np.random.randint(0, self.settings.augment_noise_max), + replace=False, + ) + spectrum_tensor[indices_noise] = self.settings.augment_noise_intensity * torch.rand(len(indices_noise)) return spectrum_tensor @@ -217,14 +223,32 @@ def tensorize_spectra( binned_spectra = torch.zeros((len(spectra), num_bins)) for i, spectrum in enumerate(spectra): - for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): - if min_mz <= mz < max_mz: - bin_index = int((mz - min_mz) / mz_bin_width) - binned_spectra[i, bin_index] += intensity ** intensity_scaling - # TODO: Consider taking the max instead + #for mz, intensity in 
zip(spectrum.peaks.mz, spectrum.peaks.intensities): + # if min_mz <= mz < max_mz: + # bin_index = int((mz - min_mz) / mz_bin_width) + # Sum all intensties for all peaks in each bin + # binned_spectra[i, bin_index] += intensity ** intensity_scaling + binned_spectra[i, :] = torch.tensor(vectorize_spectrum(spectrum.peaks.mz, spectrum.peaks.intensities, + min_mz, max_mz, mz_bin_width, intensity_scaling + )) return binned_spectra, metadata_tensors +import numba +@numba.jit(nopython=True) +def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): + num_bins = int((max_mz - min_mz) / mz_bin_width) + vector = np.zeros((num_bins)) + for mz, intensity in zip(mz_array, intensities_array): + if min_mz <= mz < max_mz: + bin_index = int((mz - min_mz) / mz_bin_width) + # Take max intensity peak per bin + vector[bin_index] = max(vector[bin_index], intensity ** intensity_scaling) + # Alternative: Sum all intensties for all peaks in each bin + # vector[bin_index] += intensity ** intensity_scaling + return vector + + class SpectrumPair(NamedTuple): """ Represents a pair of binned spectrums From c94f2b83654664842b3c277f7db4d54d84b609a8 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 15 Jan 2024 21:25:23 +0100 Subject: [PATCH 39/40] linting --- ms2deepscore/data_generators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 50b0469e..179619ec 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -2,6 +2,7 @@ """ from typing import Iterator, List, NamedTuple, Optional from matchms import Spectrum +import numba import numpy as np import pandas as pd import torch @@ -234,7 +235,6 @@ def tensorize_spectra( return binned_spectra, metadata_tensors -import numba @numba.jit(nopython=True) def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): num_bins = int((max_mz - min_mz) / mz_bin_width) From b49abbcf0dc63c8fb5b4fb8786afc23176c5996e Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Mon, 15 Jan 2024 21:33:33 +0100 Subject: [PATCH 40/40] linting --- ms2deepscore/data_generators.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ms2deepscore/data_generators.py b/ms2deepscore/data_generators.py index 179619ec..cd5c1390 100644 --- a/ms2deepscore/data_generators.py +++ b/ms2deepscore/data_generators.py @@ -224,19 +224,16 @@ def tensorize_spectra( binned_spectra = torch.zeros((len(spectra), num_bins)) for i, spectrum in enumerate(spectra): - #for mz, intensity in zip(spectrum.peaks.mz, spectrum.peaks.intensities): - # if min_mz <= mz < max_mz: - # bin_index = int((mz - min_mz) / mz_bin_width) - # Sum all intensties for all peaks in each bin - # binned_spectra[i, bin_index] += intensity ** intensity_scaling binned_spectra[i, :] = torch.tensor(vectorize_spectrum(spectrum.peaks.mz, spectrum.peaks.intensities, - min_mz, max_mz, mz_bin_width, intensity_scaling + min_mz, max_mz, mz_bin_width, intensity_scaling )) return binned_spectra, metadata_tensors @numba.jit(nopython=True) def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): + """Fast function to convert mz and intensity arrays into dense spectrum vector.""" + # pylint: disable=too-many-arguments num_bins = int((max_mz - min_mz) / mz_bin_width) vector = np.zeros((num_bins)) for mz, intensity in zip(mz_array, intensities_array):