Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change the default additional metadata to retention_time #198

Merged
merged 6 commits into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 1.2.2
### Fixed
- Set version of matchmsextras to 0.4.0, to fix dependency issue
- Fix test with wrong sklearn version.

### Changed
- Set default additional metadata from rtinseconds to retention_time

## 1.2.1
### Fixed
- Fix bug in downloading models from command line
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ optional arguments:
--download This will download the most up to date model and library. The model will be stored in the folder given as the second argument. The model will be downloaded in the ionization mode specified under --mode
--results RESULTS The folder in which the results should be stored. The default is a new results folder in the folder with the spectra
--filter_ionmode Filter out all spectra that are not in the specified ion-mode. The ion mode can be specified by using --ionmode
--addional_metadata Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id
--additional_metadata Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id
```

## Build MS2Query into other tools
Expand Down
4 changes: 2 additions & 2 deletions ms2query/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def command_line():
help="Filter out all spectra that are not in the specified ion-mode. "
"The ion mode can be specified by using --ionmode")
parser.add_argument("--additional_metadata", action="store",
default=("rtinseconds", "feature_id",),
default=("retention_time", "feature_id",),
nargs="+",
type=str,
help="Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id")
help="Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id")
args = parser.parse_args()
ms2query_library_files_directory = args.library
ms2_spectra_location = args.spectra
Expand Down
2 changes: 1 addition & 1 deletion ms2query/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.2.1'
__version__ = '1.2.2'
4 changes: 2 additions & 2 deletions ms2query/create_new_library/train_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import os
from spec2vec.model_building import train_new_word2vec_model
from ms2query.create_new_library.train_ms2deepscore import train_ms2deepscore_wrapper
from ms2query.create_new_library.train_ms2query_model import train_ms2query_model
from ms2query.create_new_library.train_ms2query_model import train_ms2query_model, convert_to_onnx_model
from ms2query.create_new_library.library_files_creator import LibraryFilesCreator
from ms2query.utils import load_matchms_spectrum_objects_from_file, convert_to_onnx_model
from ms2query.utils import load_matchms_spectrum_objects_from_file
from ms2query.clean_and_filter_spectra import create_spectrum_documents, clean_normalize_and_split_annotated_spectra


Expand Down
18 changes: 17 additions & 1 deletion ms2query/create_new_library/train_ms2query_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import os
from typing import List
import pandas as pd
from onnxconverter_common import FloatTensorType
from skl2onnx import convert_sklearn
from tqdm import tqdm
from matchms import Spectrum
from sklearn.ensemble import RandomForestRegressor
Expand All @@ -15,7 +17,7 @@
from ms2query.create_new_library.library_files_creator import LibraryFilesCreator
from ms2query.create_new_library.split_data_for_training import split_spectra_on_inchikeys, split_training_and_validation_spectra
from ms2query.create_new_library.calculate_tanimoto_scores import calculate_tanimoto_scores_from_smiles
from ms2query.utils import save_pickled_file
from ms2query.utils import save_pickled_file, return_non_existing_file_name


class DataCollectorForTraining():
Expand Down Expand Up @@ -142,3 +144,17 @@ def train_ms2query_model(training_spectra,
# Train MS2Query model
ms2query_model = train_random_forest(training_scores, training_labels)
return ms2query_model


def convert_to_onnx_model(random_forest_model, file_name=None):
    """Convert a random forest model to an ONNX model and optionally save it.

    The random forest model is stored as an ONNX model for backwards
    compatibility.

    Args:
        random_forest_model: The scikit-learn model to convert. Its
            ``n_features_in_`` attribute sets the number of input columns
            of the ONNX model.
        file_name: Optional location to write the serialized ONNX model to.
            When None (the default) nothing is written to disk.

    Returns:
        The ONNX model produced by ``convert_sklearn``.
    """
    # The number of rows is left open (None); the number of columns follows the model.
    input_type = FloatTensorType([None, random_forest_model.n_features_in_])
    onnx = convert_sklearn(random_forest_model,
                           initial_types=[("input", input_type)],
                           target_opset=12)
    if file_name is not None:
        # NOTE(review): presumably returns a file name that is not in use yet,
        # so an existing file is not overwritten - confirm in ms2query.utils.
        file_name = return_non_existing_file_name(file_name)
        with open(file_name, "wb") as file:
            file.write(onnx.SerializeToString())
    return onnx
16 changes: 0 additions & 16 deletions ms2query/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import numpy as np
from matchms import importing
from spec2vec.Spec2Vec import Spectrum
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession


Expand Down Expand Up @@ -216,20 +214,6 @@ def __init__(self,
self.filter_on_ion_mode = filter_on_ion_mode


def convert_to_onnx_model(random_forest_model, file_name=None):
    """Convert a random forest model to an ONNX model and optionally save it.

    The random forest model is stored as an ONNX model for backwards
    compatibility.

    Args:
        random_forest_model: The scikit-learn model to convert. Its
            ``n_features_in_`` attribute sets the number of input columns
            of the ONNX model.
        file_name: Optional location to write the serialized ONNX model to.
            When None (the default) nothing is written to disk.

    Returns:
        The ONNX model produced by ``convert_sklearn``.
    """
    # The number of rows is left open (None); the number of columns follows the model.
    input_type = FloatTensorType([None, random_forest_model.n_features_in_])
    onnx = convert_sklearn(random_forest_model,
                           initial_types=[("input", input_type)],
                           target_opset=12)
    if file_name is not None:
        # NOTE(review): presumably returns a file name that is not in use yet,
        # so an existing file is not overwritten - confirm against its definition.
        file_name = return_non_existing_file_name(file_name)
        with open(file_name, "wb") as file:
            file.write(onnx.SerializeToString())
    return onnx


def predict_onnx_model(random_forest_onnx_model: InferenceSession, input_values):
"""Makes predictions for an onnx model"""
# input_name = random_forest_onnx_model.get_inputs()[0].name
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"ms2deepscore",
"gensim>=4.0.0",
"pandas>=1.2.5,<2.0.0",
"matchmsextras>=0.4.0",
"matchmsextras==0.4.0",
"pubchempy", #This is a dependency for matchmsextras, which is missing in setup
"tqdm",
"matplotlib",
Expand Down
Binary file not shown.
16 changes: 12 additions & 4 deletions tests/test_train_ms2query_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import os

import numpy as np
import pytest
import sys
import pandas as pd
from ms2query.create_new_library.train_ms2query_model import \
DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model
from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, convert_to_onnx_model
DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model, \
convert_to_onnx_model
from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, load_ms2query_model, \
predict_onnx_model
from onnxruntime import InferenceSession
from ms2query.utils import predict_onnx_model
from ms2query.ms2library import MS2Library
Expand Down Expand Up @@ -76,15 +80,19 @@ def test_calculate_all_tanimoto_scores(tmp_path, ms2library, query_spectra):
pd.testing.assert_frame_equal(result, expected_result, check_dtype=False)


def test_train_random_forest():
def test_train_and_save_random_forest():
training_scores, training_labels = load_pickled_file(os.path.join(
os.path.split(os.path.dirname(__file__))[0],
"tests/test_files/test_files_train_ms2query_nn",
"expected_train_and_val_data.pickle"))[:2]
ms2query_model = train_random_forest(training_scores, training_labels)
onnx_model = convert_to_onnx_model(ms2query_model)
onnx_model_session = InferenceSession(onnx_model.SerializeToString())
predictions = predict_onnx_model(onnx_model_session, training_scores.values)
predictions_onnx_model = predict_onnx_model(onnx_model_session, training_scores.values)

# check if saving onnx model works
predictions_sklearn_model = ms2query_model.predict(training_scores.values.astype(np.float32))
assert np.allclose(predictions_onnx_model, predictions_sklearn_model)


@pytest.mark.integration
Expand Down
23 changes: 1 addition & 22 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
from io import StringIO
from typing import List

import numpy as np
import pandas as pd
import pytest
from matchms import Spectrum
from ms2query.utils import (add_unknown_charges_to_spectra,
load_matchms_spectrum_objects_from_file,
load_pickled_file,
convert_to_onnx_model, load_ms2query_model, predict_onnx_model)
load_matchms_spectrum_objects_from_file)


def test_convert_files_to_matchms_spectrum_objects_unknown_file(tmp_path):
Expand Down Expand Up @@ -51,24 +48,6 @@ def test_add_unknown_charges_to_spectra(hundred_test_spectra):
assert spectrum.get("charge") == 2, "The charge is expected to be 2"


def test_save_as_onnx_model(tmp_path):
    """Check that a model saved as ONNX predicts the same as the pickled random forest."""
    # Parent of the directory that holds this test file
    parent_dir = os.path.split(os.path.dirname(__file__))[0]
    test_files_dir = os.path.join(parent_dir, 'tests/test_files/')

    pickled_forest_path = os.path.join(test_files_dir, 'general_test_files', "test_ms2q_rf_model.pickle")
    random_forest = load_pickled_file(pickled_forest_path)

    # Only the first element of the pickled data is used as model input
    training_data_path = os.path.join(parent_dir,
                                      "tests/test_files/test_files_train_ms2query_nn",
                                      "expected_train_and_val_data.pickle")
    input_scores = load_pickled_file(training_data_path)[0]

    # Save as ONNX in the temporary directory and load it back
    onnx_file = os.path.join(tmp_path, "rf_model.onnx")
    convert_to_onnx_model(random_forest, onnx_file)
    reloaded_model = load_ms2query_model(onnx_file)

    onnx_predictions = predict_onnx_model(reloaded_model, input_scores.values)
    sklearn_predictions = random_forest.predict(input_scores.values.astype(np.float32))
    assert np.allclose(onnx_predictions, sklearn_predictions)


def check_correct_results_csv_file(dataframe_found: pd.DataFrame,
expected_headers: List[str],
nr_of_rows_to_check=2):
Expand Down
Loading