diff --git a/CHANGELOG.md b/CHANGELOG.md index 62e896c7..1dbb7ce7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 1.2.2 +### fixed +- Set version of matchmsextras to 0.4.0, to fix dependency issue +- Fix test with wrong sklearn version. + +### Changed +- Set default additional metadata from rtinseconds to retention_time + ## 1.2.1 ### fixed - Fix bug in downloading models from command line diff --git a/README.md b/README.md index 6e8952d0..505ac943 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ optional arguments: --download This will download the most up to date model and library.The model will be stored in the folder given as the second argumentThe model will be downloaded in the in the ionization mode specified under --mode --results RESULTS The folder in which the results should be stored. The default is a new results folder in the folder with the spectra --filter_ionmode Filter out all spectra that are not in the specified ion-mode. The ion mode can be specified by using --ionmode - --addional_metadata Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id + --addional_metadata Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id ``` ## Build MS2Query into other tools diff --git a/ms2query/__init__.py b/ms2query/__init__.py index 41c103b6..1c225e2d 100644 --- a/ms2query/__init__.py +++ b/ms2query/__init__.py @@ -51,10 +51,10 @@ def command_line(): help="Filter out all spectra that are not in the specified ion-mode. " "The ion mode can be specified by using --ionmode") parser.add_argument("--additional_metadata", action="store", - default=("rtinseconds", "feature_id",), + default=("retention_time", "feature_id",), nargs="+", type=str, - help="Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id") + help="Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id") args = parser.parse_args() ms2query_library_files_directory = args.library ms2_spectra_location = args.spectra diff --git a/ms2query/__version__.py b/ms2query/__version__.py index 3f262a63..923b9879 100644 --- a/ms2query/__version__.py +++ b/ms2query/__version__.py @@ -1 +1 @@ -__version__ = '1.2.1' +__version__ = '1.2.2' diff --git a/ms2query/create_new_library/train_models.py b/ms2query/create_new_library/train_models.py index d23cf057..fb3e012e 100644 --- a/ms2query/create_new_library/train_models.py +++ b/ms2query/create_new_library/train_models.py @@ -6,9 +6,9 @@ import os from spec2vec.model_building import train_new_word2vec_model from ms2query.create_new_library.train_ms2deepscore import train_ms2deepscore_wrapper -from ms2query.create_new_library.train_ms2query_model import train_ms2query_model +from ms2query.create_new_library.train_ms2query_model import train_ms2query_model, convert_to_onnx_model from ms2query.create_new_library.library_files_creator import LibraryFilesCreator -from ms2query.utils import load_matchms_spectrum_objects_from_file, convert_to_onnx_model +from ms2query.utils import load_matchms_spectrum_objects_from_file from ms2query.clean_and_filter_spectra import create_spectrum_documents, clean_normalize_and_split_annotated_spectra diff --git a/ms2query/create_new_library/train_ms2query_model.py b/ms2query/create_new_library/train_ms2query_model.py index 78f3825b..5500a282 100644 --- a/ms2query/create_new_library/train_ms2query_model.py +++ b/ms2query/create_new_library/train_ms2query_model.py @@ -6,6 +6,8 @@ import os from typing import List import pandas as pd +from onnxconverter_common import FloatTensorType +from skl2onnx import convert_sklearn from tqdm import tqdm from matchms import Spectrum from sklearn.ensemble import RandomForestRegressor @@ -15,7 +17,7 @@ from ms2query.create_new_library.library_files_creator import LibraryFilesCreator from ms2query.create_new_library.split_data_for_training import split_spectra_on_inchikeys, split_training_and_validation_spectra from ms2query.create_new_library.calculate_tanimoto_scores import calculate_tanimoto_scores_from_smiles -from ms2query.utils import save_pickled_file +from ms2query.utils import save_pickled_file, return_non_existing_file_name class DataCollectorForTraining(): @@ -142,3 +144,17 @@ def train_ms2query_model(training_spectra, # Train MS2Query model ms2query_model = train_random_forest(training_scores, training_labels) return ms2query_model + + +def convert_to_onnx_model(random_forest_model, file_name = None): + """The randomforest model is stored as an onnx model for backwards compatability""" + FloatTensorType([None, 5]) + onnx = convert_sklearn(random_forest_model, initial_types=[("input", + FloatTensorType([None, random_forest_model.n_features_in_]))], + target_opset=12) + if file_name is not None: + file_name = return_non_existing_file_name(file_name) + + with open(file_name, "wb") as file: + file.write(onnx.SerializeToString()) + return onnx \ No newline at end of file diff --git a/ms2query/utils.py b/ms2query/utils.py index 53cb5d62..291c68fc 100644 --- a/ms2query/utils.py +++ b/ms2query/utils.py @@ -5,8 +5,6 @@ import numpy as np from matchms import importing from spec2vec.Spec2Vec import Spectrum -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import FloatTensorType from onnxruntime import InferenceSession @@ -216,20 +214,6 @@ def __init__(self, self.filter_on_ion_mode = filter_on_ion_mode -def convert_to_onnx_model(random_forest_model, file_name = None): - """The randomforest model is stored as an onnx model for backwards compatability""" - FloatTensorType([None, 5]) - onnx = convert_sklearn(random_forest_model, initial_types=[("input", - FloatTensorType([None, random_forest_model.n_features_in_]))], - target_opset=12) - if file_name is not None: - file_name = return_non_existing_file_name(file_name) - - with open(file_name, "wb") as file: - file.write(onnx.SerializeToString()) - return onnx - - def predict_onnx_model(random_forest_onnx_model: InferenceSession, input_values): """Makes predictions for an onnx model""" # input_name = random_forest_onnx_model.get_inputs()[0].name diff --git a/setup.py b/setup.py index 9d9f8393..7865cffa 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ "ms2deepscore", "gensim>=4.0.0", "pandas>=1.2.5,<2.0.0", - "matchmsextras>=0.4.0", + "matchmsextras==0.4.0", "pubchempy", #This is a dependency for matchmsextras, which is missing in setup "tqdm", "matplotlib", diff --git a/tests/test_files/general_test_files/test_ms2q_rf_model.pickle b/tests/test_files/general_test_files/test_ms2q_rf_model.pickle deleted file mode 100644 index 2ac757d1..00000000 Binary files a/tests/test_files/general_test_files/test_ms2q_rf_model.pickle and /dev/null differ diff --git a/tests/test_train_ms2query_model.py b/tests/test_train_ms2query_model.py index 72941fda..b0efe5ec 100644 --- a/tests/test_train_ms2query_model.py +++ b/tests/test_train_ms2query_model.py @@ -1,10 +1,14 @@ import os + +import numpy as np import pytest import sys import pandas as pd from ms2query.create_new_library.train_ms2query_model import \ - DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model -from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, convert_to_onnx_model + DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model, \ + convert_to_onnx_model +from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, load_ms2query_model, \ + predict_onnx_model from onnxruntime import InferenceSession from ms2query.utils import predict_onnx_model from ms2query.ms2library import MS2Library @@ -76,7 +80,7 @@ def test_calculate_all_tanimoto_scores(tmp_path, ms2library, query_spectra): pd.testing.assert_frame_equal(result, expected_result, check_dtype=False) -def test_train_random_forest(): +def test_train_and_save_random_forest(): training_scores, training_labels = load_pickled_file(os.path.join( os.path.split(os.path.dirname(__file__))[0], "tests/test_files/test_files_train_ms2query_nn", @@ -84,7 +88,11 @@ def test_train_random_forest(): ms2query_model = train_random_forest(training_scores, training_labels) onnx_model = convert_to_onnx_model(ms2query_model) onnx_model_session = InferenceSession(onnx_model.SerializeToString()) - predictions = predict_onnx_model(onnx_model_session, training_scores.values) + predictions_onnx_model = predict_onnx_model(onnx_model_session, training_scores.values) + + # check if saving onnx model works + predictions_sklearn_model = ms2query_model.predict(training_scores.values.astype(np.float32)) + assert np.allclose(predictions_onnx_model, predictions_sklearn_model) @pytest.mark.integration diff --git a/tests/test_utils.py b/tests/test_utils.py index 788105fc..4cf9778c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,14 +2,11 @@ from io import StringIO from typing import List -import numpy as np import pandas as pd import pytest from matchms import Spectrum from ms2query.utils import (add_unknown_charges_to_spectra, - load_matchms_spectrum_objects_from_file, - load_pickled_file, - convert_to_onnx_model, load_ms2query_model, predict_onnx_model) + load_matchms_spectrum_objects_from_file) def test_convert_files_to_matchms_spectrum_objects_unknown_file(tmp_path): @@ -51,24 +48,6 @@ def test_add_unknown_charges_to_spectra(hundred_test_spectra): assert spectrum.get("charge") == 2, "The charge is expected to be 2" -def test_save_as_onnx_model(tmp_path): - path_to_test_dir = os.path.join( - os.path.split(os.path.dirname(__file__))[0], - 'tests/test_files/') - rf_model_file = os.path.join(path_to_test_dir, 'general_test_files', "test_ms2q_rf_model.pickle") - rf_model = load_pickled_file(rf_model_file) - expected_result = load_pickled_file(os.path.join( - os.path.split(os.path.dirname(__file__))[0], - "tests/test_files/test_files_train_ms2query_nn", - "expected_train_and_val_data.pickle"))[0] - new_model = os.path.join(tmp_path, "rf_model.onnx") - convert_to_onnx_model(rf_model, new_model) - ms2query_model = load_ms2query_model(new_model) - result = predict_onnx_model(ms2query_model, expected_result.values) - original_result = rf_model.predict(expected_result.values.astype(np.float32)) - assert np.allclose(result, original_result) - - def check_correct_results_csv_file(dataframe_found: pd.DataFrame, expected_headers: List[str], nr_of_rows_to_check=2):