From 89580465a5c91b0edf21c1043f2cb05465b50c84 Mon Sep 17 00:00:00 2001 From: issar Date: Wed, 24 Jul 2024 20:47:59 +0200 Subject: [PATCH] Modify Projection to Random Gaussian --- src/ann_solo/config.py | 4 +++ src/ann_solo/spectral_library.py | 33 ++++++++++++++----- src/ann_solo/spectrum.py | 54 +++++++++++++++++++++++++++++++- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/src/ann_solo/config.py b/src/ann_solo/config.py index c92ba55..d4b7f1e 100644 --- a/src/ann_solo/config.py +++ b/src/ann_solo/config.py @@ -179,6 +179,10 @@ def __init__(self) -> None: self._parser.add_argument( '--bin_size', default=0.04, type=float, help='ANN vector bin width (default: %(default)s Da)') + # ANN vector length after gaussian random projection. + self._parser.add_argument( + '--low_dim', default=400, type=int, + help='ANN vector length (default: %(default)s)') # ANN vector length after hashing. self._parser.add_argument( '--hash_len', default=800, type=int, diff --git a/src/ann_solo/spectral_library.py b/src/ann_solo/spectral_library.py index ecceb82..4f5edd9 100755 --- a/src/ann_solo/spectral_library.py +++ b/src/ann_solo/spectral_library.py @@ -13,12 +13,14 @@ import numexpr as ne import numpy as np import tqdm +from sklearn.random_projection import SparseRandomProjection from spectrum_utils.spectrum import MsmsSpectrum from ann_solo import reader from ann_solo import spectrum_match from ann_solo import utils from ann_solo.config import config +from ann_solo.spectrum import get_dim from ann_solo.spectrum import process_spectrum from ann_solo.spectrum import spectrum_to_vector from ann_solo.spectrum import SpectrumSpectrumMatch @@ -115,6 +117,12 @@ def __init__(self, filename: str) -> None: if create_ann_charges: self._create_ann_indexes(create_ann_charges) + # Gaussian vector projection + _vec_len, _, _ = get_dim(config.min_mz, config.max_mz, config.bin_size) + self._transformation = ( + SparseRandomProjection(config.low_dim, random_state=0).fit( + np.zeros((1, 
_vec_len))).components_.astype(np.float32).T) + def _get_hyperparameter_hash(self) -> str: """ Get a unique string representation of the hyperparameters used to @@ -155,10 +163,15 @@ def _create_ann_indexes(self, charges: List[int]) -> None: smoothing=0.1): charge = lib_spectrum.precursor_charge if charge in charge_vectors.keys(): - spectrum_to_vector(process_spectrum(lib_spectrum, True), - config.min_mz, config.max_mz, - config.bin_size, config.hash_len, True, - charge_vectors[charge][i[charge]]) + charge_vectors[charge][i[charge]] = spectrum_to_vector( + process_spectrum(lib_spectrum, True), + self._transformation, + config.min_mz, + config.max_mz, + config.bin_size, + config.low_dim, + norm=True, + ) i[charge] += 1 # Build an individual FAISS index per charge. logging.info('Build the spectral library ANN indexes') @@ -435,9 +448,15 @@ def _get_library_candidates(self, query_spectra: List[MsmsSpectrum], query_vectors = np.zeros((len(query_spectra), config.hash_len), np.float32) for i, query_spectrum in enumerate(query_spectra): - spectrum_to_vector( - query_spectrum, config.min_mz, config.max_mz, - config.bin_size, config.hash_len, True, query_vectors[i]) + query_vectors[i] = spectrum_to_vector( + query_spectrum, + self._transformation, + config.min_mz, + config.max_mz, + config.bin_size, + config.low_dim, + norm=True, + ) mask = np.zeros_like(candidate_filters) # noinspection PyArgumentList for mask_i, ann_filter in zip(mask, ann_index.search( diff --git a/src/ann_solo/spectrum.py b/src/ann_solo/spectrum.py index 8b0e4ec..dfecb54 100644 --- a/src/ann_solo/spectrum.py +++ b/src/ann_solo/spectrum.py @@ -5,6 +5,7 @@ import mmh3 import numba as nb import numpy as np +import scipy.sparse as ss from spectrum_utils.spectrum import MsmsSpectrum from ann_solo.config import config @@ -163,7 +164,7 @@ def hash_idx(bin_idx: int, hash_len: int) -> int: return mmh3.hash(str(bin_idx), 42, signed=False) % hash_len -def spectrum_to_vector(spectrum: MsmsSpectrum, min_mz: float, 
max_mz: float, +def _spectrum_to_vector(spectrum: MsmsSpectrum, min_mz: float, max_mz: float, bin_size: float, hash_len: int, norm: bool = True, vector: np.ndarray = None) -> np.ndarray: """ @@ -214,6 +215,57 @@ def spectrum_to_vector(spectrum: MsmsSpectrum, min_mz: float, max_mz: float, return vector +def spectrum_to_vector(spectrum: MsmsSpectrum, transformation: ss.csr_matrix, + min_mz: float, max_mz: float, bin_size: float, dim: int, + norm: bool) -> np.ndarray: + """ + Convert a single spectrum to a dense NumPy vector. + + Peaks are first discretized to mass bins of width `bin_size` starting from + `min_mz`, after which they are transformed using sparse random projections. + + Parameters + ---------- + spectrum : MsmsSpectrum + The spectrum to be converted to a vector. + transformation : ss.csr_matrix + Sparse random projection transformation to convert sparse spectrum + vectors to low-dimensional dense vectors. + min_mz : float + The minimum m/z to include in the vector. + max_mz : float + The maximum m/z to include in the vector. + bin_size : float + The bin size in m/z used to divide the m/z range. + dim : int + The high-resolution vector dimensionality. + norm : bool + Normalize the vector to unit length or not. + Returns + ------- + np.ndarray + The low-dimensional transformed spectrum vector with unit length. 
+ """ + # set the spectrum range between min and max mz + spectrum = spectrum.set_mz_range(min_mz, max_mz) + # Convert a spectrum to a binned sparse vector + data = np.array(spectrum.intensity, dtype=np.float32) + indices = np.array( + [math.floor((mz - min_mz) / bin_size) for mz in spectrum.mz], + dtype=np.int32) + indptr = np.array([0, len(spectrum.mz)], dtype=np.int32) + + # Instantiate the sparse matrix + sparse_vector = ss.csr_matrix( + (data, indices, indptr), (1, dim), np.float32, False) + + # Transform + transformed_vector = (sparse_vector @ transformation).toarray() + if norm: + transformed_vector /= np.linalg.norm(transformed_vector) + + return transformed_vector.ravel() + class SpectrumSpectrumMatch: def __init__(