bigbio · ypriverol · Oct 17, 2022 · Aug 23, 2022 · Aug 26, 2022 · Aug 28, 2022
diff --git a/conda-enviroment.yaml b/conda-enviroment.yaml
@@ -16,4 +16,5 @@ dependencies:
   - ratelimit=2.2.1
   - bioconda::pyteomics
   - pybedtools=0.8.2
+  - matplotlib=3.5.1
   - bioconda::pyopenms
diff --git a/pypgatk/commands/get_subpos.py b/pypgatk/commands/get_subpos.py
@@ -0,0 +1,40 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.subpos.get_subpos import GetSubPos
+from pypgatk.commands.utils import print_help
+
+
+log = logging.getLogger(__name__)
+
+
+@click.command('get_subpos', short_help='Get position in peptide indicate which amino acid is substituted')
+@click.option('-c', '--config_file', help='Configuration file for the get subpos')
+@click.option('-in', '--input_psm_table', help='Input variant peptide PSMs table')
+@click.option('-i', '--input_fasta', help='Protein sequence used')
+@click.option('-o', '--output_psm_table', help='Output variant peptide PSMs table')
+@click.pass_context
+def get_subpos(ctx, config_file, input_psm_table, input_fasta, output_psm_table):
+
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    if input_psm_table is None or input_fasta is None or output_psm_table is None:
+        print_help()
+
+    pipeline_arguments = {}  
+
+    if input_psm_table is not None:
+        pipeline_arguments[GetSubPos.CONFIG_INPUT_PSM_TABLE] = input_psm_table
+    if input_fasta is not None:
+        pipeline_arguments[GetSubPos.CONFIG_INPUT_FASTA] = input_fasta
+    if output_psm_table is not None:
+        pipeline_arguments[GetSubPos.CONFIG_OUTPUT_PSM_TABLE] = output_psm_table
+
+    get_subpos_service = GetSubPos(config_data, pipeline_arguments)
+    get_subpos_service.get_subpos(input_psm_table, input_fasta, output_psm_table)
+
+
diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py
@@ -0,0 +1,44 @@
+import logging
+
+import click
+
+from pypgatk.toolbox.general import read_yaml_from_file
+from pypgatk.spectrumAI.validate_peptides import ValidatePeptidesService
+from pypgatk.commands.utils import print_help
+
+
+log = logging.getLogger(__name__)
+
+
+@click.command('validate_peptides', short_help='Command to inspect MS2 spectra of single-subsititution peptide identifications')
+@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline')
+@click.option('-m', '--mzml_path', help='The mzml file path')
+@click.option('-in', '--infile_name', help='Variant peptide PSMs table')
+@click.option('-o', '--outfile_name', help='Output file for the results')
+@click.option('-i', '--ions_tolerance', help='MS2 fragment ions mass accuracy', default=0.02)
+@click.option('-r', '--relative', help='relative', is_flag=True)
+@click.pass_context
+def validate_peptides(ctx, config_file, mzml_path, infile_name, outfile_name, ions_tolerance, relative):
+
+    config_data = None
+    if config_file is not None:
+        config_data = read_yaml_from_file(config_file)
+
+    if infile_name is None or mzml_path is None or outfile_name is None:
+        print_help()
+
+    pipeline_arguments = {}  
+
+    if mzml_path is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path
+    if infile_name is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_INFILE_NAME] = infile_name
+    if outfile_name is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_OUTFILE_NAME] = outfile_name
+    if ions_tolerance is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance
+
+    validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
+    validate_peptides_service.validate(infile_name,outfile_name,mzml_path ,ions_tolerance, relative)
+
+
diff --git a/pypgatk/spectrumAI/validate_peptides.py b/pypgatk/spectrumAI/validate_peptides.py
@@ -0,0 +1,293 @@
+import datetime
+import os.path
+import pandas as pd
+from matplotlib import pyplot as plt
+from pyopenms import *
+import re
+
+from pypgatk.toolbox.general import ParameterConfiguration
+
+class ValidatePeptidesService(ParameterConfiguration):
+    CONFIG_KEY_VALIDATE_PEPTIDES = 'validate_peptides'
+    CONFIG_MZML_PATH = 'mzml_path'
+    CONFIG_INFILE_NAME = 'infile_name'
+    CONFIG_OUTFILE_NAME = 'outfile_name'
+    CONFIG_IONS_TOLERANCE = 'ions_tolerance'
+    CONFIG_RELATIVE = 'relative'
+
+    def __init__(self, config_data, pipeline_arguments):
+        """
+      Init the class with the specific parameters.
+      :param config_data configuration file
+      :param pipeline_arguments pipelines arguments
+      """
+
+        super(ValidatePeptidesService, self).__init__(self.CONFIG_KEY_VALIDATE_PEPTIDES, config_data, pipeline_arguments)
+
+        if self.CONFIG_MZML_PATH in self.get_pipeline_parameters():
+            self._mzml_path = self.get_pipeline_parameters()[self.CONFIG_MZML_PATH]
+        elif self.CONFIG_KEY_VALIDATE_PEPTIDES in self.get_default_parameters() and \
+                self.CONFIG_MZML_PATH in self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES]:
+            self._mzml_path = self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES][
+                self.CONFIG_MZML_PATH]
+
+        if self.CONFIG_INFILE_NAME in self.get_pipeline_parameters():
+            self._infile_name = self.get_pipeline_parameters()[self.CONFIG_INFILE_NAME]
+        elif self.CONFIG_KEY_VALIDATE_PEPTIDES in self.get_default_parameters() and \
+                self.CONFIG_INFILE_NAME in self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES]:
+            self._infile_name = self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES][
+                self.CONFIG_INFILE_NAME]
+
+        if self.CONFIG_OUTFILE_NAME in self.get_pipeline_parameters():
+            self._outfile_name = self.get_pipeline_parameters()[self.CONFIG_OUTFILE_NAME]
+        elif self.CONFIG_KEY_VALIDATE_PEPTIDES in self.get_default_parameters() and \
+                self.CONFIG_OUTFILE_NAME in self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES]:
+            self._outfile_name = self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES][
+                self.CONFIG_OUTFILE_NAME]
+
+        if self.CONFIG_IONS_TOLERANCE in self.get_pipeline_parameters():
+            self._ions_tolerance = self.get_pipeline_parameters()[self.CONFIG_IONS_TOLERANCE]
+        elif self.CONFIG_KEY_VALIDATE_PEPTIDES in self.get_default_parameters() and \
+                self.CONFIG_IONS_TOLERANCE in self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES]:
+            self._ions_tolerance = self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES][
+                self.CONFIG_IONS_TOLERANCE]
+
+        if self.CONFIG_RELATIVE in self.get_pipeline_parameters():
+            self._relative = self.get_pipeline_parameters()[self.CONFIG_RELATIVE]
+        elif self.CONFIG_KEY_VALIDATE_PEPTIDES in self.get_default_parameters() and \
+                self.CONFIG_RELATIVE in self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES]:
+            self._relative = self.get_default_parameters()[self.CONFIG_KEY_VALIDATE_PEPTIDES][
+                self.CONFIG_RELATIVE]
+
+    def predict_MS2_spectrum(self, Peptide, product_ion_charge = 1):
+        Peptide = re.sub("[-?]", "", Peptide)
+        modification = re.finditer("(\+\d{1,}\.\d{1,})", Peptide)
+        seq = re.sub("[^A-Z]", "", Peptide)
+        size = len(seq)
+
+        a = 0
+        for i in modification:
+            Peptide = Peptide[:i.start() + a] + '[' + Peptide[i.start() + a:i.end() + a] + ']' + Peptide[i.end() + a:]
+            a += 2
+
+        tsg = TheoreticalSpectrumGenerator()
+        spec = MSSpectrum()
+        peptide = AASequence.fromString(Peptide)
+        p = Param()
+        p.setValue("add_metainfo", "true")
+        p.setValue("add_first_prefix_ion", "true")
+        p.setValue("add_precursor_peaks", "true")
+        tsg.setParameters(p)
+        tsg.getSpectrum(spec, peptide, 1, 1)  # charge range 1:1
+
+        b_y_ions = []
+        for i in spec.getStringDataArrays()[0]:
+            b_y_ions.append(i.decode())
+        mz = []
+        for i in spec:
+            mz.append(i.getMZ())
+
+        ions = pd.DataFrame({"mz":mz,"ion":b_y_ions,"z":1})
+
+        ions.loc[2 * size - 2, "ion"] = "b" + str(size)
+        ions = ions.drop(2 * size - 1)
+        ions.loc[2 * size, "ion"] = "y" + str(size)
+
+        ions.loc[:, "ion"] = ions.apply(lambda x: re.sub("[+]", "", x["ion"]), axis=1)
+        ions.loc[:, "pos"] = ions.apply(lambda x: re.sub("[^\d]", "", x["ion"]), axis=1)
+        ions.loc[:, "type"] = ions.apply(lambda x: re.sub("[^a-z]", "", x["ion"]), axis=1)
+
+        proton_mono_mass = 1.007276
+        if product_ion_charge > 1:
+            ions2 = ions.copy()
+            ions2.loc[:, "mz"] = ions2.apply(lambda x: (x["mz"]+proton_mono_mass)/2, axis=1)
+            ions2.loc[:, "z"] = 2
+
+            ions = ions.merge(ions2, how='outer')
+
+        ions = ions.reset_index(drop=True)
+
+        return ions    
+
+    def get_intensity(self, exp_peak,ion_mz):
+        exp_peak.loc[:,"mz_difference"] = exp_peak.apply(lambda x:abs(float(ion_mz) - x["mz"]), axis = 1)
+        min_index=exp_peak["mz_difference"].idxmin()
+        return exp_peak.loc[exp_peak["mz_difference"]==exp_peak["mz_difference"].min()].loc[min_index,"intensity"]
+
+    def match_exp2predicted(self, exp_peak, pred_peak, tolerance, relative):
+        pred_peak.loc[:,"error"] = pred_peak.apply(lambda x:min(abs(float(x["mz"])-exp_peak["mz"])), axis = 1)
+        pred_peak.loc[:,"intensity"] = pred_peak.apply(lambda x:self.get_intensity(exp_peak,x["mz"]), axis = 1)
+        pred_peak.loc[:,"ppm"] = pred_peak.apply(lambda x:round(x["error"]/x["mz"]*1000000,2), axis = 1)
+
+        if relative:
+            match_ions = pred_peak[pred_peak["ppm"] < tolerance]
+        else:
+            match_ions = pred_peak[pred_peak["error"] < tolerance]
+
+        match_ions = match_ions.reset_index(drop=True)
+
+        return match_ions
+
+    def InspectSpectrum(self, DF, mzml_path, tolerance, relative):
+        DF.loc[:, "peptide_length"] = DF.apply(lambda x: len(x["Sequence"]), axis=1)
+
+        DF["status"] = "skiped"
+
+        DF["ions_support"] = "NO"
+        DF["support_ions"] = ""
+        DF["sum.supportions.intensity"] = float(0)
+
+        DF["flanking_ions_support"] = "NO"
+        DF["flanking_ions"] = ""
+        DF["sum.flanking.ions.intensity"] = float(0)
+
+        DF["matched_ions"] = ""
+        DF["sum.matchedions.intensity"] = float(0)
+        DF["sum.fragmentions.intensity"] = float(0)
+        DF["maxintensity"] = float(0)
+        DF["average_intensity"] = float(0)
+        DF["median_intensity"] = float(0)
+
+        Spectra_list = {}
+        for k in range(DF["#SpecFile"].nunique()):
+            Spectra_list[DF["#SpecFile"].unique()[k]] = []
+
+        for i in range(DF.shape[0]):
+            spectra_file = str(DF.loc[i, "#SpecFile"])
+            mzml_file = os.path.join(mzml_path, spectra_file)
+            ScanNum = int(DF.loc[i, "ScanNum"])
+            sub_pos = int(DF.loc[i, "sub_pos"])
+            seq = DF.loc[i, "Sequence"]
+            length = DF.loc[i, "peptide_length"]
+
+            if not Spectra_list[spectra_file]:
+                # 读取质谱文件
+                exp = MSExperiment()
+                MzMLFile().load(mzml_file, exp)
+                Spectra_list[spectra_file].append(exp)
+                look = SpectrumLookup()
+                look.readSpectra(exp, "((?<SCAN>)\d+$)")
+                Spectra_list[spectra_file].append(look)
+            # 通过ScanNum获取peaks
+            index = Spectra_list[spectra_file][1].findByScanNumber(ScanNum)
+            exp_peaks = Spectra_list[spectra_file][0].getSpectrum(index).get_peaks()
+            exp_peaks = pd.DataFrame({"mz": exp_peaks[0], "intensity": exp_peaks[1]})
+
+            predicted_peaks = self.predict_MS2_spectrum(Peptide=str(DF.loc[i, "Peptide"]))
+            match_ions = self.match_exp2predicted(exp_peaks, predicted_peaks, tolerance, relative)
+
+            maxintensity = exp_peaks["intensity"].max()
+            average_intensity = exp_peaks["intensity"].mean()
+            median_intensity = exp_peaks["intensity"].median()
+
+            DF.loc[i, "sum.fragmentions.intensity"] = exp_peaks["intensity"].sum()
+            DF.loc[i, "maxintensity"] = maxintensity
+            DF.loc[i, "average_intensity"] = average_intensity
+            DF.loc[i, "median_intensity"] = median_intensity
+
+            if match_ions.shape[0] == 0:
+                continue
+            DF.loc[i, "matched_ions"] = ','.join(match_ions["ion"].unique().tolist())
+            DF.loc[i, "sum.matchedions.intensity"] = match_ions["intensity"].sum()
+
+            if sub_pos == 0:
+                continue
+            if sub_pos > DF.loc[i, "peptide_length"]:
+                continue
+
+            DF.loc[i, "status"] = "checked"
+            supportions_intensity = 0
+            ions_support = "NO"
+            supportions = ""
+
+            for j in range(match_ions.shape[0]):
+                type = match_ions.loc[j, "type"]
+                pos = int(match_ions.loc[j, "pos"])
+                ion = match_ions.loc[j, "ion"]
+
+                if type == "b" and pos >= sub_pos:
+                    ions_support = "YES"
+                    supportions_intensity = supportions_intensity + match_ions.loc[j, "intensity"]
+                    supportions = supportions + ',' + ion
+                elif type == "y" and pos > length - sub_pos:
+                    ions_support = "YES"
+                    supportions_intensity = supportions_intensity + match_ions.loc[j, "intensity"]
+                    supportions = supportions + ',' + ion
+
+            DF.loc[i, "ions_support"] = ions_support
+            DF.loc[i, "support_ions"] = supportions
+            DF.loc[i, "sum.supportions.intensity"] = supportions_intensity
+
+            # check if it is a noise peak or isotope peak supporting mutant ions
+            if DF.loc[i, "sum.supportions.intensity"] < DF.loc[i, "median_intensity"]:
+                DF.loc[i, "ions_support"] = "NO"
+
+            flanking_ions_support = "NO"
+            n1 = DF.loc[i, "peptide_length"]
+            n2 = sub_pos
+            match_ions_set = set(match_ions["ion"].tolist())
+
+            if n2 == 1:
+                flanking_ions = {"b1", "y" + str(n1 - 1)}
+                flanking_ions = flanking_ions.intersection(match_ions_set)
+                if len(flanking_ions) > 0:
+                    flanking_ions_support = "YES"
+            elif n2 == n1:
+                flanking_ions = {"y1", "b" + str(n1 - 1)}
+                flanking_ions = flanking_ions.intersection(match_ions_set)
+                if len(flanking_ions) > 0:
+                    flanking_ions_support = "YES"
+            else:
+                flanking_ions_left = {"b" + str(n2 - 1), "y" + str(n1 - n2 + 1)}
+                flanking_ions_right = {"b" + str(n2), "y" + str(n1 - n2)}
+
+                flanking_ions_left = flanking_ions_left.intersection(match_ions_set)
+                flanking_ions_right = flanking_ions_right.intersection(match_ions_set)
+
+                flanking_ions = flanking_ions_left.union(flanking_ions_right)
+                if len(flanking_ions_left) > 0 and len(flanking_ions_right) > 0:
+                    flanking_ions_support = "YES"
+
+            DF.loc[i, "flanking_ions_support"] = flanking_ions_support
+            DF.loc[i, "flanking_ions"] = ",".join(flanking_ions)
+            if flanking_ions:
+                DF.loc[i, "sum.flanking.ions.intensity"] = match_ions[match_ions['ion'].str.contains("|".join(flanking_ions))]["intensity"].sum()
+
+            if DF.loc[i, "sum.flanking.ions.intensity"] < DF.loc[i, "median_intensity"]:
+                DF.loc[i, "flanking_ions_support"] = "NO"
+
+            # fragmentation is not preferable at Cterm side of proline, so only require supporting ions
+            if re.search("P", seq[sub_pos - 1:sub_pos]):
+                DF.loc[i, "flanking_ions_support"] = DF.loc[i, "ions_support"]
+
+        return DF
+
+    def validate(self, infile_name, outfile_name, mzml_path, tolerance, relative):
+        start_time = datetime.datetime.now()
+        print("Start time :", start_time)
+        df_psm = pd.read_table(infile_name, header=0, dtype="str", sep="\t")
+        df_output = self.InspectSpectrum(df_psm, mzml_path, tolerance, relative)
+        df_output.to_csv(outfile_name, header=1, sep="\t")
+
+        df_sub = df_output[df_output["status"] == "checked"]
+        saav_psm_passed = df_sub[df_sub["flanking_ions_support"]=="YES"]["PrecursorError(ppm)"]
+        saav_psm_failed = df_sub[df_sub["flanking_ions_support"]=="NO"]["PrecursorError(ppm)"]
+        plot=plt.figure(figsize=(10,7))
+        plot1=plot.add_subplot(1,2,1)
+        plot2=plot.add_subplot(1,2,2)
+        plot1.hist(saav_psm_passed,bins=20)
+        plot1.set_xlabel("PrecursorError(ppm)")
+        plot1.set_title("SpectrumAI curated")
+        plot2.hist(saav_psm_failed,bins=20)
+        plot2.set_xlabel("PrecursorError(ppm)")
+        plot2.set_title("SpectrumAI discarded")
+        plt.savefig("precursorError_histogram.pdf")
+
+        end_time = datetime.datetime.now()
+        print("End time :", end_time)
+        time_taken = end_time - start_time
+        print("Time consumption :", time_taken)
+
+
+
+