RECETOX · hechth · Jul 25, 2024 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/tools/ipapy2/ipapy2_MS1_annotation.xml b/tools/ipapy2/ipapy2_MS1_annotation.xml
@@ -0,0 +1,91 @@
+<tool id="ipapy2_MS1_annotation" name="IPA MS1 annotation" version="@TOOL_VERSION@+galaxy0" profile="21.09">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <!-- Runs the Python script rendered from the configfile below. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_MS1_annotation_cli}'
+    ]]></command>
+
+<configfiles>
+<configfile name="ipapy2_MS1_annotation_cli">
+@init_import@
+import os
+
+## Read both tables keeping empty cells as empty strings, then replace those with None.
+df = pd.read_csv('${MS1_table}', keep_default_na=False)
+df = df.replace('', None)
+allAdds = pd.read_csv('${all_adducts}', keep_default_na=False)
+allAdds = allAdds.replace('', None)
+## Blank optional inputs render as an empty string; fall back to ppm and 2*ppm as stated in the help.
+ppmunk = float('${unknown.ppmunk}' or $ppm)
+ppmthr = float('${optional_settings.ppmthr}' or 2*$ppm)
+## Use one core when GALAXY_SLOTS is not defined in the environment.
+ncores = int(os.environ.get('GALAXY_SLOTS', '1'))
+annotations = ipa.MS1annotation(df, allAdds, ppm=$ppm, ratiosd=$optional_settings.ratiosd, ppmunk=ppmunk, ratiounk=$unknown.ratiounk, ppmthr=ppmthr, pRTNone=$optional_settings.pRTNone, pRTout=$optional_settings.pRTout, ncores=ncores)
+@annotation_flatten@
+annotations_flat.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+        <param label="MS1 table" name="MS1_table" type="data" format="csv" help="pandas dataframe containing the MS1 data." />
+        <param label="all possible adducts table" name="all_adducts" type="data" format="csv" help="pandas dataframe containing the information on all the possible adducts given the database." />
+        <param label="ppm" name="ppm" type="float" help="accuracy of the MS instrument used."/>
+        <section name="unknown" title="unknown settings">
+            <param name="ppmunk" type="float" optional="true">
+                <label>ppm for unknown</label>
+                <help>ppm associated to the 'unknown' annotation. If not provided equal to ppm.</help>
+            </param>
+            <param name="ratiounk" type="float" value="0.5">
+                <label>isotope ratio for unknown</label>
+                <help>isotope ratio associated to the 'unknown' annotation.</help>
+            </param>
+        </section>
+        <section name="optional_settings" title="optional settings">
+            <param name="ratiosd" type="float" value="0.9">
+                <label>intensity ratio</label>
+                <help>acceptable ratio between predicted intensity and observed intensity of isotopes</help>
+            </param>
+            <param name="ppmthr" type="float" optional="true">
+                <label>ppm threshold</label>
+                <help>maximum ppm possible for the annotations. if not provided equal to 2*ppm.</help>
+            </param>
+            <param name="pRTNone" type="float" value="0.8">
+                <label>no RT factor</label>
+                <help>multiplicative factor for the RT if no RTrange present in the database.</help>
+            </param>
+            <param name="pRTout" type="float" value="0.4">
+                <label>outside RT factor</label>
+                <help>multiplicative factor for the RT if measured RT is outside the RTrange present in the database.</help>
+            </param>
+        </section>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output" format="csv"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="MS1_table" value="MS1_data.csv"/>
+            <param name="all_adducts" value="all_adducts.csv"/>
+            <param name="ppm" value="3"/>
+            <output name="output" file="annotations.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        Annotation of the dataset based on the MS1 information. Prior probabilities
+        are based on mass only, while post probabilities are based on mass, RT,
+        previous knowledge and isotope patterns.
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ipapy2/ipapy2_MS2_annotation.xml b/tools/ipapy2/ipapy2_MS2_annotation.xml
@@ -0,0 +1,113 @@
+<tool id="ipapy2_MS2_annotation" name="IPA MS2 annotation" version="@TOOL_VERSION@+galaxy0" profile="21.05">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <!-- Runs the Python script rendered from the configfile below. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_MS2_annotation_cli}'
+    ]]></command>
+
+<configfiles>
+<configfile name="ipapy2_MS2_annotation_cli">
+@init_import@
+import os
+
+## Read all tables keeping empty cells as empty strings, then replace those with None.
+df = pd.read_csv('${MS1_table}', keep_default_na=False)
+df = df.replace('', None)
+dfMS2 = pd.read_csv('${MS2_table}', keep_default_na=False)
+dfMS2 = dfMS2.replace('', None)
+allAdds = pd.read_csv('${all_adducts}', keep_default_na=False)
+allAdds = allAdds.replace('', None)
+DBMS2 = pd.read_csv('${DBMS2_table}', keep_default_na=False)
+DBMS2 = DBMS2.replace('', None)
+## Blank optional inputs render as an empty string; fall back to ppm and 2*ppm as stated in the help.
+ppmunk = float('${unknown.ppmunk}' or $ppm)
+ppmthr = float('${optional_settings.ppmthr}' or 2*$ppm)
+## Use one core when GALAXY_SLOTS is not defined in the environment.
+ncores = int(os.environ.get('GALAXY_SLOTS', '1'))
+annotations = ipa.MSMSannotation(df, dfMS2, allAdds, DBMS2, ppm=$ppm, ratiosd=$optional_settings.ratiosd, ppmunk=ppmunk, ratiounk=$unknown.ratiounk, ppmthr=ppmthr, pRTNone=$optional_settings.pRTNone, pRTout=$optional_settings.pRTout, mzdCS=$optional_settings.mzdCS, ppmCS=$optional_settings.ppmCS, CSunk=$unknown.CSunk, evfilt=$optional_settings.evfilt, ncores=ncores)
+@annotation_flatten@
+annotations_flat.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+        <param label="MS1 table" name="MS1_table" type="data" format="csv" help="pandas dataframe containing the MS1 data." />
+        <param label="MS2 table" name="MS2_table" type="data" format="csv" help="pandas dataframe containing the MS2 data." />
+        <param label="all possible adducts table" name="all_adducts" type="data" format="csv" help="pandas dataframe containing the information on all the possible adducts given the database." />
+        <param label="DBMS2 table" name="DBMS2_table" type="data" format="csv" help="pandas dataframe containing the database containing the MS2 information." />
+        <param label="ppm" name="ppm" type="float" help="accuracy of the MS instrument used."/>
+        <section name="unknown" title="unknown settings">
+            <param name="ppmunk" type="float" optional="true">
+                <label>ppm for unknown</label>
+                <help>ppm associated to the 'unknown' annotation. If not provided equal to ppm.</help>
+            </param>
+            <param name="ratiounk" type="float" value="0.5">
+                <label>isotope ratio for unknown</label>
+                <help>isotope ratio associated to the 'unknown' annotation. Default 0.5.</help>
+            </param>
+            <param name="CSunk" type="float" value="0.7">
+                <label>cosine similarity for unknown</label>
+                <help>cosine similarity score associated with the 'unknown' annotation. Default 0.7.</help>
+            </param>
+        </section>
+
+        <section name="optional_settings" title="optional settings">
+            <param name="ratiosd" type="float" value="0.9" label="intensity ratio">
+                <help>acceptable ratio between predicted intensity and observed intensity of isotopes</help>
+            </param>
+            <param name="ppmthr" type="float" optional="true" label="ppm threshold">
+                <help>maximum ppm possible for the annotations. if not provided equal to 2*ppm.</help>
+            </param>
+            <param name="pRTNone" type="float" value="0.8" label="no RT factor">
+                <help>multiplicative factor for the RT if no RTrange present in the database.</help>
+            </param>
+            <param name="pRTout" type="float" value="0.4" label="outside RT factor">
+                <help>multiplicative factor for the RT if measured RT is outside the RTrange present in the database.</help>
+            </param>
+            <param name="mzdCS" type="float" value="0" label="MS2 mz threshold">
+                <help>maximum mz difference allowed when computing cosine similarity scores. 
+                If one wants to use this parameter instead of ppmCS, this must be set to 0. Default 0.</help>
+            </param>
+            <param name="ppmCS" type="float" value="10" label="maximum ppm for cosine similarity scores">
+                <help>maximum ppm allowed when computing cosine similarity scores. 
+                If one wants to use this parameter instead of mzdCS, this must be set to 0. Default 10.</help>
+            </param>
+            <param name="evfilt" type="select" label="same collision energy">
+                <help>If true, only spectrum acquired with the same collision energy are considered. Default value False.</help>
+                <option value="False">False</option>
+                <option value="True">True</option>
+            </param>
+        </section>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output" format="csv" help="a dictionary containing all the possible annotations for the measured features."/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="MS1_table" value="MS1_data.csv"/>
+            <param name="MS2_table" value="MS2_data.csv"/>
+            <param name="all_adducts" value="all_adducts.csv"/>
+            <param name="DBMS2_table" value="MS2_DB.csv"/>
+            <param name="ppm" value="3"/>
+            <output name="output" file="annotations.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        Annotation of the dataset based on the MS1 and MS2 information. Prior
+        probabilities are based on mass only, while post probabilities are based
+        on mass, RT, previous knowledge and isotope patterns.
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ipapy2/ipapy2_all_adducts.xml b/tools/ipapy2/ipapy2_all_adducts.xml
@@ -0,0 +1,54 @@
+<tool id="ipapy2_all_adducts" name="IPA compute all adducts" version="@TOOL_VERSION@+galaxy0" profile="21.05">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
+    </requirements>
+
+    <!-- Runs the Python script rendered from the configfile below. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${ipapy2_all_adducts_cli}'
+    ]]></command>
+
+<configfiles>
+<configfile name="ipapy2_all_adducts_cli">
+@init_import@
+import os
+
+adducts_table = pd.read_csv('${adducts_table}')
+DB_table = pd.read_csv('${DB_table}')
+## Use one core when GALAXY_SLOTS is not defined in the environment.
+ncores = int(os.environ.get('GALAXY_SLOTS', '1'))
+all_adducts = ipa.compute_all_adducts(adducts_table, DB_table, ionisation=int(${ionisation}), ncores=ncores)
+all_adducts.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
+    <inputs>
+        <param label="Adducts table" name="adducts_table" type="data" format="csv" help="Dataframe containing information on all possible adducts."/>
+        <param label="DB table" name="DB_table" type="data" format="csv" help="Dataframe containing a database."/>
+        <expand macro="ionisation"/>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output" format="csv"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="adducts_table" value="adducts.csv"/>
+            <param name="DB_table" value="DB.csv"/>
+            <param name="ionisation" value="1"/>
+            <output name="output" file="all_adducts.csv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    ::
+        compute all adducts table based on the information present in the database
+    ]]></help>
+
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/ipapy2/ipapy2_clustering.py b/tools/ipapy2/ipapy2_clustering.py
diff --git a/tools/ipapy2/ipapy2_clustering.xml b/tools/ipapy2/ipapy2_clustering.xml
@@ -1,19 +1,45 @@
-<tool id="ipapy2_clustering" name="ipaPy2 clustering" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5" profile="21.05">
+<tool id="ipapy2_clustering" name="IPA clustering" version="@TOOL_VERSION@+galaxy0" profile="21.05">
     <macros>
         <import>macros.xml</import>
     </macros>
 
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">ipapy2</requirement>
-        <requirement type="package" version="8.0.1">click</requirement>
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
-        python3 ${__tool_directory__}/ipapy2_clustering.py --i '${intensity_table}' --o '${output}'
+        python3 '${ipapy2_clustering_cli}'
     ]]></command>
 
+<configfiles>
+<configfile name="ipapy2_clustering_cli">
+@init_import@
+
+intensity_table = pd.read_csv('${intensity_table}')
+## The settings live in the 'clustering' section, hence the section prefix on each placeholder.
+result = ipa.clusterFeatures(intensity_table, Cthr=$clustering.Cthr, RTwin=$clustering.RTwin, Intmode='${clustering.Intmode}')
+result.to_csv('${output}', index=False)
+</configfile>
+</configfiles>
+
     <inputs>
-        <param label="Intensity table" name="intensity_table" type="data" format="csv" help="Mass spectral library file." />
+        <param label="Intensity table" name="intensity_table" type="data" format="csv" help="a dataframe containing the measured intensities across several samples." />
+        <section name="clustering" title="clustering settings">
+            <param name="Cthr" type="float" value="0.8">
+                <label>correlation threshold</label>
+                <help>Minimum correlation allowed in each cluster. Default value 0.8.</help>
+            </param>
+            <param name="RTwin" type="float" value="1">
+                <label>RT threshold</label>
+                <help>Maximum difference in RT time between features in the same cluster. Default value 1.</help>
+            </param>
+            <param name="Intmode" type="select">
+                <label>intensity mode</label>
+                <help>intensity mode: 'max' (default) or 'ave'.</help>
+                <option value="max" selected="true">max</option>
+                <option value="ave">ave</option>
+            </param>
+        </section>
     </inputs>
 
     <outputs>
@@ -22,16 +48,16 @@
 
     <tests>
         <test>
-            <param name="intensity_table" value="minimal_input.csv"/>
-            <output name="output" file="clustering.csv"/>
+            <param name="intensity_table" value="unclustered.csv"/>
+            <output name="output" file="clustered features.csv"/>
         </test>
     </tests>
 
     <help><![CDATA[
     Before using the ipaPy2 package, the processed data coming from an untargeted metabolomics experiment must be properly prepared.
     The data must be organized in a pandas dataframe containing the following columns:
 
     - **ids**: an unique numeric id for each mass spectrometry feature
     - **rel.ids**: relation ids. Features must be clustered based on correlation/peak shape/retention time. Features in the same cluster are likely to come from the same metabolite.
     - **mzs**: mass-to-charge ratios, usually the average across different samples.
     - **RTs**: retention times in seconds, usually the average across different samples.
@@ -53,8 +79,6 @@
     | 3  | 300  | 30  | 1100        | 1200        | 1300        |
     +----+------+-----+-------------+-------------+-------------+
     ]]></help>
-    <citations>
-        <citation type="doi">10.1021/acs.analchem.9b02354</citation>
-        <citation type="doi">10.1093/bioinformatics/btad455</citation>
-    </citations>
+
+    <expand macro="citations"/>
 </tool>
diff --git a/tools/ipapy2/ipapy2_compute_bio.py b/tools/ipapy2/ipapy2_compute_bio.py
@@ -0,0 +1,41 @@
+import argparse
+import sys
+import os
+import pandas as pd
+from ipaPy2 import ipa
+
+
+def main(argv):
+    parser = argparse.ArgumentParser(description="cluster features before IPA pipeline.")
+    parser.add_argument("--DB_MS1", type=str, required=True, help="a dataframe containing the measured intensities across several samples.")
+    parser.add_argument("--annotations", type=str, help="Default value 0.8. Minimum correlation allowed in each cluster.")
+    parser.add_argument("--biochemical_mode", type=str, required=True, help="Default value 1. Maximum difference in RT time between features in the same cluster.")
+    parser.add_argument("--connection_list", type=str, help="intensity mode. Default 'max' or 'ave'.")
+    parser.add_argument("--bio_out", type=str, required=True, help="a dataframe of clustered features.")
+    args = parser.parse_args()
+
+    DB_MS1 = pd.read_csv(args.DB_MS1)
+    DB_MS1 = DB_MS1.replace('', None)
+
+    if args.annotations:
+        annotations_df = pd.read_csv(args.annotations, keep_default_na=False)
+        annotations_df = annotations_df.replace('', None)
+        annotations = {}
+        keys = set(annotations_df["peak_id"])
+        for i in keys:
+            annotations[i] = annotations_df[annotations_df["peak_id"] == i].drop('peak_id', axis=1)
+    else:
+        annotations = None
+
+    if args.biochemical_mode == "connections":
+        if args.connection_list:
+            connections = args.connection_list
+        else:
+            connections = ['C3H5NO', 'C6H12N4O', 'C4H6N2O2', 'C4H5NO3', 'C3H5NOS', 'C6H10N2O3S2', 'C5H7NO3', 'C5H8N2O2', 'C2H3NO', 'C6H7N3O', 'C6H11NO', 'C6H11NO', 'C6H12N2O', 'C5H9NOS', 'C9H9NO', 'C5H7NO', 'C3H5NO2', 'C4H7NO2', 'C11H10N2O', 'C9H9NO2', 'C5H9NO', 'C4H4O2', 'C3H5O', 'C10H12N5O6P', 'C10H15N2O3S', 'C10H14N2O2S', 'CH2ON', 'C21H34N7O16P3S', 'C21H33N7O15P3S', 'C10H15N3O5S', 'C5H7', 'C3H2O3', 'C16H30O', 'C8H8NO5P', 'CH3N2O', 'C5H4N5', 'C10H11N5O3', 'C10H13N5O9P2', 'C10H12N5O6P', 'C9H13N3O10P2', 'C9H12N3O7P', 'C4H4N3O', 'C10H13N5O10P2', 'C10H12N5O7P', 'C5H4N5O', 'C10H11N5O4', 'C10H14N2O10P2', 'C10H12N2O4', 'C5H5N2O2', 'C10H13N2O7P', 'C9H12N2O11P2', 'C9H11N2O8P', 'C4H3N2O2', 'C9H10N2O5', 'C2H3O2', 'C2H2O', 'C2H2', 'CO2', 'CHO2', 'H2O', 'H3O6P2', 'C2H4', 'CO', 'C2O2', 'H2', 'O', 'P', 'C2H2O', 'CH2', 'HPO3', 'NH2', 'PP', 'NH', 'SO3', 'N', 'C6H10O5', 'C6H10O6', 'C5H8O4', 'C12H20O11', 'C6H11O8P', 'C6H8O6', 'C6H10O5', 'C18H30O15']
+
+    Bio = ipa.Compute_Bio(DB_MS1, annotations=annotations, mode=args.biochemical_mode, connections=connections, ncores=int(os.environ.get('GALAXY_SLOTS')))
+    Bio.to_csv(args.bio_out, index=False)
+
+
+if __name__ == '__main__':
+    main(argv=sys.argv[1:])