galaxyproject · bgruening · Mar 24, 2024 · Feb 20, 2024 · Feb 20, 2024 · Feb 20, 2024
diff --git a/tools/colabfold/.shed.yml b/tools/colabfold/.shed.yml
@@ -0,0 +1,13 @@
+categories:
+- Proteomics
+- Graphics
+description: Protein prediction based on AlphaFold2
+homepage_url: https://github.com/sokrypton/ColabFold
+long_description: |
+  ColabFold offers accelerated (40-60x faster) prediction of protein structures
+  and complexes by combining the fast homology search of MMseqs2
+  with AlphaFold2 or RoseTTAFold.
+name: suite_colabfold
+owner: iuc
+remote_repository_url: https://github.com/sokrypton/ColabFold
+type: repository_suite_definition 
diff --git a/tools/colabfold/colabfold_alphafold.xml b/tools/colabfold/colabfold_alphafold.xml
@@ -0,0 +1,189 @@
+<tool id="colabfold_alphafold" name="colabfold_alphafold" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <!-- Short summary of the tool. -->
+    <description>This tool runs the alphafold step of the colabfold tool in Galaxy</description>
+    <macros>
+        <!-- @VERSION@ and @CUDA_VERSION@ build the container tag below; together with
+             @VERSION_SUFFIX@ they also form the tool version attribute. -->
+        <token name="@VERSION@">1.5.3</token>
+        <token name="@CUDA_VERSION@">12.2.2</token>
+        <token name="@VERSION_SUFFIX@">1</token>
+    </macros>
+    <requirements>
+        <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+    ## Unpack the input tar archive into input_data/, run colabfold_batch on it,
+    ## then sort the results into per-type sub-directories of output/ from which
+    ## the outputs section discovers its collections.
+    ln -s '$input' input.tar &&
+    mkdir input_data &&
+    tar -xmf input.tar --strip-components 1 -C input_data &&
+    mkdir output &&
+    colabfold_batch
+    #if $num_recycles.set_num_recycles == "manual":
+        --num-recycle $num_recycles.num_recycles
+    #end if
+    #if $recycle_tolerance.set_early_stop == "manual":
+        --recycle-early-stop-tolerance $recycle_tolerance.recycle_early_stop_tolerance
+    #end if
+    ## Optional integers are compared as strings so that an explicit 0 is still
+    ## passed on instead of being treated like an unset value.
+    #if str($num_ensemble) != '':
+        --num-ensemble $num_ensemble
+    #end if
+    #if str($random_seed) != '':
+        --random-seed $random_seed
+    #end if
+    #if str($num_seeds) != '':
+        --num-seeds $num_seeds
+    #end if
+    #if $num_models:
+        --num-models $num_models
+    #end if
+    $use_dropout
+    #if $max_msa:
+        --max-msa $max_msa
+    #end if
+    #if $amber.use_amber == "yes":
+        --amber
+        --num-relax $amber.num_relaxed
+    #end if
+    $output_options.save_all
+    $output_options.save_recycles
+    $output_options.save_single_representations
+    $output_options.save_pair_representations
+    input_data
+    output
+    && cd output
+    ## Give the a3m file a fixed name so it can be picked up via from_work_dir.
+    && mv ./*.a3m output.a3m
+    && mkdir png_out
+    && mkdir json_out
+    && mkdir pdb_out
+    && mv ./*.png png_out
+    && mv ./*.json json_out
+    && mv ./*.pdb pdb_out
+    ## config.json is moved back out so it does not end up in the JSON collection.
+    && mv json_out/config.json .
+    #if $output_options.save_all:
+        && mkdir pickle_out
+        && mv ./*.pickle pickle_out
+    #end if
+    ## #if  $output_options.save_pair_representations or $output_options.save_single_representations:
+    ##     && mkdir npy_out
+    ##     && mv ./*.npy npy_out
+    ## #end if
+
+    ]]></command>
+    <inputs>
+        <!-- Tar archive that the command unpacks into input_data/ before running the prediction. -->
+        <param name="input" type="data" format="tar" label="Tar archive output from the colabfold msa tool"/>
+        <conditional name="num_recycles">
+            <param name="set_num_recycles" label="Set number of recycles" type="select" help="If set automatically, 20 recycles are used when the selected model type is alphafold2_multimer_v3 and 3 otherwise.">
+                <option value="auto">Set automatically</option>
+                <option value="manual">Set manually</option>
+            </param>
+            <when value="auto"/>
+            <when value="manual">
+                <param name="num_recycles" label="How many recycles to run?" type="integer" value="3" min="0"/>
+            </when>
+        </conditional>
+        <conditional name="recycle_tolerance">
+            <param name="set_early_stop" label="Set early stop tolerance" type="select" help="If set automatically, the tolerance is 0.5 when the selected model type is alphafold2_multimer_v3 and 0.0 otherwise.">
+                <option value="auto">Set automatically</option>
+                <option value="manual">Set manually</option>
+            </param>
+            <when value="auto"/>
+            <when value="manual">
+                <param argument="--recycle-early-stop-tolerance" type="float" value="0.5" min="0.0" max="1.0" help="Specify convergence criteria. Run recycles until the distance between recycles is within the given tolerance value."/>
+            </when>
+        </conditional>
+        <!-- The three optional integers below are only added to the command line when a value is given. -->
+        <param argument="--num-ensemble" label="Number of ensembles" type="integer" min="1" optional="true" help="Number of ensembles. The trunk of the network is run multiple times with different random choices for the MSA cluster centers. This can result in a better prediction at the cost of longer runtime."/>
+        <param argument="--random-seed" label="Set seed" type="integer" min="0" optional="true"/>
+        <param argument="--num-seeds" label="Number of seeds" type="integer" min="0" optional="true" help="Number of seeds to try, iterated based on the random seed"/>
+        <param argument="--num-models" label="Number of models to use for structure prediction" type="select" help="Reducing the number of models speeds up the prediction but results in lower quality">
+            <option value="1">1</option>
+            <option value="2">2</option>
+            <option value="3">3</option>
+            <option value="4">4</option>
+            <option value="5">5</option>
+        </param>
+        <param name="max_msa" label="Max msa" type="select" help="Enable dropouts and increase the number of seeds to sample predictions from the uncertainty of the model. Decrease max msa to increase uncertainty.">
+            <!-- <option value="auto">auto</option> -->
+            <option value="512:1024">512:1024</option>
+            <option value="256:512">256:512</option>
+            <option value="64:128">64:128</option>
+            <option value="32:64">32:64</option>
+            <option value="16:32">16:32</option>
+        </param>
+        <param argument="--use-dropout" label="Use dropouts" type="boolean" truevalue="--use-dropout" falsevalue="" help="Activate dropouts during inference to sample from the uncertainty of the models."/>
+        <conditional name="amber">
+            <param name="use_amber" label="Use amber" type="select" help="Use amber for structure refinement">
+                <option value="yes">Use amber</option>
+                <option value="no">Don't use Amber</option>
+            </param>
+            <when value="no"/>
+            <when value="yes">
+                <param name="num_relaxed" label="How many top ranked structures to relax using Amber" type="integer" min="0" value="0"/>
+            </when>
+        </conditional>
+        <!-- Add for second version of tool for batch jobs -->
+        <!-- <param name="stop_at" label="Stop score" type="float" min="0.0" optional="true" help="Compute models until pLDDT (single chain) or pTM-score (multimer) > threshold is reached. This speeds up prediction by running less models for easier queries."/> -->
+        <!-- Each boolean below expands to its command line flag when enabled and to nothing otherwise. -->
+        <section name="output_options" title="Output Options">
+            <param argument="--save-all" type="boolean" label="Save raw outputs from model to a pickle file" truevalue="--save-all" falsevalue=""/>
+            <param argument="--save-recycles" type="boolean" label="Save all intermediate predictions at each recycle iteration" truevalue="--save-recycles" falsevalue=""/>
+            <param argument="--save-single-representations" type="boolean" label="Save the single representation embeddings of all models." truevalue="--save-single-representations" falsevalue=""/>
+            <param argument="--save-pair-representations" type="boolean" label="Save the pair representation embeddings of all models." truevalue="--save-pair-representations" falsevalue=""/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Each collection is discovered from the sub-directory of output/ that the command fills. -->
+        <collection name="png_files" type="list" format="png" label="${tool.name} on ${on_string}: Figures">
+            <discover_datasets format="png" pattern="__name_and_ext__" directory="output/png_out"/>
+        </collection>
+        <collection name="json_files" type="list" format="json" label="${tool.name} on ${on_string}: JSON predictions">
+            <discover_datasets format="json" pattern="__name_and_ext__" directory="output/json_out"/>
+        </collection>
+        <collection name="pdb" type="list" format="pdb" label="${tool.name} on ${on_string}: PDB predictions">
+            <discover_datasets format="pdb" pattern="__name_and_ext__" directory="output/pdb_out"/>
+        </collection>
+        <collection name="pickle" type="list" format="pickle" label="${tool.name} on ${on_string}: Pickle file outputs">
+            <discover_datasets format="pickle" pattern="__name_and_ext__" directory="output/pickle_out"/>
+            <!-- Filter expressions see boolean parameters as Python booleans, not as their
+                 truevalue string, so the parameter is tested directly. -->
+            <filter>output_options['save_all']</filter>
+        </collection>
+        <!-- <collection name="npy" type="list" format="npy" label="${tool.name} on ${on_string}: Numpy embeddings">
+            <discover_datasets format="npy" pattern="__name_and_ext__" directory="output/npy_out"/>
+            <filter>output_options['save_single_representations'] or output_options['save_pair_representations']</filter>
+        </collection> -->
+        <!-- The command renames the a3m file in output/ to output.a3m so it can be collected here. -->
+        <data name="a3m_out" format="a3m" from_work_dir="output/output.a3m" label="${tool.name} on ${on_string}: a3m file"/>
+    </outputs>
+    <tests>
+        <!-- Sets the recycle count and early stop tolerance manually and enables every
+             optional flag. Only the rendered command line is asserted; no output
+             content is checked. -->
+        <test>
+            <param name="input" value="input.tar"/>
+            <conditional name="num_recycles">
+                <param name="set_num_recycles" value="manual"/>
+                <param name="num_recycles" value="4"/>
+            </conditional>
+            <conditional name="recycle_tolerance">
+                <param name="set_early_stop" value="manual"/>
+                <param name="recycle_early_stop_tolerance" value="0.4"/>
+            </conditional>
+            <param name="num_ensemble" value="1"/>
+            <param name="random_seed" value="43"/>
+            <param name="num_seeds" value="2"/>
+            <param name="num_models" value="2"/>
+            <param name="max_msa" value="64:128"/>
+            <param name="use_dropout" value="--use-dropout"/>
+            <conditional name="amber">
+                <param name="use_amber" value="yes"/>
+                <param name="num_relaxed" value="0"/>
+            </conditional>
+            <section name="output_options">
+                <param name="save_all" value="--save-all"/>
+                <param name="save_recycles" value="--save-recycles"/>
+                <param name="save_single_representations" value="--save-single-representations"/>
+                <param name="save_pair_representations" value="--save-pair-representations"/>
+            </section>
+            <!-- The three fragments follow the order in which the command template emits the options. -->
+            <assert_command>
+                <has_text text="colabfold_batch --num-recycle 4 --recycle-early-stop-tolerance 0.4 --num-ensemble 1 --random-seed 43"/>
+                <has_text text="--num-seeds 2 --num-models 2 --use-dropout --max-msa 64:128 --amber --num-relax 0"/>
+                <has_text text="--save-all --save-recycles --save-single-representations --save-pair-representations"/>
+            </assert_command>
+        </test>
+    </tests>
+    <help><![CDATA[
+        Run the folding (structure prediction) step of ColabFold on the output of the colabfold msa tool.
+
+        **Input**
+
+        The tar archive produced by the colabfold msa tool.
+
+        **Outputs**
+
+        - Figures (PNG)
+        - JSON predictions
+        - PDB predictions
+        - Pickle files (only when "Save raw outputs from model to a pickle file" is enabled)
+        - a3m file
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/s41592-022-01488-1</citation>
+    </citations>
+</tool>
diff --git a/tools/colabfold/colabfold_msa.xml b/tools/colabfold/colabfold_msa.xml
@@ -0,0 +1,127 @@
+<tool id="colabfold_msa" name="colabfold_msa" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
+    <!-- Short summary of the tool. -->
+    <description>This tool runs the MSA step of the colabfold tool in Galaxy</description>
+    <macros>
+        <!-- @VERSION@ and @CUDA_VERSION@ build the container tag below; together with
+             @VERSION_SUFFIX@ they also form the tool version attribute. -->
+        <token name="@VERSION@">1.5.3</token>
+        <token name="@CUDA_VERSION@">12.2.2</token>
+        <token name="@VERSION_SUFFIX@">1</token>
+        <!-- Optional MSA mode selection; expanded inside the FASTA branch of the query_type conditional. -->
+        <xml name="db_selector">
+            <conditional name="select_db">
+                <param name="use_db" type="select" label="Manually set database?">
+                    <option value="yes">Yes</option>
+                    <option value="no">No</option>
+                </param>
+                <when value="no"/>
+                <when value="yes">
+                    <param name="msa_mode" label="MSA mode" type="select">
+                        <option value="mmseqs2_uniref_env">mmseqs2_uniref_env</option>
+                        <option value="mmseqs2_uniref">mmseqs2_uniref</option>
+                        <option value="single_sequence">Use single sequence input</option>
+                        <!-- <option value="custom">custom</option> -->
+                    </param>
+                </when>
+            </conditional>
+        </xml>
+    </macros>
+    <requirements>
+        <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+    #import re
+    ## Link any custom templates into template_dir/. The element identifier is
+    ## sanitised with the same pattern as the query name below, so unusual
+    ## characters cannot break the quoted shell argument.
+    #if $custom_template:
+        mkdir template_dir &&
+        #for $file in $custom_template:
+            #set template_name = re.sub('[^\w\-_\.]', '_', str($file.element_identifier)) + '.pdb'
+            ln -s '$file' "template_dir/$template_name" &&
+        #end for
+    #end if
+
+    ## Link the query into the working directory under a sanitised name that
+    ## carries the extension matching the selected input type.
+    #if $query_type.select_query_type == "fasta":
+        #set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + '.fasta'
+        ln -s '$query_type.input' "$input_file" &&
+    #end if
+    #if $query_type.select_query_type == "a3m":
+        #set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + '.a3m'
+        ln -s '$query_type.input' "$input_file" &&
+    #end if
+    mkdir output &&
+    colabfold_batch --msa-only
+    ## The MSA mode selector only exists in the FASTA branch of the conditional.
+    #if $query_type.select_query_type == "fasta":
+        #if $query_type.select_db.use_db == "yes":
+            --msa-mode $query_type.select_db.msa_mode
+        #end if
+    #end if
+    --pair-mode $pair_mode
+    ## --pair-strategy $pairing_strategy
+    $templates
+    #if $custom_template:
+        --custom-template-path template_dir
+    #end if
+    "$input_file"
+    output &&
+    ## Pack the whole output directory into a single tar archive for the tool output.
+    tar -cf output.tar output
+    ]]></command>
+    <inputs>
+        <!-- The query is either a FASTA file (with optional MSA mode selection) or an a3m file. -->
+        <conditional name="query_type">
+            <param name="select_query_type" label="Data input method" type="select">
+                <option value="fasta">FASTA file</option>
+                <option value="a3m">a3m file</option>
+            </param>
+            <when value="fasta">
+                <param name="input" type="data" format="fasta" label="Query sequence fasta"/>
+                <expand macro="db_selector"/>
+            </when>
+            <when value="a3m">
+                <param name="input" type="data" format="a3m" label="Query sequence"/>
+            </when>
+        </conditional>
+        <param name="pair_mode" label="Pair mode" type="select">
+            <option value="unpaired_paired">Attempt to pair sequences from the same operon within the genome</option>
+            <option value="paired">Only use sequences that were successfully paired</option>
+            <option value="unpaired">Generate separate MSA for each protein</option>
+        </param>
+        <!-- Non functional in current release of colabfold, planned to expose in next one -->
+        <!-- <param name="pairing_strategy" label="Pairing strategy" type="select">
+            <option value="greedy">Greedy: MSA sequences should only be paired if the same species exist in at least two MSAs </option>
+            <option value="complete">Complete: MSA sequences should only be paired if the same species exist in all MSAs</option>
+        </param> -->
+        <param argument="--templates" label="Query PDB templates from the MSA server" type="boolean" truevalue="--templates" falsevalue=""/>
+        <!-- Each selected file is linked into template_dir/ by the command. -->
+        <param name="custom_template" label="List of PDB files to provide to the prediction as custom templates" type="data" format="pdb" multiple="true" optional="true"/>
+    </inputs>
+    <outputs>
+        <!-- Tar archive of the output directory, created by the last step of the command. -->
+        <data name="output"
+              format="tar"
+              from_work_dir="output.tar"
+              label="${tool.name} on ${on_string}: tar file"/>
+    </outputs>
+    <tests>
+        <!-- FASTA query with an explicitly selected MSA mode; pair_mode is left at its
+             default. Only the rendered command line and the output count are checked. -->
+        <test expect_num_outputs="1">
+            <conditional name="query_type">
+                <param name="select_query_type" value="fasta"/>
+                <param name="input" value="test.fasta"/>
+                <conditional name="select_db">
+                    <param name="use_db" value="yes"/>
+                    <param name="msa_mode" value="mmseqs2_uniref"/>
+                </conditional>
+            </conditional>
+            <assert_command>
+                <has_text text="colabfold_batch --msa-only --msa-mode mmseqs2_uniref --pair-mode unpaired_paired"/>
+            </assert_command>
+        </test>
+        <!-- a3m query with server templates enabled and two custom template PDB files.
+             Only the rendered command line and the output count are checked. -->
+        <test expect_num_outputs="1">
+            <conditional name="query_type">
+                <param name="select_query_type" value="a3m"/>
+                <param name="input" value="test.a3m"/>
+            </conditional>
+            <param name="pair_mode" value="paired"/>
+            <param name="templates" value="--templates"/>
+            <param name="custom_template" value="test.pdb,test_2.pdb"/>
+            <assert_command>
+                <has_text text="colabfold_batch --msa-only --pair-mode paired --templates --custom-template-path template_dir"/>
+            </assert_command>        
+        </test>
+    </tests>
+    <help><![CDATA[
+        Generate MSAs for the alphafold step of Colabfold.
+
+        **Input**
+
+        A query in FASTA or a3m format, optionally accompanied by PDB files that are passed on as custom templates.
+
+        **Output**
+
+        A tar archive of the output directory, to be used as input for the colabfold alphafold tool.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/s41592-022-01488-1</citation>
+    </citations>
+</tool>
diff --git a/tools/colabfold/test-data/input.tar b/tools/colabfold/test-data/input.tar
diff --git a/tools/colabfold/test-data/test.a3m b/tools/colabfold/test-data/test.a3m
@@ -0,0 +1,39 @@
+#38	1
+>101
+MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
+>UniRef100_N6VR80	62	0.971	1.153E-09	3	37	38	17	51	52
+---IKRSSRRWKKKGRMRWKWYKKRLRRLKRERRRARS
+>UniRef100_A0A534JJZ5	55	0.722	4.940E-07	3	37	38	0	35	80
+---MKRSSRAWKKRGKMRWKWRKKRMRRRKREQKlRART
+>UniRef100_A0A8T5HQN6	54	0.647	9.355E-07	3	36	38	14	47	48
+---MKRSSRRWKKKGQMRWKWQRKRMKKEKRKRAKSR-
+>UniRef100_A0A2E4RG04	53	0.666	1.772E-06	3	35	38	36	68	71
+---MKRGSRAWKKQGNQRWKWRKKKLRRRKASRKRA--
+>UniRef100_R1E4G0	53	0.617	2.438E-06	3	36	38	0	33	38
+---MRRSSRRWKKYLRSRWKWQRRRIREEKRLRKIAR-
+>UniRef100_A0A397WLW3	53	0.676	2.438E-06	3	36	38	0	33	38
+---MKRSSRRWKKYKRSRWKWQKKRMKEEKRLRKLAR-
+>UniRef100_A0A2K3J9R6	52	0.700	4.619E-06	3	32	38	0	29	32
+---MKRSSRVWKKRHKMRWKWRKKRMRREKRSR-----
+>UniRef100_A6VJM7	51	0.862	8.751E-06	3	31	38	5	33	39
+---IKRSSRRWKKKGQMRWKHYKKRIRRMKRE------
+>UniRef100_A0A7M3WK46	51	0.666	8.751E-06	3	35	38	30	62	66
+---MKRGSRAWKKQGKQRWKWRKKKLRRRKAARKRA--
+>UniRef100_A0A915SG42	51	0.617	1.205E-05	3	36	38	0	33	38
+---MKRSSRRWKKYLRSRWKWQRRRIREEKRLRKVTR-
+>UniRef100_A0A510BD48	51	0.900	1.205E-05	3	32	38	31	60	64
+---IKRSSRRWKKKGRMRWRHYKKRLRRRKRER-----
+>UniRef100_A0A075M0T1	50	0.638	1.658E-05	1	36	38	22	57	61
+-VIMKRRPRKWKKKGRMRWKWLKKRIRRLKRQHRKER-
+>UniRef100_A4FYQ5	48	0.851	8.201E-05	3	29	38	5	31	39
+---IKRSSRRWKKKGQMRWKHYKKRIRRMK--------
+>UniRef100_A0A8J7USD9	48	0.888	8.201E-05	3	29	38	13	39	47
+---IKRSSRRWKKKGQMRWKHYKKRLRRMK--------
+>UniRef100_A0A2K3JJ52	48	0.700	1.129E-04	3	32	38	0	29	34
+---MKRSSRVWKKRRKMRWKWRKKRMRREKRMR-----
+>UniRef100_A6UVG5	48	0.821	1.129E-04	3	30	38	9	36	42
+---IKRSSRRWKKKGQMRWSHYKKRIRRMKR-------
+>UniRef100_A0A5E4HZQ2	43	0.750	7.251E-03	7	34	38	25	52	58
+-------PRKWKKKGRMRWKWVKKRRKRLKRKIKR---
+>UniRef100_A0A2H6JYE4	36	0.566	1.234E+00	4	33	38	3	32	37
+----KHSSRKWKKRGKCRWKTRKKKLKERRRQRK----
diff --git a/tools/colabfold/test-data/test.fasta b/tools/colabfold/test-data/test.fasta
@@ -0,0 +1,2 @@
+>testing
+MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		>testing
		MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS