Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add colabfold #5785

Merged
merged 33 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
84ed956
add colabfold
Feb 20, 2024
060543d
missed shed in git add
Feb 20, 2024
a2a3b29
tar file
Feb 20, 2024
1b2a6ab
Update tools/colabfold/.shed.yml
astrovsky01 Feb 21, 2024
75bc21a
update with most of the suggestion
Feb 21, 2024
a7ed2f2
add tests for msa, change minor params
Mar 11, 2024
f6c0b30
fix shed file to make suite
Mar 11, 2024
a7a50d2
fix file name problem, add pre-set param for naming files to prevent …
Mar 11, 2024
e722272
add num_outputs test to alphafold tool
Mar 12, 2024
9852142
fix test because inputs removed conditionals
Mar 12, 2024
f645e67
fix missing text modifier
Mar 12, 2024
75ba87b
Add hardcoded file names to prevent file headers from breaking datase…
Mar 15, 2024
522a5e1
fix typo from creating an advanced section
Mar 19, 2024
d513720
fix msa filenames
Mar 19, 2024
fdfee66
update archive member paths, assert expect_error
Mar 19, 2024
ae36f0f
test update
Mar 19, 2024
c30fc11
print dir content test
Mar 19, 2024
eb62b2f
try as just tar
Mar 20, 2024
0a84144
add back ls
Mar 20, 2024
e305800
wrong place for &&
Mar 20, 2024
7a025df
quotes and another archive test
Mar 20, 2024
c5531fc
make tests match now that one passed
Mar 20, 2024
031b711
switch back to colab.tar
Mar 20, 2024
7d612ba
test fail state of alphafold tool
Mar 20, 2024
b181a94
Failed properly, tool ready
Mar 20, 2024
a75d4bb
Try expect code
Mar 21, 2024
783c58d
Expect both failure and exit code
Mar 21, 2024
e94b467
Expect fail, exit code, and num outputs
Mar 21, 2024
5f9df6d
remove num_outputs to fix lint
Mar 21, 2024
dfebe1e
matthias final pass
Mar 22, 2024
eb670e9
better descrption for alphafold
Mar 22, 2024
0295aae
small changes
bgruening Mar 24, 2024
bedeb9f
another round
bgruening Mar 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions tools/colabfold/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Tool Shed metadata for the colabfold wrappers.
name: colabfold
owner: iuc
categories:
  - Proteomics
  - Graphics
description: "Protein prediction based on AlphaFold2"
homepage_url: https://github.com/sokrypton/ColabFold
long_description: |
  ColabFold offers accelerated (40-60x faster) prediction of protein structures
  and complexes by combining the fast homology search of MMseqs2
  with AlphaFold2 or RoseTTAFold.
remote_repository_url: https://github.com/sokrypton/ColabFold
type: unrestricted
# Name and description templates applied to each tool in this directory.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper for the colabfold tool suite: {{ tool_name }}"
# Suite entry; reuses the description texts from above.
suite:
  name: "suite_colabfold"
  description: "Protein prediction based on AlphaFold2"
  long_description: |
    ColabFold offers accelerated (40-60x faster) prediction of protein structures
    and complexes by combining the fast homology search of MMseqs2
    with AlphaFold2 or RoseTTAFold.
158 changes: 158 additions & 0 deletions tools/colabfold/colabfold_alphafold.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
<tool id="colabfold_alphafold" name="colabfold alphafold" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
<!-- Tool description. -->
<description>AlphaFold step of colabfold in Galaxy</description>
<!-- Shared definitions are imported from macros.xml. -->
<macros>
    <import>macros.xml</import>
</macros>
<!-- Expand the shared "biotools" and "requirements" macros. -->
<expand macro="biotools"/>
<expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
## Unpack the input archive into input_data, dropping its top-level directory
mkdir input_data &&
tar -xmf '$input' --strip-components 1 -C input_data &&
mkdir output &&
colabfold_batch
## Optional numeric settings are only passed when a value was supplied
#if str($advanced.num_recycles)!="":
--num-recycle $advanced.num_recycles
#end if
#if str($advanced.recycle_early_stop_tolerance)!="":
--recycle-early-stop-tolerance $advanced.recycle_early_stop_tolerance
#end if
#if $advanced.num_ensemble:
--num-ensemble $advanced.num_ensemble
#end if
## random_seed is defined inside the "advanced" section and must be read through it;
## the string comparison keeps an explicit seed of 0 from being dropped
#if str($advanced.random_seed)!="":
--random-seed $advanced.random_seed
#end if
#if $advanced.num_seeds:
--num-seeds $advanced.num_seeds
#end if
#if $advanced.num_models:
--num-models $advanced.num_models
#end if
$advanced.use_dropout
#if $advanced.max_msa:
--max-msa $advanced.max_msa
#end if
#if $advanced.amber.use_amber == "yes":
--amber
--num-relax $advanced.amber.num_relaxed
#end if
## Boolean output switches expand to their flag or to an empty string
$output_options.save_all
$output_options.save_recycles
$output_options.save_single_representations
$output_options.save_pair_representations
--jobname-prefix "galaxy"
input_data
output
## Sort the results into one directory per file type for collection discovery
&& cd output
&& mv *.a3m output.a3m
&& mkdir png_out
&& mkdir json_out
&& mkdir pdb_out
&& mv ./*.png png_out
&& mv ./*.json json_out
&& mv ./*.pdb pdb_out
## config.json is taken back out of json_out
&& mv json_out/config.json .
#if $output_options.save_all:
&& mkdir pickle_out
&& mv ./*.pickle pickle_out
#end if
#if $output_options.save_pair_representations or $output_options.save_single_representations:
&& mkdir npy_out
&& mv ./*.npy npy_out
#end if

]]></command>
<inputs>
    <!-- Archive produced by the colabfold msa tool. -->
    <param name="input" type="data" format="colab.tar" label="Tar file output from colabfold msa tool"/>
    <section name="advanced" title="Advanced options">
        <param argument="--num-recycles" type="integer" optional="true" label="How many recycles to run?" help="Number of prediction recycles. Increasing recycles can improve the prediction quality but slows down the prediction."/>
        <param argument="--recycle-early-stop-tolerance" type="float" min="0.0" max="1.0" optional="true" help="Specify convergence criteria. Run recycles until the distance between recycles is within the given tolerance value."/>
        <param argument="--num-ensemble" type="integer" min="1" optional="true" label="Number of ensembles" help="Number of ensembles. The trunk of the network is run multiple times with different random choices for the MSA cluster centers. This can result in a better prediction at the cost of longer runtime."/>
        <param argument="--random-seed" type="integer" min="0" optional="true" label="Set seed"/>
        <param argument="--num-seeds" type="integer" min="0" optional="true" label="Number of seeds" help="Number of seeds to try iterated based on random seed"/>
        <!-- Optional: there is no default value, and the command only passes this option when it is set. -->
        <param argument="--num-models" type="integer" min="1" max="5" optional="true" label="Number of models to use for structure prediction" help="Reducing the number of models speeds up the prediction but results in lower quality"/>
        <param name="max_msa" type="select" label="Max msa" help="Defines the ratio of max-seq to max-extra-seq for one run. Enable dropouts and increase number of seeds to sample predictions from uncertainty of the model. Decrease to increase uncertainty">
            <!-- <option value="auto">auto</option> -->
            <option value="512:1024">512:1024</option>
            <option value="256:512">256:512</option>
            <option value="64:128">64:128</option>
            <option value="32:64">32:64</option>
            <option value="16:32">16:32</option>
        </param>
        <param argument="--use-dropout" type="boolean" truevalue="--use-dropout" falsevalue="" label="Use dropouts" help="Activate dropouts during inference to sample from uncertainty of the models."/>
        <!-- The number of structures to relax is only asked for when AMBER is enabled. -->
        <conditional name="amber">
            <param name="use_amber" type="select" label="Use AMBER" help="Use AMBER force field for structure refinement and side chain optimization">
                <option value="yes">Use AMBER</option>
                <option value="no">Don't use AMBER</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param argument="--num-relaxed" type="integer" min="0" value="0" label="How many top ranked structures to relax using AMBER?" help="Increased values may increase runtime"/>
            </when>
        </conditional>
    </section>
    <!-- Add for second version of tool for batch jobs -->
    <!-- <param name="stop_at" label="Stop score" type="float" min="0.0" optional="true" help="Compute models until pLDDT (single chain) or pTM-score (multimer) > threshold is reached. This speeds up prediction by running less models for easier queries."/> -->
    <!-- Each switch expands to its command-line flag when enabled and to an empty string otherwise. -->
    <section name="output_options" title="Output Options">
        <param argument="--save-all" type="boolean" truevalue="--save-all" falsevalue="" label="Save raw outputs from model to a pickle file"/>
        <param argument="--save-recycles" type="boolean" truevalue="--save-recycles" falsevalue="" label="Save all intermediate predictions at each recycle iteration"/>
        <param argument="--save-single-representations" type="boolean" truevalue="--save-single-representations" falsevalue="" label="Save the single representation embeddings of all models."/>
        <param argument="--save-pair-representations" type="boolean" truevalue="--save-pair-representations" falsevalue="" label="Save the pair representation embeddings of all models."/>
    </section>
</inputs>
<outputs>
    <!-- One list collection per file type; each is discovered from the matching directory under output/. -->
    <collection name="png_files" type="list" format="png" label="${tool.name} on ${on_string}: Figures">
        <discover_datasets pattern="__name_and_ext__" format="png" directory="output/png_out"/>
    </collection>
    <collection name="json_files" type="list" format="json" label="${tool.name} on ${on_string}: JSON predictions">
        <discover_datasets pattern="__name_and_ext__" format="json" directory="output/json_out"/>
    </collection>
    <collection name="pdb" type="list" format="pdb" label="${tool.name} on ${on_string}: PDB predictions">
        <discover_datasets pattern="__name_and_ext__" format="pdb" directory="output/pdb_out"/>
    </collection>
    <!-- Only offered when the raw model outputs were requested. -->
    <collection name="pickle" type="list" format="pickle" label="${tool.name} on ${on_string}: Pickle file outputs">
        <discover_datasets pattern="__name_and_ext__" format="pickle" directory="output/pickle_out"/>
        <filter>output_options['save_all']</filter>
    </collection>
    <!-- Only offered when single or pair representations were requested. -->
    <collection name="npy" type="list" format="npy" label="${tool.name} on ${on_string}: Numpy embeddings">
        <discover_datasets pattern="__name_and_ext__" format="npy" directory="output/npy_out"/>
        <filter>output_options['save_single_representations'] or output_options['save_pair_representations']</filter>
    </collection>
    <!-- The a3m file that the command renames to output.a3m. -->
    <data name="a3m_out" format="a3m" from_work_dir="output/output.a3m" label="${tool.name} on ${on_string}: a3m file"/>
</outputs>
<tests>
    <!-- Sets every option and checks the generated command line; the job itself is expected to exit with code 1. -->
    <!-- NOTE(review): presumably the prediction cannot run in the test environment; confirm whether expect_failure is also needed. -->
    <test expect_exit_code="1">
        <param name="input" value="input.tar"/>
        <section name="advanced">
            <param name="num_recycles" value="4"/>
            <param name="recycle_early_stop_tolerance" value="0.4"/>
            <param name="num_ensemble" value="1"/>
            <param name="random_seed" value="43"/>
            <param name="num_seeds" value="2"/>
            <param name="num_models" value="2"/>
            <param name="max_msa" value="64:128"/>
            <param name="use_dropout" value="--use-dropout"/>
            <conditional name="amber">
                <param name="use_amber" value="yes"/>
                <param name="num_relaxed" value="0"/>
            </conditional>
        </section>
        <section name="output_options">
            <param name="save_all" value="--save-all"/>
            <param name="save_recycles" value="--save-recycles"/>
            <param name="save_single_representations" value="--save-single-representations"/>
            <param name="save_pair_representations" value="--save-pair-representations"/>
        </section>
        <assert_command>
            <has_text text="colabfold_batch --num-recycle 4 --recycle-early-stop-tolerance 0.4 --num-ensemble 1 --random-seed 43"/>
            <has_text text="--num-seeds 2 --num-models 2 --use-dropout --max-msa 64:128 --amber --num-relax 0"/>
            <has_text text="--save-all --save-recycles --save-single-representations --save-pair-representations"/>
        </assert_command>
    </test>
</tests>
<help><![CDATA[
Run a folding step on the output of the colabfold msa run
]]></help>
<expand macro="citations"/>
</tool>
115 changes: 115 additions & 0 deletions tools/colabfold/colabfold_msa.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
<tool id="colabfold_msa" name="colabfold msa" version="@VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
<!-- Tool description. -->
<description>Generate MSAs for the alphafold step of Colabfold</description>
<!-- Shared definitions are imported from macros.xml. -->
<macros>
    <import>macros.xml</import>
</macros>
<!-- Expand the shared "biotools" and "requirements" macros. -->
<expand macro="biotools"/>
<expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
#import re

## Symlinking and formatting
#if $custom_template:
mkdir template_dir &&
#for $file in $custom_template:
## Link each template under a sanitised name so that unusual characters in the
## element identifier cannot break the shell command; paths are quoted
#set template_name = re.sub('[^\w\-_\.]', '_', str($file.element_identifier)) + '.pdb'
ln -s '$file' 'template_dir/${template_name}' &&
#end for
#end if
## For single file runs. Will need to be updated for multiple file calls
#set input_file = re.sub('[^\w\-_\.]', '_', str($query_type.input.element_identifier)) + "." + str($query_type.input.ext)
ln -s '$query_type.input' '$input_file' &&

mkdir output &&
colabfold_batch --msa-only
## The MSA mode is only passed for FASTA input when the database is set manually
#if $query_type.select_query_type == "fasta":
#if $query_type.select_db.use_db == "yes":
--msa-mode $query_type.select_db.msa_mode
#end if
#end if
--pair-mode $pair_mode
## --pair-strategy $pairing_strategy
$templates
#if $custom_template:
--custom-template-path template_dir
#end if
--jobname-prefix 'galaxy'
'$input_file'
output.colab &&
## List the results, then pack them into the single archive that is collected as output
ls output.colab &&
tar -cf output.colab.tar output.colab
]]></command>
<inputs>
    <!-- The query is either a FASTA file (with an optional database choice) or an existing a3m file. -->
    <conditional name="query_type">
        <param name="select_query_type" type="select" label="Data input method">
            <option value="fasta">FASTA file</option>
            <option value="a3m">a3m file</option>
        </param>
        <when value="fasta">
            <param name="input" type="data" format="fasta" label="Query sequence fasta"/>
            <expand macro="db_selector"/>
        </when>
        <when value="a3m">
            <param name="input" type="data" format="a3m" label="Query sequence a3m file"/>
        </when>
    </conditional>
    <param name="pair_mode" type="select" label="Pair mode">
        <option value="unpaired_paired">Attempt to pair sequences from the same operon within the genome</option>
        <option value="paired">Only use sequences that were successfully paired</option>
        <option value="unpaired">Generate separate MSA for each protein</option>
    </param>
    <!-- Non functional in current release of colabfold, planned to expose in next one -->
    <!-- <param name="pairing_strategy" label="Pairing strategy" type="select">
    <option value="greedy">Greedy: MSA sequences should only be paired if the same species exist in at least two MSAs </option>
    <option value="complete">Complete: MSA sequences should only be paired if the same species exist in all MSAs</option>
    </param> -->
    <param argument="--templates" type="boolean" truevalue="--templates" falsevalue="" label="Query PDB templates from the MSA server"/>
    <!-- Optional PDB files; the command links them into template_dir. -->
    <param name="custom_template" type="data" format="pdb" multiple="true" optional="true" label="List of PDB files to provide the prediction as custom templates"/>
</inputs>
<outputs>
    <!-- The tar archive of output.colab written by the command. -->
    <data name="output" format="colab.tar" from_work_dir="output.colab.tar" label="${tool.name} on ${on_string}: tar file"/>
</outputs>
<tests>
<!-- fasta: manual database selection, default pair mode -->
<test expect_num_outputs="1">
    <conditional name="query_type">
        <param name="select_query_type" value="fasta"/>
        <param name="input" value="test.fasta"/>
        <conditional name="select_db">
            <param name="use_db" value="yes"/>
            <param name="msa_mode" value="mmseqs2_uniref"/>
        </conditional>
    </conditional>
    <assert_command>
        <has_text text="colabfold_batch --msa-only --msa-mode mmseqs2_uniref --pair-mode unpaired_paired"/>
    </assert_command>
    <!-- The archive must contain the msa.sh member. -->
    <output name="output">
        <assert_contents>
            <has_archive_member path=".*\/galaxy_0_all/msa.sh"/>
        </assert_contents>
    </output>
</test>
<!-- a3m -->
<test expect_num_outputs="1">
<conditional name="query_type">
<param name="select_query_type" value="a3m"/>
<param name="input" value="test.a3m" ftype="a3m"/>
</conditional>
<param name="pair_mode" value="paired"/>
<param name="templates" value="--templates"/>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these shipped in the container?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the templates are stored on their server, they are not in the container itself from my understanding

<param name="custom_template" value="test.pdb,test_2.pdb"/>
<assert_command>
<has_text text="colabfold_batch --msa-only --pair-mode paired --templates --custom-template-path template_dir"/>
</assert_command>
<output name="output">
<assert_contents>
<has_archive_member path=".*\/galaxy_0.pickle"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
Generate MSAs for the alphafold step of Colabfold
]]></help>
<expand macro="citations"/>
</tool>
37 changes: 37 additions & 0 deletions tools/colabfold/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<macros>
    <!-- Version tokens; VERSION and CUDA_VERSION together form the container tag below. -->
    <token name="@VERSION@">1.5.5</token>
    <token name="@CUDA_VERSION@">12.2.2</token>
    <token name="@VERSION_SUFFIX@">0</token>

    <!-- Docker container requirement. -->
    <xml name="requirements">
        <requirements>
            <container type="docker">ghcr.io/sokrypton/colabfold:@VERSION@-cuda@CUDA_VERSION@</container>
        </requirements>
    </xml>

    <!-- bio.tools cross-reference. -->
    <xml name="biotools">
        <xrefs>
            <xref type="bio.tools">Colabfold</xref>
        </xrefs>
    </xml>

    <!-- Citation by DOI. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/s41592-022-01488-1</citation>
        </citations>
    </xml>

    <!-- Optional manual choice of the MSA mode. -->
    <xml name="db_selector">
        <conditional name="select_db">
            <param name="use_db" type="select" label="Manually set database?">
                <option value="yes">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="msa_mode" type="select" label="MSA mode">
                    <option value="mmseqs2_uniref_env">mmseqs2_uniref_env</option>
                    <option value="mmseqs2_uniref">mmseqs2_uniref</option>
                    <option value="single_sequence">Use single sequence input</option>
                    <!-- <option value="custom">custom</option> -->
                </param>
            </when>
        </conditional>
    </xml>
</macros>
Binary file added tools/colabfold/test-data/input.tar
Binary file not shown.
39 changes: 39 additions & 0 deletions tools/colabfold/test-data/test.a3m
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#38 1
>101
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
>UniRef100_N6VR80 62 0.971 1.153E-09 3 37 38 17 51 52
---IKRSSRRWKKKGRMRWKWYKKRLRRLKRERRRARS
>UniRef100_A0A534JJZ5 55 0.722 4.940E-07 3 37 38 0 35 80
---MKRSSRAWKKRGKMRWKWRKKRMRRRKREQKlRART
>UniRef100_A0A8T5HQN6 54 0.647 9.355E-07 3 36 38 14 47 48
---MKRSSRRWKKKGQMRWKWQRKRMKKEKRKRAKSR-
>UniRef100_A0A2E4RG04 53 0.666 1.772E-06 3 35 38 36 68 71
---MKRGSRAWKKQGNQRWKWRKKKLRRRKASRKRA--
>UniRef100_R1E4G0 53 0.617 2.438E-06 3 36 38 0 33 38
---MRRSSRRWKKYLRSRWKWQRRRIREEKRLRKIAR-
>UniRef100_A0A397WLW3 53 0.676 2.438E-06 3 36 38 0 33 38
---MKRSSRRWKKYKRSRWKWQKKRMKEEKRLRKLAR-
>UniRef100_A0A2K3J9R6 52 0.700 4.619E-06 3 32 38 0 29 32
---MKRSSRVWKKRHKMRWKWRKKRMRREKRSR-----
>UniRef100_A6VJM7 51 0.862 8.751E-06 3 31 38 5 33 39
---IKRSSRRWKKKGQMRWKHYKKRIRRMKRE------
>UniRef100_A0A7M3WK46 51 0.666 8.751E-06 3 35 38 30 62 66
---MKRGSRAWKKQGKQRWKWRKKKLRRRKAARKRA--
>UniRef100_A0A915SG42 51 0.617 1.205E-05 3 36 38 0 33 38
---MKRSSRRWKKYLRSRWKWQRRRIREEKRLRKVTR-
>UniRef100_A0A510BD48 51 0.900 1.205E-05 3 32 38 31 60 64
---IKRSSRRWKKKGRMRWRHYKKRLRRRKRER-----
>UniRef100_A0A075M0T1 50 0.638 1.658E-05 1 36 38 22 57 61
-VIMKRRPRKWKKKGRMRWKWLKKRIRRLKRQHRKER-
>UniRef100_A4FYQ5 48 0.851 8.201E-05 3 29 38 5 31 39
---IKRSSRRWKKKGQMRWKHYKKRIRRMK--------
>UniRef100_A0A8J7USD9 48 0.888 8.201E-05 3 29 38 13 39 47
---IKRSSRRWKKKGQMRWKHYKKRLRRMK--------
>UniRef100_A0A2K3JJ52 48 0.700 1.129E-04 3 32 38 0 29 34
---MKRSSRVWKKRRKMRWKWRKKRMRREKRMR-----
>UniRef100_A6UVG5 48 0.821 1.129E-04 3 30 38 9 36 42
---IKRSSRRWKKKGQMRWSHYKKRIRRMKR-------
>UniRef100_A0A5E4HZQ2 43 0.750 7.251E-03 7 34 38 25 52 58
-------PRKWKKKGRMRWKWVKKRRKRLKRKIKR---
>UniRef100_A0A2H6JYE4 36 0.566 1.234E+00 4 33 38 3 32 37
----KHSSRKWKKRGKCRWKTRKKKLKERRRQRK----
2 changes: 2 additions & 0 deletions tools/colabfold/test-data/test.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>testing
MIPIKRSSRRWKKKGRMRWKWYKKRLRRLKRERKRARS
Loading
Loading