Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bwa-mem2 data manager #6376

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions data_managers/data_manager_bwa_mem2_index_builder/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
categories:
- Data Managers
description: Bwa-mem2 is the next version of the bwa-mem algorithm in bwa.
homepage_url: https://github.com/bwa-mem2/bwa-mem2
long_description: |
Bwa-mem2 is the next version of the bwa-mem algorithm in bwa. It produces
alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case,
dataset and the running machine. Bwa-mem2 uses a different index format that
is efficient on disk space and runtime memory but requires larger amounts of
memory (roughly 27x the reference) when building.
name: data_manager_bwa_mem2_index_builder
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bwa_mem2_index_builder
type: unrestricted
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# Based heavily on the bwa-mem data manager wrapper script by Dan Blankenberg
from __future__ import print_function

import argparse
import json
import os
import subprocess
import sys

DEFAULT_DATA_TABLE_NAME = "bwa_mem2_indexes"


def get_id_name(params, dbkey, fasta_description=None):
# TODO: ensure sequence_id is unique and does not already appear in location file
sequence_id = params['param_dict']['sequence_id']
if not sequence_id:
sequence_id = dbkey

sequence_name = params['param_dict']['sequence_name']
if not sequence_name:
sequence_name = fasta_description
if not sequence_name:
sequence_name = dbkey
return sequence_id, sequence_name


def build_bwa_mem2_index(data_manager_dict, options, params, sequence_id, sequence_name):
data_table_name = options.data_table_name or DEFAULT_DATA_TABLE_NAME
target_directory = params['output_data'][0]['extra_files_path']
if not os.path.exists(target_directory):
os.mkdir(target_directory)
fasta_base_name = os.path.split(options.fasta_filename)[-1]
sym_linked_fasta_filename = os.path.join(target_directory, fasta_base_name)
os.symlink(options.fasta_filename, sym_linked_fasta_filename)
args = ['bwa-mem2', 'index', sym_linked_fasta_filename]
proc = subprocess.Popen(args=args, shell=False, cwd=target_directory)
return_code = proc.wait()
if return_code:
print("Error building index.", file=sys.stderr)
sys.exit(return_code)
data_table_entry = dict(value=sequence_id, dbkey=options.fasta_dbkey, name=sequence_name, path=fasta_base_name)
_add_data_table_entry(data_manager_dict, data_table_name, data_table_entry)


def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
return data_manager_dict


def main():
# Parse Command Line
parser = argparse.ArgumentParser()
parser.add_argument('--output', dest='output', action='store', type=str, default=None)
parser.add_argument('--fasta_filename', dest='fasta_filename', action='store', type=str, default=None)
parser.add_argument('--fasta_dbkey', dest='fasta_dbkey', action='store', type=str, default=None)
parser.add_argument('--fasta_description', dest='fasta_description', action='store', type=str, default=None)
parser.add_argument('--data_table_name', dest='data_table_name', action='store', type=str, default='bwa_mem2_indexes')
options = parser.parse_args()

filename = options.output

with open(filename) as fh:
params = json.load(fh)
data_manager_dict = {}

if options.fasta_dbkey in [None, '', '?']:
raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (options.fasta_dbkey))

sequence_id, sequence_name = get_id_name(params, dbkey=options.fasta_dbkey, fasta_description=options.fasta_description)

# build the index
build_bwa_mem2_index(data_manager_dict, options, params, sequence_id, sequence_name)

# save info to json file
with open(filename, 'w') as fh:
json.dump(data_manager_dict, fh, sort_keys=True)


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<tool id="bwa_mem2_index_builder_data_manager" name="BWA-MEM2 index" tool_type="manage_data" version="0.0.1" profile="19.05">
<description>builder</description>
<requirements>
<requirement type="package" version="2.2.1">bwa-mem2</requirement>
<requirement type="package" version="3.9">python</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/bwa_mem2_index_builder.py'
--output '${out_file}'
--fasta_filename '${all_fasta_source.fields.path}'
--fasta_dbkey '${all_fasta_source.fields.dbkey}'
--fasta_description '${all_fasta_source.fields.name}'
--data_table_name bwa_mem2_indexes
]]>
</command>
<inputs>
<param name="all_fasta_source" type="select" label="Source FASTA Sequence">
<options from_data_table="all_fasta"/>
</param>
<param name="sequence_name" type="text" value="" label="Name of sequence" />
<param name="sequence_id" type="text" value="" label="ID for sequence" />
</inputs>
<outputs>
<data name="out_file" format="data_manager_json" />
</outputs>
<tests>
<test>
<param name="all_fasta_source" value="phiX174"/>
<output name="out_file" file="bwa_mem2_data_manager.json"/>
</test>
</tests>
<help>
<![CDATA[
.. class:: infomark
**Notice:** If you leave name, description, or id blank, it will be generated automatically.
Bwa-mem2 is the next version of the bwa-mem algorithm in bwa. It produces
alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case,
dataset and the running machine. Bwa-mem2 uses a different index format that
is efficient on disk space and runtime memory but requires larger amounts of
memory (roughly 27x the reference) when building.
]]>
</help>
<citations>
<citation type="doi">10.1038/nmeth.3317</citation>
</citations>
</tool>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<data_managers>
<data_manager tool_file="data_manager/bwa_mem2_index_builder.xml" id="bwa_mem2_index_builder">
<data_table name="bwa_mem2_indexes">
<output>
<column name="value" />
<column name="dbkey" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory" relativize_symlinks="True">
<!-- <source>${path}</source>--> <!-- out_file.extra_files_path is used as base by default --> <!-- if no source, eg for type=directory, then refers to base -->
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/bwa_mem2_index/${value}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/bwa_mem2_index/${value}/${path}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
</data_manager>
</data_managers>
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#This file lists the locations and dbkeys of all the fasta files
#under the "genome" directory (a directory that contains a directory
#for each build). The script extract_fasta.py will generate the file
#all_fasta.loc. This file has the format (white space characters are
#TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, all_fasta.loc could look something like this:
#
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
#
#Your all_fasta.loc file should contain an entry for each individual
#fasta file. So there will be multiple fasta files for each build,
#such as with hg19 above.
#
phiX174 phiX174 phiX174 ${__HERE__}/phiX174.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"bwa_mem2_indexes": [{"dbkey": "phiX174", "name": "phiX174", "path": "phiX174.fasta", "value": "phiX174"}]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
>phiX174
GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT
GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA
ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG
TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA
GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC
TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT
TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT
CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT
TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG
TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC
GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA
CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG
TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT
AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC
CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA
TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC
TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA
CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA
GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT
GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA
ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC
TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT
TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC
ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCGTGATGTTATTTCTTCATTTGGAGGTAAAAC
CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT
GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC
CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC
TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG
TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT
TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA
AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT
TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT
ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC
GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC
TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT
TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA
TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG
TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC
CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG
AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC
CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT
TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG
CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA
AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT
GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG
GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA
TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT
CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG
TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA
GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC
CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA
TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA
AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC
TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT
CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA
TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG
TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT
CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT
TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC
ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG
TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA
ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG
GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC
CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT
GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGCAATGCGACAG
GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT
ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG
CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC
CGTCTTCATTTCCATGCGGTGCATTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC
GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT
CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG
CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA
TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT
TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG
TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC
AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC
TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#This file lists the locations and dbkeys of all the fasta files
#under the "genome" directory (a directory that contains a directory
#for each build). The script extract_fasta.py will generate the file
#all_fasta.loc. This file has the format (white space characters are
#TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, all_fasta.loc could look something like this:
#
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
#
#Your all_fasta.loc file should contain an entry for each individual
#fasta file. So there will be multiple fasta files for each build,
#such as with hg19 above.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#This is a sample file distributed with Galaxy that enables tools
#to use a directory of BWA indexed sequences data files. You will need
#to create these data files and then create a bwa_index.loc file
#similar to this one (store it in this directory) that points to
#the directories in which those files are stored. The bwa_index.loc
#file has this format (longer white space characters are TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, for example, if you had phiX indexed stored in
#/depot/data2/galaxy/phiX/base/,
#then the bwa_index.loc entry would look like this:
#
#phiX174 phiX phiX Pretty /depot/data2/galaxy/phiX/base/phiX.fa
#
#and your /depot/data2/galaxy/phiX/base/ directory
#would contain phiX.fa.* files:
#
#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 phiX.fa.amb
#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 phiX.fa.ann
#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 phiX.fa.bwt
#...etc...
#
#Your bwa_index.loc file should include an entry per line for each
#index set you have stored. The "file" in the path does not actually
#exist, but it is the prefix for the actual index files. For example:
#
#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa
#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa
#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa
#/orig/path/hg19.fa hg19 hg19 /depot/data2/galaxy/hg19/base/hg19.fa
#...etc...
#
#Note that for backwards compatibility with workflows, the unique ID of
#an entry must be the path that was in the original loc file, because that
#is the value stored in the workflow for that parameter. That is why the
#hg19 entry above looks odd. New genomes can be better-looking.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
<tables>
<!-- Locations of indexes in the BWA-MEM2 mapper format-->
<table name="bwa_mem2_indexes" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="tool-data/bwa_mem2_index.loc" />
</table>
</tables>
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<tables>
<!-- Locations of all fasta files under genome directory -->
<table name="all_fasta" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/test-data/all_fasta.loc" />
</table>
<!-- Locations of indexes in the BWA-MEM2 mapper format-->
<table name="bwa_mem2_indexes" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/test-data/bwa_mem2_index.loc" />
</table>
</tables>