Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Split libraries rework #1657

Closed
wants to merge 67 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
8697aad
initial workflow object
wasade Dec 9, 2013
45b2b57
core workflow object
wasade Dec 9, 2013
6a7e130
improved doc, simplified logic
wasade Dec 10, 2013
7d89866
few things:
wasade Dec 10, 2013
8e39195
few things:
wasade Dec 10, 2013
525a2dd
Merge branch 'shortcircuit' into wf_qual_filter_fasta
wasade Dec 10, 2013
5001f4c
starting workflow version of quality_filter_fasta
wasade Dec 10, 2013
62d5b2c
support for 'that the value exists' type requirements
wasade Dec 10, 2013
75758ac
lighted a requirement
wasade Dec 10, 2013
28a18d6
added
wasade Dec 11, 2013
8eeb7a4
updated docs, tagging wf_ functions
wasade Dec 11, 2013
6e7fe9b
removed a priority from the example
wasade Dec 11, 2013
c71cd07
possible parallel support, doc/example
wasade Dec 11, 2013
c2429e8
full python path for import example
wasade Dec 11, 2013
0653803
merging in more methods
wasade Dec 11, 2013
c23574b
Merge branch 'master' into wf_qual_filter_fasta
wasade Dec 17, 2013
cab54f4
added more methods
wasade Dec 17, 2013
a34b5b3
Merge branch 'master' of github.com:qiime/qiime into shortcircuit
wasade Dec 17, 2013
8bdf43d
removed parallel, added staging function
wasade Dec 17, 2013
0964d11
Merge branch 'shortcircuit' into wf_qual_filter_fasta
wasade Dec 17, 2013
f1ae29c
added sanity_check
wasade Dec 17, 2013
b98a02d
sanity check, staging method
wasade Dec 17, 2013
61ec0d9
further progress
wasade Dec 18, 2013
357fbaf
added a ValidData method
wasade Dec 18, 2013
19776b6
fasta/fastq iterators
wasade Dec 22, 2013
d545782
methods now use item as a dict, update keys in finalstate
wasade Dec 22, 2013
0ea4153
more workflow progress
wasade Dec 30, 2013
cc309ee
Merge branch 'master' of github.com:qiime/qiime into wf_qual_filter_f…
wasade Jan 30, 2014
27dfca6
resolving pep8 warnings/errors
wasade Jan 30, 2014
4cf60da
starting ProcessSeqsWorkflowTests
wasade Jan 30, 2014
e38c252
reorganizing groups
wasade Jan 30, 2014
f899099
split_libraries_fastq methods in place
wasade Jan 31, 2014
b7c6a8f
tests for quality_max_bad_run_length
wasade Jan 31, 2014
a58c154
tests for min_per_read_length_fraction
wasade Jan 31, 2014
d02e017
added tests for _demultiplex_encoded_barcode
wasade Jan 31, 2014
5b7d60b
added tests for _demultiplex_max_barcode_error
wasade Jan 31, 2014
c44aee1
added test docstrings
wasade Jan 31, 2014
2fe6baf
tests for primer_check_forward
wasade Jan 31, 2014
2ef930d
tests for sequence_length_check
wasade Jan 31, 2014
ac0a392
tests for sequence_ambiguous_count and force init of wf_obj.Failed
wasade Jan 31, 2014
3c11bc0
tests for _count_mismatches
wasade Jan 31, 2014
e9357db
whitespace
wasade Jan 31, 2014
90a8cf8
removed some extraneous comments
wasade Jan 31, 2014
7d5d040
stuff and stuff
wasade Mar 16, 2014
2adbff1
MAINT: reflecting finalized workflow and iterators in skbio
wasade Apr 21, 2014
9018eaf
MAINT: more updated to reflect finalized workflow
wasade Apr 21, 2014
c488932
ENH: new Seqs object for generating sequence data for the workflow
wasade Apr 21, 2014
c5f9f33
ENH: added IterAdapter and tests
wasade Apr 21, 2014
1fb4e27
Merge branch 'master' of github.com:qiime/qiime into wf_qual_filter_f…
wasade Aug 11, 2014
27b8ef4
MAINT: back inline with skbio changes
wasade Aug 12, 2014
5ce6106
DOC: major docstring updates
wasade Aug 12, 2014
c8d0f9c
ENH/DOC: once through for cleanup, added quick method to count runs o…
wasade Aug 12, 2014
a05cf86
STY: pep8
wasade Aug 12, 2014
ccfc596
MAINT: removing unused variables
wasade Aug 12, 2014
a44062f
MAINT: removing unused code
wasade Aug 12, 2014
b7ff5b1
TST: inc. coverage, simple full workflow test
wasade Aug 12, 2014
2040e6e
DOC/BUG: added options, fixed min_per_read_length
wasade Aug 12, 2014
71173a0
ENH/TST: ambig check on barcode
wasade Aug 13, 2014
30a24ca
ENH: success stats
wasade Aug 13, 2014
0606b60
TST: checking of stats
wasade Aug 13, 2014
21b9318
ENH: removed need to specify option demultiplex
wasade Aug 13, 2014
1a5dfda
ENH/MAINT: adding click interface, removing old files
wasade Aug 13, 2014
97ac7b5
ENH: output fastq or fasta
wasade Aug 13, 2014
27d801a
TST: tests for click command
wasade Aug 13, 2014
3319d97
DOC: usage examples
wasade Aug 13, 2014
417a541
BLD: adding click
wasade Aug 14, 2014
03a5da4
MAINT: removed print
wasade Aug 14, 2014
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions qiime/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import click

import qiime


def print_version(ctx, param, value):
    """Echo the QIIME version and exit; used as a click option callback.

    Parameters
    ----------
    ctx : object
        The invocation context. Its ``resilient_parsing`` attribute is read
        and its ``exit()`` method is called once the version is printed.
    param : object
        The option that triggered the callback (unused).
    value : bool
        The flag value; nothing happens unless it is truthy.
    """
    # ``value`` is tested first so ``ctx`` is never touched when the flag
    # was not supplied.
    if value and not ctx.resilient_parsing:
        click.echo('Version %s' % qiime.__version__)
        ctx.exit()


# Root command group for the QIIME command line interface.
@click.group()
@click.option(
    '--version',
    callback=print_version,  # the flag is handled entirely by this callback
    is_flag=True,
    is_eager=True,
    expose_value=False,  # consistent with qiime_cli accepting only ``ctx``
)
@click.pass_context
def qiime_cli(ctx):
    """QIIME, canonically pronounced 'chime'"""
162 changes: 162 additions & 0 deletions qiime/click_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import os

import click

from .cli import qiime_cli


@qiime_cli.command()
# I/O options
@click.option('--sequence-read-fp', '-i', multiple=True, required=True,
              type=click.Path(exists=True), help='Input sequence reads')
@click.option('--output-dir', '-o', type=click.Path(exists=False),
              required=True)
@click.option('--mapping-fp', '-m', required=True,
              type=click.File('U'), help='Mapping file')
@click.option('--barcode-read-fp', '-b', multiple=True, required=False,
              type=click.Path(exists=True), help='Barcode read files')
@click.option('--rev-comp/--no-rev-comp', default=False,
              help='Reverse complement sequences on output')
@click.option('--start-seq-id', type=int, default=0,
              help='The starting unique ID for sequences')
@click.option('--to-fastq', is_flag=True,
              help='Write out in fastq')
# Iterator options
@click.option('--rev-comp-barcodes/--no-rev-comp-barcodes', default=False,
              help='Reverse complement barcode reads')
@click.option('--phred-offset', default='33', type=click.Choice(['33', '64']),
              help='The ASCII offset used to decode PHRED scores')
# Runtime options
@click.option('--phred-quality-threshold', '-q', default=3, type=int,
              help='Minimum PHRED quality score')
@click.option('--barcode-type', help='The type of barcode used',
              default='golay_12', type=click.Choice(['golay_12', 'hamming_8',
                                                     'not-barcoded']))
@click.option('--max-barcode-error', default=1.5, type=float,
              help='The maximum number of barcode errors allowed')
@click.option('--retain-primer/--no-retain-primer', default=False,
              help='Whether to retain the primers or not (if applicable)')
@click.option('--max-primer-mismatch', type=int, default=0,
              help='Maximum mismatches allowed within the primers')
@click.option('--min-seq-len', type=int,
              help='The minimum sequence length')
@click.option('--max-ambig-count', default=0, type=int,
              help='Maximum ambiguous bases allowed')
# Other options
@click.option('--rev-comp-mapping-barcodes/--no-rev-comp-mapping-barcodes',
              default=False, help='Reverse complement the mapping barcodes')
@click.pass_context
def slib(ctx, **kwargs):
    """Quality filter and demultiplex sequences

    Examples
    --------

    Demultiplex and quality filter (at Phred >= Q20) one lane of Illumina fastq
    data and write results to ./slout_q20:

    $ qiime slib -i $PWD/lane1_read1.fastq.gz -b $PWD/lane1_barcode.fastq.gz \
        -m $PWD/map.txt -o slout_q20 --rev-comp-mapping-barcodes -q 20

    Demultiplex and quality filter (at Phred >= Q20) two lanes of Illumina
    fastq data and write results to ./slout_q20:

    $ qiime slib -i $PWD/lane1_read1.fastq.gz -i $PWD/lane2_read1.fastq.gz \
        -b $PWD/lane1_barcode.fastq.gz -b $PWD/lane2_barcode.fastq.gz \
        -m $PWD/map.txt -o slout_q20 --rev-comp-mapping-barcodes -q 20
    """
    # The docstring above is the user-facing click help text, so the
    # maintainer notes live here instead.
    #
    # ``kwargs`` holds one entry per click option declared above. Options
    # consumed by this function (I/O paths, phred offset, reverse-complement
    # switches, output format, starting ID) are popped; whatever is left is
    # handed to SequenceWorkflow as ``options``.
    #
    # Outputs: ``seqs.<ext>`` (records for which ``wf.failed`` is falsy) and
    # ``unassigned.<ext>`` (records for which it is truthy) inside
    # ``--output-dir``; ``<ext>`` is ``fq`` with ``--to-fastq``, else ``fna``.
    #
    # Raises IOError if either output file already exists.
    from skbio import DNA
    from skbio.parse.sequences.factory import load
    from skbio.format.sequences.fastq import format_fastq_record

    from qiime.parse import parse_mapping_file_to_dict
    from qiime.process_seqs import SequenceWorkflow, IterAdapter

    # qiime_config is available under ctx.obj['qiime_config']

    # click.Choice yields the offset as a string; downstream wants an int
    phred_offset = int(kwargs.pop('phred_offset'))

    # reverse complement for reversing mapping barcodes
    def rc(seq):
        return str(DNA(seq).rc())

    # reverse complement for reversing barcode reads (mutates the record)
    def rc_it(st):
        st['Sequence'] = rc(st['Sequence'])
        st['Qual'] = st['Qual'][::-1] if st['Qual'] is not None else None

    # id formatter for writing
    def format_id(idx, state):
        seq_id = "%s_%d" % (state['Sample'], idx)
        ori_id = state['SequenceID']
        ori_bc = "orig_bc=%s" % state['Original barcode']
        new_bc = "new_bc=%s" % state['Final barcode']
        bc_diff = "bc_diffs=%d" % (state['Barcode errors'] or 0)
        return " ".join([seq_id, ori_id, ori_bc, new_bc, bc_diff])

    # should be sourced from skbio but there doesn't appear to be a method that
    # takes a single seq and ID
    def format_fasta(id_, item):
        return ">%s\n%s\n" % (id_, item['Sequence'])

    # phred_offset is never rebound below, so closing over it directly is safe
    def format_fastq(id_, state):
        return format_fastq_record(id_, state['Sequence'], state['Qual'],
                                   phred_offset)

    # setup sequence iterator
    seqs = load(kwargs.pop('sequence_read_fp'), phred_offset=phred_offset)

    # setup barcode iterator; pop the switch unconditionally so it never
    # leaks into the workflow options when no barcode files are supplied
    rev_comp_barcodes = kwargs.pop('rev_comp_barcodes')
    barcode_read_fp = kwargs.pop('barcode_read_fp')
    if barcode_read_fp:
        transform = rc_it if rev_comp_barcodes else None
        barcodes = load(barcode_read_fp, transform=transform,
                        phred_offset=phred_offset)
    else:
        barcodes = None

    # load mapping, setup barcode and primer maps
    mapping, comments = parse_mapping_file_to_dict(kwargs.pop('mapping_fp'))
    barcode_map = {v['BarcodeSequence']: k for k, v in mapping.items()}
    primer_map = {v['BarcodeSequence']: v['LinkerPrimerSequence'].split(',')
                  for v in mapping.values()}

    # reverse complement barcodes if necessary
    # NOTE(review): primer_map stays keyed by the un-complemented barcodes;
    # confirm against SequenceWorkflow that this is the key it looks up.
    if kwargs.pop('rev_comp_mapping_barcodes'):
        barcode_map = {rc(k): v for k, v in barcode_map.items()}

    # setup outputs and options; makedirs so nested output paths work too
    output_dir = kwargs.pop('output_dir')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    to_fastq = kwargs.pop('to_fastq')
    ext = 'fq' if to_fastq else 'fna'
    formatter = format_fastq if to_fastq else format_fasta
    success_fp = os.path.join(output_dir, 'seqs.%s' % ext)
    fail_fp = os.path.join(output_dir, 'unassigned.%s' % ext)

    # refuse to clobber results from a previous run
    if os.path.exists(success_fp):
        raise IOError("%s already exists!" % success_fp)

    if os.path.exists(fail_fp):
        raise IOError("%s already exists!" % fail_fp)

    # setup starting sequence ID
    seq_id = kwargs.pop('start_seq_id')

    # Do NOT rebind ``rc`` here: ``rc_it`` closes over it and the barcode
    # transform handed to load() may only run later, while the workflow
    # iterates. Rebinding ``rc`` to an identity function (as was done before
    # for --no-rev-comp) silently disabled --rev-comp-barcodes.
    # NOTE(review): --rev-comp is removed from the workflow options but is
    # not yet applied to the records written below.
    kwargs.pop('rev_comp')

    # setup sequence/barcode iterator
    iter_ = IterAdapter(seqs, barcodes)
    wf = SequenceWorkflow(options=kwargs, mapping=mapping,
                          barcodes=barcode_map, primers=primer_map)

    with open(success_fp, 'w') as success, open(fail_fp, 'w') as failed:
        for idx, item in enumerate(wf(iter_, fail_callback=lambda x: x.state)):
            record = formatter(format_id(seq_id + idx, item), item)
            # wf.failed is consulted after each yielded item to pick the file
            if wf.failed:
                failed.write(record)
            else:
                success.write(record)
Loading